1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (C) 2011-2014 Matteo Landi
5 * Copyright (C) 2011-2016 Luigi Rizzo
6 * Copyright (C) 2011-2016 Giuseppe Lettieri
7 * Copyright (C) 2011-2016 Vincenzo Maffione
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *   1. Redistributions of source code must retain the above copyright
14 *      notice, this list of conditions and the following disclaimer.
15 *   2. Redistributions in binary form must reproduce the above copyright
16 *      notice, this list of conditions and the following disclaimer in the
17 *      documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32
33/*
34 * $FreeBSD$
35 *
36 * This module supports memory mapped access to network devices,
37 * see netmap(4).
38 *
 * The module uses a large memory pool allocated by the kernel
40 * and accessible as mmapped memory by multiple userspace threads/processes.
41 * The memory pool contains packet buffers and "netmap rings",
42 * i.e. user-accessible copies of the interface's queues.
43 *
44 * Access to the network card works like this:
45 * 1. a process/thread issues one or more open() on /dev/netmap, to create
46 *    select()able file descriptor on which events are reported.
47 * 2. on each descriptor, the process issues an ioctl() to identify
48 *    the interface that should report events to the file descriptor.
49 * 3. on each descriptor, the process issues an mmap() request to
50 *    map the shared memory region within the process' address space.
51 *    The list of interesting queues is indicated by a location in
52 *    the shared memory region.
53 * 4. using the functions in the netmap(4) userspace API, a process
54 *    can look up the occupation state of a queue, access memory buffers,
55 *    and retrieve received packets or enqueue packets to transmit.
56 * 5. using some ioctl()s the process can synchronize the userspace view
57 *    of the queue with the actual status in the kernel. This includes both
58 *    receiving the notification of new packets, and transmitting new
59 *    packets on the output interface.
60 * 6. select() or poll() can be used to wait for events on individual
61 *    transmit or receive queues (or all queues for a given interface).
62 *
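 *    As an illustration only (not part of this module), steps 1-5 roughly
 *    map to the following userspace sketch, using the legacy nmreq API and
 *    the helper macros from net/netmap_user.h documented in netmap(4); the
 *    interface name is a placeholder and error handling is omitted:
 *
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	struct nmreq nmr = { .nr_version = NETMAP_API };
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name) - 1);
 *	ioctl(fd, NIOCREGIF, &nmr);				// step 2
 *	void *mem = mmap(NULL, nmr.nr_memsize,
 *		PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);	// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);	// step 4
 *	while (!nm_ring_empty(txr)) {
 *		struct netmap_slot *slot = &txr->slot[txr->cur];
 *		// fill NETMAP_BUF(txr, slot->buf_idx), set slot->len
 *		txr->head = txr->cur = nm_ring_next(txr, txr->cur);
 *	}
 *	ioctl(fd, NIOCTXSYNC, NULL);				// step 5
 *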
63
64		SYNCHRONIZATION (USER)
65
66The netmap rings and data structures may be shared among multiple
67user threads or even independent processes.
68Any synchronization among those threads/processes is delegated
69to the threads themselves. Only one thread at a time can be in
70a system call on the same netmap ring. The OS does not enforce
71this and only guarantees against system crashes in case of
72invalid usage.
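
For example (purely illustrative, not something provided by this module),
two threads sharing the same TX ring could serialize both the ring updates
and the corresponding system calls with a mutex of their own:

	pthread_mutex_lock(&my_txring_lock);
	// ... fill slots, advance head/cur ...
	ioctl(fd, NIOCTXSYNC, NULL);
	pthread_mutex_unlock(&my_txring_lock);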
73
74		LOCKING (INTERNAL)
75
76Within the kernel, access to the netmap rings is protected as follows:
77
78- a spinlock on each ring, to handle producer/consumer races on
79  RX rings attached to the host stack (against multiple host
80  threads writing from the host stack to the same ring),
81  and on 'destination' rings attached to a VALE switch
82  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
  protecting multiple active senders for the same destination.
84
85- an atomic variable to guarantee that there is at most one
86  instance of *_*xsync() on the ring at any time.
87  For rings connected to user file
88  descriptors, an atomic_test_and_set() protects this, and the
89  lock on the ring is not actually used.
90  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
91  is also used to prevent multiple executions (the driver might indeed
92  already guarantee this).
93  For NIC TX rings connected to a VALE switch, the lock arbitrates
94  access to the queue (both when allocating buffers and when pushing
95  them out).
96
97- *xsync() should be protected against initializations of the card.
98  On FreeBSD most devices have the reset routine protected by
99  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(); this should be added.
101
102  On linux there is an external lock on the tx path, which probably
103  also arbitrates access to the reset routine. XXX to be revised
104
105- a per-interface core_lock protecting access from the host stack
106  while interfaces may be detached from netmap mode.
107  XXX there should be no need for this lock if we detach the interfaces
108  only while they are down.
109
110
111--- VALE SWITCH ---
112
113NMG_LOCK() serializes all modifications to switches and ports.
114A switch cannot be deleted until all ports are gone.
115
116For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
118lock is acquired in exclusive mode (after holding NMG_LOCK).
119When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
120The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
122Hence it is important that sleepable shared locks are used.
123
124On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
126packets are copied from source to destination, and then
127the lock is acquired again and the receive ring is updated.
128(A similar thing is done on the tx ring for NIC and host stack
129ports attached to the switch)
130
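In pseudocode (an illustrative sketch, not the actual function names):

	lock(dst_kring);
	j = reserve n slots (a lease) starting at the current tail;
	unlock(dst_kring);
	// copy the packets into slots j .. j+n-1, no lock held
	lock(dst_kring);
	// once all earlier leases are complete, advance the ring
	// pointers to publish the new slots and notify the receiver
	unlock(dst_kring);
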
131 */
132
133
134/* --- internals ----
135 *
136 * Roadmap to the code that implements the above.
137 *
138 * > 1. a process/thread issues one or more open() on /dev/netmap, to create
139 * >    select()able file descriptor on which events are reported.
140 *
141 *  	Internally, we allocate a netmap_priv_d structure, that will be
142 *  	initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
143 *  	structure for each open().
144 *
145 *      os-specific:
146 *  	    FreeBSD: see netmap_open() (netmap_freebsd.c)
147 *  	    linux:   see linux_netmap_open() (netmap_linux.c)
148 *
149 * > 2. on each descriptor, the process issues an ioctl() to identify
150 * >    the interface that should report events to the file descriptor.
151 *
152 * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
153 * 	Most important things happen in netmap_get_na() and
154 * 	netmap_do_regif(), called from there. Additional details can be
155 * 	found in the comments above those functions.
156 *
157 * 	In all cases, this action creates/takes-a-reference-to a
158 * 	netmap_*_adapter describing the port, and allocates a netmap_if
159 * 	and all necessary netmap rings, filling them with netmap buffers.
160 *
161 *      In this phase, the sync callbacks for each ring are set (these are used
162 *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
163 *      The adapter creation/initialization code puts them in the
164 * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
165 * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
166 * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
167 * 	actually call netmap_krings_create() to perform this and the other
168 * 	common stuff. netmap_krings_create() also takes care of the host rings,
169 * 	if needed, by setting their sync callbacks appropriately.
170 *
171 * 	Additional actions depend on the kind of netmap_adapter that has been
172 * 	registered:
173 *
174 * 	- netmap_hw_adapter:  	     [netmap.c]
175 * 	     This is a system netdev/ifp with native netmap support.
176 * 	     The ifp is detached from the host stack by redirecting:
177 * 	       - transmissions (from the network stack) to netmap_transmit()
178 * 	       - receive notifications to the nm_notify() callback for
179 * 	         this adapter. The callback is normally netmap_notify(), unless
180 * 	         the ifp is attached to a bridge using bwrap, in which case it
181 * 	         is netmap_bwrap_intr_notify().
182 *
183 * 	- netmap_generic_adapter:      [netmap_generic.c]
184 * 	      A system netdev/ifp without native netmap support.
185 *
 * 	(the decision about native/non-native support is taken in
187 * 	 netmap_get_hw_na(), called by netmap_get_na())
188 *
189 * 	- netmap_vp_adapter 		[netmap_vale.c]
190 * 	      Returned by netmap_get_bdg_na().
191 * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
192 * 	      are created on the fly if they don't already exist, and are
193 * 	      always attached to a bridge.
 * 	      Persistent VALE ports must be created separately, and
 * 	      then attached like normal NICs. The NIOCREGIF we are examining
 * 	      will find them only if they had previously been created and
197 * 	      attached (see VALE_CTL below).
198 *
199 * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
200 * 	      Returned by netmap_get_pipe_na().
201 * 	      Both pipe ends are created, if they didn't already exist.
202 *
203 * 	- netmap_monitor_adapter      [netmap_monitor.c]
204 * 	      Returned by netmap_get_monitor_na().
205 * 	      If successful, the nm_sync callbacks of the monitored adapter
206 * 	      will be intercepted by the returned monitor.
207 *
208 * 	- netmap_bwrap_adapter	      [netmap_vale.c]
209 * 	      Cannot be obtained in this way, see VALE_CTL below
210 *
211 *
212 * 	os-specific:
213 * 	    linux: we first go through linux_netmap_ioctl() to
214 * 	           adapt the FreeBSD interface to the linux one.
215 *
216 *
217 * > 3. on each descriptor, the process issues an mmap() request to
218 * >    map the shared memory region within the process' address space.
219 * >    The list of interesting queues is indicated by a location in
220 * >    the shared memory region.
221 *
222 *      os-specific:
223 *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
224 *  	    linux:   linux_netmap_mmap (netmap_linux.c).
225 *
226 * > 4. using the functions in the netmap(4) userspace API, a process
227 * >    can look up the occupation state of a queue, access memory buffers,
228 * >    and retrieve received packets or enqueue packets to transmit.
229 *
 * 	These actions do not involve the kernel.
231 *
232 * > 5. using some ioctl()s the process can synchronize the userspace view
233 * >    of the queue with the actual status in the kernel. This includes both
234 * >    receiving the notification of new packets, and transmitting new
235 * >    packets on the output interface.
236 *
237 * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
238 * 	cases. They invoke the nm_sync callbacks on the netmap_kring
239 * 	structures, as initialized in step 2 and maybe later modified
240 * 	by a monitor. Monitors, however, will always call the original
241 * 	callback before doing anything else.
242 *
243 *
244 * > 6. select() or poll() can be used to wait for events on individual
245 * >    transmit or receive queues (or all queues for a given interface).
246 *
247 * 	Implemented in netmap_poll(). This will call the same nm_sync()
248 * 	callbacks as in step 5 above.
249 *
250 * 	os-specific:
251 * 		linux: we first go through linux_netmap_poll() to adapt
252 * 		       the FreeBSD interface to the linux one.
253 *
254 *
255 *  ----  VALE_CTL -----
256 *
257 *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
258 *  nr_cmd in the nmreq structure. These subcommands are handled by
259 *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
260 *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
261 *  subcommands, respectively.
262 *
263 *  Any network interface known to the system (including a persistent VALE
264 *  port) can be attached to a VALE switch by issuing the
265 *  NETMAP_REQ_VALE_ATTACH command. After the attachment, persistent VALE ports
266 *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
267 *  attachment of other interfaces, instead, requires the creation of a
268 *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
269 *  netmap mode. This may require the creation of a netmap_generic_adapter if
270 *  we have no native support for the interface, or if generic adapters have
271 *  been forced by sysctl.
272 *
273 *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
274 *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
275 *  callback.  In the case of the bwrap, the callback creates the
276 *  netmap_bwrap_adapter.  The initialization of the bwrap is then
277 *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
278 *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
279 *  A generic adapter for the wrapped ifp will be created if needed, when
280 *  netmap_get_bdg_na() calls netmap_get_hw_na().
281 *
282 *
283 *  ---- DATAPATHS -----
284 *
285 *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
286 *
287 *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
288 *
289 *    - tx from netmap userspace:
290 *	 concurrently:
291 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
292 *                kring->nm_sync() == DEVICE_netmap_txsync()
293 *           2) device interrupt handler
294 *                na->nm_notify()  == netmap_notify()
295 *    - rx from netmap userspace:
296 *       concurrently:
297 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
298 *                kring->nm_sync() == DEVICE_netmap_rxsync()
299 *           2) device interrupt handler
300 *                na->nm_notify()  == netmap_notify()
301 *    - rx from host stack
302 *       concurrently:
303 *           1) host stack
304 *                netmap_transmit()
305 *                  na->nm_notify  == netmap_notify()
306 *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
307 *                kring->nm_sync() == netmap_rxsync_from_host
308 *                  netmap_rxsync_from_host(na, NULL, NULL)
309 *    - tx to host stack
310 *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
311 *             kring->nm_sync() == netmap_txsync_to_host
312 *               netmap_txsync_to_host(na)
313 *                 nm_os_send_up()
314 *                   FreeBSD: na->if_input() == ether_input()
315 *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
316 *
317 *
318 *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
319 *
320 *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
321 *
322 *    - tx from netmap userspace:
323 *       concurrently:
324 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
325 *               kring->nm_sync() == generic_netmap_txsync()
326 *                   nm_os_generic_xmit_frame()
327 *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
328 *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
329 *                               gna->save_start_xmit == orig. dev. start_xmit
330 *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
331 *           2) generic_mbuf_destructor()
332 *                   na->nm_notify() == netmap_notify()
333 *    - rx from netmap userspace:
334 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
335 *               kring->nm_sync() == generic_netmap_rxsync()
336 *                   mbq_safe_dequeue()
337 *           2) device driver
338 *               generic_rx_handler()
339 *                   mbq_safe_enqueue()
340 *                   na->nm_notify() == netmap_notify()
341 *    - rx from host stack
342 *        FreeBSD: same as native
343 *        Linux: same as native except:
344 *           1) host stack
345 *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
346 *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
347 *                       netmap_transmit()
348 *                           na->nm_notify() == netmap_notify()
349 *    - tx to host stack (same as native):
350 *
351 *
352 *                           -= VALE =-
353 *
354 *   INCOMING:
355 *
356 *      - VALE ports:
357 *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
358 *              kring->nm_sync() == netmap_vp_txsync()
359 *
360 *      - system device with native support:
361 *         from cable:
362 *             interrupt
363 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
364 *                     kring->nm_sync() == DEVICE_netmap_rxsync()
365 *                     netmap_vp_txsync()
366 *                     kring->nm_sync() == DEVICE_netmap_rxsync()
367 *         from host stack:
368 *             netmap_transmit()
369 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
370 *                     kring->nm_sync() == netmap_rxsync_from_host()
371 *                     netmap_vp_txsync()
372 *
373 *      - system device with generic support:
374 *         from device driver:
375 *            generic_rx_handler()
376 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
377 *                     kring->nm_sync() == generic_netmap_rxsync()
378 *                     netmap_vp_txsync()
379 *                     kring->nm_sync() == generic_netmap_rxsync()
380 *         from host stack:
381 *            netmap_transmit()
382 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
383 *                     kring->nm_sync() == netmap_rxsync_from_host()
384 *                     netmap_vp_txsync()
385 *
386 *   (all cases) --> nm_bdg_flush()
387 *                      dest_na->nm_notify() == (see below)
388 *
389 *   OUTGOING:
390 *
391 *      - VALE ports:
392 *         concurrently:
393 *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
394 *                    kring->nm_sync() == netmap_vp_rxsync()
395 *             2) from nm_bdg_flush()
396 *                    na->nm_notify() == netmap_notify()
397 *
398 *      - system device with native support:
399 *          to cable:
400 *             na->nm_notify() == netmap_bwrap_notify()
401 *                 netmap_vp_rxsync()
402 *                 kring->nm_sync() == DEVICE_netmap_txsync()
403 *                 netmap_vp_rxsync()
404 *          to host stack:
405 *                 netmap_vp_rxsync()
406 *                 kring->nm_sync() == netmap_txsync_to_host
407 *                 netmap_vp_rxsync_locked()
408 *
409 *      - system device with generic adapter:
410 *          to device driver:
411 *             na->nm_notify() == netmap_bwrap_notify()
412 *                 netmap_vp_rxsync()
413 *                 kring->nm_sync() == generic_netmap_txsync()
414 *                 netmap_vp_rxsync()
415 *          to host stack:
416 *                 netmap_vp_rxsync()
417 *                 kring->nm_sync() == netmap_txsync_to_host
418 *                 netmap_vp_rxsync()
419 *
420 */
421
422/*
423 * OS-specific code that is used only within this file.
424 * Other OS-specific code that must be accessed by drivers
425 * is present in netmap_kern.h
426 */
427
428#if defined(__FreeBSD__)
429#include <sys/cdefs.h> /* prerequisite */
430#include <sys/types.h>
431#include <sys/errno.h>
432#include <sys/param.h>	/* defines used in kernel.h */
433#include <sys/kernel.h>	/* types used in module initialization */
434#include <sys/conf.h>	/* cdevsw struct, UID, GID */
435#include <sys/filio.h>	/* FIONBIO */
436#include <sys/sockio.h>
437#include <sys/socketvar.h>	/* struct socket */
438#include <sys/malloc.h>
439#include <sys/poll.h>
440#include <sys/rwlock.h>
441#include <sys/socket.h> /* sockaddrs */
442#include <sys/selinfo.h>
443#include <sys/sysctl.h>
444#include <sys/jail.h>
445#include <net/vnet.h>
446#include <net/if.h>
447#include <net/if_var.h>
448#include <net/bpf.h>		/* BIOCIMMEDIATE */
449#include <machine/bus.h>	/* bus_dmamap_* */
450#include <sys/endian.h>
451#include <sys/refcount.h>
452#include <net/ethernet.h>	/* ETHER_BPF_MTAP */
453
454
455#elif defined(linux)
456
457#include "bsd_glue.h"
458
459#elif defined(__APPLE__)
460
461#warning OSX support is only partial
462#include "osx_glue.h"
463
464#elif defined (_WIN32)
465
466#include "win_glue.h"
467
468#else
469
470#error	Unsupported platform
471
472#endif /* unsupported */
473
474/*
475 * common headers
476 */
477#include <net/netmap.h>
478#include <dev/netmap/netmap_kern.h>
479#include <dev/netmap/netmap_mem2.h>
480
481
482/* user-controlled variables */
483int netmap_verbose;
484#ifdef CONFIG_NETMAP_DEBUG
485int netmap_debug;
486#endif /* CONFIG_NETMAP_DEBUG */
487
488static int netmap_no_timestamp; /* don't timestamp on rxsync */
489int netmap_no_pendintr = 1;
490int netmap_txsync_retry = 2;
491static int netmap_fwd = 0;	/* force transparent forwarding */
492
493/*
494 * netmap_admode selects the netmap mode to use.
495 * Invalid values are reset to NETMAP_ADMODE_BEST
496 */
497enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
498	NETMAP_ADMODE_NATIVE,	/* either native or none */
499	NETMAP_ADMODE_GENERIC,	/* force generic */
500	NETMAP_ADMODE_LAST };
501static int netmap_admode = NETMAP_ADMODE_BEST;
502
503/* netmap_generic_mit controls mitigation of RX notifications for
504 * the generic netmap adapter. The value is a time interval in
505 * nanoseconds. */
506int netmap_generic_mit = 100*1000;
507
/* By default we use netmap-aware qdiscs with generic netmap adapters,
 * even though there can be a small performance hit with hardware NICs.
 * However, using the qdisc is the safer approach, for two reasons:
 * 1) it prevents non-fifo qdiscs from breaking the TX notification
 *    scheme, which is based on mbuf destructors when txqdisc is
 *    not used.
 * 2) it makes it possible to transmit over software devices that
 *    change skb->dev, like bridge, veth, ...
 *
 * In any case, users looking for the best performance should
 * use native adapters.
519 */
520#ifdef linux
521int netmap_generic_txqdisc = 1;
522#endif
523
524/* Default number of slots and queues for generic adapters. */
525int netmap_generic_ringsize = 1024;
526int netmap_generic_rings = 1;
527
528/* Non-zero to enable checksum offloading in NIC drivers */
529int netmap_generic_hwcsum = 0;
530
531/* Non-zero if ptnet devices are allowed to use virtio-net headers. */
532int ptnet_vnet_hdr = 1;
533
534/*
535 * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
536 * in some other operating systems
537 */
538SYSBEGIN(main_init);
539
540SYSCTL_DECL(_dev_netmap);
541SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
542SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
543		CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
544#ifdef CONFIG_NETMAP_DEBUG
545SYSCTL_INT(_dev_netmap, OID_AUTO, debug,
546		CTLFLAG_RW, &netmap_debug, 0, "Debug messages");
547#endif /* CONFIG_NETMAP_DEBUG */
548SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
549		CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
550SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
551		0, "Always look for new received packets.");
552SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
553		&netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");
554
555SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
556		"Force NR_FORWARD mode");
557SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
558		"Adapter mode. 0 selects the best option available,"
559		"1 forces native adapter, 2 forces emulated adapter");
560SYSCTL_INT(_dev_netmap, OID_AUTO, generic_hwcsum, CTLFLAG_RW, &netmap_generic_hwcsum,
561		0, "Hardware checksums. 0 to disable checksum generation by the NIC (default),"
562		"1 to enable checksum generation by the NIC");
563SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
564		0, "RX notification interval in nanoseconds");
565SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
566		&netmap_generic_ringsize, 0,
567		"Number of per-ring slots for emulated netmap mode");
568SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
569		&netmap_generic_rings, 0,
570		"Number of TX/RX queues for emulated netmap adapters");
571#ifdef linux
572SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
573		&netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
574#endif
575SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
576		0, "Allow ptnet devices to use virtio-net headers");
577
578SYSEND;
579
580NMG_LOCK_T	netmap_global_lock;
581
582/*
583 * mark the ring as stopped, and run through the locks
584 * to make sure other users get to see it.
 * stopped must be either NM_KR_STOPPED (for an unbounded stop)
 * or NM_KR_LOCKED (a brief stop for mutual exclusion purposes)
587 */
588static void
589netmap_disable_ring(struct netmap_kring *kr, int stopped)
590{
591	nm_kr_stop(kr, stopped);
592	// XXX check if nm_kr_stop is sufficient
593	mtx_lock(&kr->q_lock);
594	mtx_unlock(&kr->q_lock);
595	nm_kr_put(kr);
596}
597
598/* stop or enable a single ring */
599void
600netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
601{
602	if (stopped)
603		netmap_disable_ring(NMR(na, t)[ring_id], stopped);
604	else
605		NMR(na, t)[ring_id]->nkr_stopped = 0;
606}
607
608
609/* stop or enable all the rings of na */
610void
611netmap_set_all_rings(struct netmap_adapter *na, int stopped)
612{
613	int i;
614	enum txrx t;
615
616	if (!nm_netmap_on(na))
617		return;
618
619	if (netmap_verbose) {
620		nm_prinf("%s: %sable all rings", na->name,
621		    (stopped ? "dis" : "en"));
622	}
623	for_rx_tx(t) {
624		for (i = 0; i < netmap_real_rings(na, t); i++) {
625			netmap_set_ring(na, i, t, stopped);
626		}
627	}
628}
629
630/*
631 * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
 * to finish and prevents any new ones from starting.  Call this before turning
 * netmap mode off, or before removing the hardware rings (e.g., on module
 * unload).
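 *
 * As an illustration only, a device reset path in a hypothetical driver
 * could bracket the hardware reinitialization like this:
 *
 *	netmap_disable_all_rings(ifp);
 *	// ... stop the device and reinitialize the hardware rings ...
 *	netmap_enable_all_rings(ifp);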
635 */
636void
637netmap_disable_all_rings(struct ifnet *ifp)
638{
639	if (NM_NA_VALID(ifp)) {
640		netmap_set_all_rings(NA(ifp), NM_KR_LOCKED);
641	}
642}
643
644/*
645 * Convenience function used in drivers.  Re-enables rxsync and txsync on the
 * adapter's rings.  In Linux drivers, this should be placed near each
647 * napi_enable().
648 */
649void
650netmap_enable_all_rings(struct ifnet *ifp)
651{
652	if (NM_NA_VALID(ifp)) {
653		netmap_set_all_rings(NA(ifp), 0 /* enabled */);
654	}
655}
656
657void
658netmap_make_zombie(struct ifnet *ifp)
659{
660	if (NM_NA_VALID(ifp)) {
661		struct netmap_adapter *na = NA(ifp);
662		netmap_set_all_rings(na, NM_KR_LOCKED);
663		na->na_flags |= NAF_ZOMBIE;
664		netmap_set_all_rings(na, 0);
665	}
666}
667
668void
669netmap_undo_zombie(struct ifnet *ifp)
670{
671	if (NM_NA_VALID(ifp)) {
672		struct netmap_adapter *na = NA(ifp);
673		if (na->na_flags & NAF_ZOMBIE) {
674			netmap_set_all_rings(na, NM_KR_LOCKED);
675			na->na_flags &= ~NAF_ZOMBIE;
676			netmap_set_all_rings(na, 0);
677		}
678	}
679}
680
681/*
 * generic bounds-checking function
683 */
684u_int
685nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
686{
687	u_int oldv = *v;
688	const char *op = NULL;
689
690	if (dflt < lo)
691		dflt = lo;
692	if (dflt > hi)
693		dflt = hi;
694	if (oldv < lo) {
695		*v = dflt;
696		op = "Bump";
697	} else if (oldv > hi) {
698		*v = hi;
699		op = "Clamp";
700	}
701	if (op && msg)
702		nm_prinf("%s %s to %d (was %d)", op, msg, *v, oldv);
703	return *v;
704}
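
/*
 * Example (illustrative only, the bounds below are hypothetical): a module
 * could sanity-check a user-controlled parameter at init time with
 *
 *	nm_bound_var(&netmap_generic_ringsize, 1024, 64, 16384,
 *	    "generic_ringsize");
 *
 * which bumps too-small values to the default, clamps too-large values
 * to the maximum, and logs the adjustment.
 */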
705
706
707/*
708 * packet-dump function, user-supplied or static buffer.
709 * The destination buffer must be at least 30+4*len
710 */
711const char *
712nm_dump_buf(char *p, int len, int lim, char *dst)
713{
714	static char _dst[8192];
715	int i, j, i0;
716	static char hex[] ="0123456789abcdef";
717	char *o;	/* output position */
718
719#define P_HI(x)	hex[((x) & 0xf0)>>4]
720#define P_LO(x)	hex[((x) & 0xf)]
721#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
722	if (!dst)
723		dst = _dst;
724	if (lim <= 0 || lim > len)
725		lim = len;
726	o = dst;
727	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
728	o += strlen(o);
729	/* hexdump routine */
730	for (i = 0; i < lim; ) {
731		sprintf(o, "%5d: ", i);
732		o += strlen(o);
733		memset(o, ' ', 48);
734		i0 = i;
735		for (j=0; j < 16 && i < lim; i++, j++) {
736			o[j*3] = P_HI(p[i]);
737			o[j*3+1] = P_LO(p[i]);
738		}
739		i = i0;
740		for (j=0; j < 16 && i < lim; i++, j++)
741			o[j + 48] = P_C(p[i]);
742		o[j+48] = '\n';
743		o += j+49;
744	}
745	*o = '\0';
746#undef P_HI
747#undef P_LO
748#undef P_C
749	return dst;
750}
751
752
753/*
754 * Fetch configuration from the device, to cope with dynamic
755 * reconfigurations after loading the module.
756 */
757/* call with NMG_LOCK held */
758int
759netmap_update_config(struct netmap_adapter *na)
760{
761	struct nm_config_info info;
762
763	bzero(&info, sizeof(info));
764	if (na->nm_config == NULL ||
765	    na->nm_config(na, &info)) {
766		/* take whatever we had at init time */
767		info.num_tx_rings = na->num_tx_rings;
768		info.num_tx_descs = na->num_tx_desc;
769		info.num_rx_rings = na->num_rx_rings;
770		info.num_rx_descs = na->num_rx_desc;
771		info.rx_buf_maxsize = na->rx_buf_maxsize;
772	}
773
774	if (na->num_tx_rings == info.num_tx_rings &&
775	    na->num_tx_desc == info.num_tx_descs &&
776	    na->num_rx_rings == info.num_rx_rings &&
777	    na->num_rx_desc == info.num_rx_descs &&
778	    na->rx_buf_maxsize == info.rx_buf_maxsize)
779		return 0; /* nothing changed */
780	if (na->active_fds == 0) {
781		na->num_tx_rings = info.num_tx_rings;
782		na->num_tx_desc = info.num_tx_descs;
783		na->num_rx_rings = info.num_rx_rings;
784		na->num_rx_desc = info.num_rx_descs;
785		na->rx_buf_maxsize = info.rx_buf_maxsize;
786		if (netmap_verbose)
787			nm_prinf("configuration changed for %s: txring %d x %d, "
788				"rxring %d x %d, rxbufsz %d",
789				na->name, na->num_tx_rings, na->num_tx_desc,
790				na->num_rx_rings, na->num_rx_desc, na->rx_buf_maxsize);
791		return 0;
792	}
793	nm_prerr("WARNING: configuration changed for %s while active: "
794		"txring %d x %d, rxring %d x %d, rxbufsz %d",
795		na->name, info.num_tx_rings, info.num_tx_descs,
796		info.num_rx_rings, info.num_rx_descs,
797		info.rx_buf_maxsize);
798	return 1;
799}
800
801/* nm_sync callbacks for the host rings */
802static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
803static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
804
805/* create the krings array and initialize the fields common to all adapters.
806 * The array layout is this:
807 *
808 *                    +----------+
809 * na->tx_rings ----->|          | \
 *                    |          |  } na->num_tx_rings
811 *                    |          | /
812 *                    +----------+
813 *                    |          |    host tx kring
814 * na->rx_rings ----> +----------+
815 *                    |          | \
816 *                    |          |  } na->num_rx_rings
817 *                    |          | /
818 *                    +----------+
819 *                    |          |    host rx kring
820 *                    +----------+
821 * na->tailroom ----->|          | \
822 *                    |          |  } tailroom bytes
823 *                    |          | /
824 *                    +----------+
825 *
826 * Note: for compatibility, host krings are created even when not needed.
827 * The tailroom space is currently used by vale ports for allocating leases.
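 *
 * With this layout, for example:
 *
 *	na->tx_rings[i]                          i-th hardware TX kring
 *	na->tx_rings[nma_get_nrings(na, NR_TX)]  first host TX kring
 *	NMR(na, t)[i]                            same lookup, TX/RX agnostic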
828 */
829/* call with NMG_LOCK held */
830int
831netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
832{
833	u_int i, len, ndesc;
834	struct netmap_kring *kring;
835	u_int n[NR_TXRX];
836	enum txrx t;
837	int err = 0;
838
839	if (na->tx_rings != NULL) {
840		if (netmap_debug & NM_DEBUG_ON)
841			nm_prerr("warning: krings were already created");
842		return 0;
843	}
844
845	/* account for the (possibly fake) host rings */
846	n[NR_TX] = netmap_all_rings(na, NR_TX);
847	n[NR_RX] = netmap_all_rings(na, NR_RX);
848
849	len = (n[NR_TX] + n[NR_RX]) *
850		(sizeof(struct netmap_kring) + sizeof(struct netmap_kring *))
851		+ tailroom;
852
853	na->tx_rings = nm_os_malloc((size_t)len);
854	if (na->tx_rings == NULL) {
855		nm_prerr("Cannot allocate krings");
856		return ENOMEM;
857	}
858	na->rx_rings = na->tx_rings + n[NR_TX];
859	na->tailroom = na->rx_rings + n[NR_RX];
860
861	/* link the krings in the krings array */
862	kring = (struct netmap_kring *)((char *)na->tailroom + tailroom);
863	for (i = 0; i < n[NR_TX] + n[NR_RX]; i++) {
864		na->tx_rings[i] = kring;
865		kring++;
866	}
867
	/*
	 * All fields in krings are 0 except the ones initialized below,
	 * but it is better to be explicit about the important kring fields.
	 */
872	for_rx_tx(t) {
873		ndesc = nma_get_ndesc(na, t);
874		for (i = 0; i < n[t]; i++) {
875			kring = NMR(na, t)[i];
876			bzero(kring, sizeof(*kring));
877			kring->notify_na = na;
878			kring->ring_id = i;
879			kring->tx = t;
880			kring->nkr_num_slots = ndesc;
881			kring->nr_mode = NKR_NETMAP_OFF;
882			kring->nr_pending_mode = NKR_NETMAP_OFF;
883			if (i < nma_get_nrings(na, t)) {
884				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
885			} else {
886				if (!(na->na_flags & NAF_HOST_RINGS))
887					kring->nr_kflags |= NKR_FAKERING;
888				kring->nm_sync = (t == NR_TX ?
889						netmap_txsync_to_host:
890						netmap_rxsync_from_host);
891			}
892			kring->nm_notify = na->nm_notify;
893			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
894			/*
895			 * IMPORTANT: Always keep one slot empty.
896			 */
897			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
898			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
899					nm_txrx2str(t), i);
900			nm_prdis("ktx %s h %d c %d t %d",
901				kring->name, kring->rhead, kring->rcur, kring->rtail);
902			err = nm_os_selinfo_init(&kring->si, kring->name);
903			if (err) {
904				netmap_krings_delete(na);
905				return err;
906			}
907			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
908			kring->na = na;	/* setting this field marks the mutex as initialized */
909		}
910		err = nm_os_selinfo_init(&na->si[t], na->name);
911		if (err) {
912			netmap_krings_delete(na);
913			return err;
914		}
915	}
916
917	return 0;
918}
919
920
921/* undo the actions performed by netmap_krings_create */
922/* call with NMG_LOCK held */
923void
924netmap_krings_delete(struct netmap_adapter *na)
925{
926	struct netmap_kring **kring = na->tx_rings;
927	enum txrx t;
928
929	if (na->tx_rings == NULL) {
930		if (netmap_debug & NM_DEBUG_ON)
931			nm_prerr("warning: krings were already deleted");
932		return;
933	}
934
935	for_rx_tx(t)
936		nm_os_selinfo_uninit(&na->si[t]);
937
938	/* we rely on the krings layout described above */
939	for ( ; kring != na->tailroom; kring++) {
940		if ((*kring)->na != NULL)
941			mtx_destroy(&(*kring)->q_lock);
942		nm_os_selinfo_uninit(&(*kring)->si);
943	}
944	nm_os_free(na->tx_rings);
945	na->tx_rings = na->rx_rings = na->tailroom = NULL;
946}
947
948
949/*
950 * Destructor for NIC ports. They also have an mbuf queue
951 * on the rings connected to the host so we need to purge
952 * them first.
953 */
954/* call with NMG_LOCK held */
955void
956netmap_hw_krings_delete(struct netmap_adapter *na)
957{
958	u_int lim = netmap_real_rings(na, NR_RX), i;
959
960	for (i = nma_get_nrings(na, NR_RX); i < lim; i++) {
961		struct mbq *q = &NMR(na, NR_RX)[i]->rx_queue;
962		nm_prdis("destroy sw mbq with len %d", mbq_len(q));
963		mbq_purge(q);
964		mbq_safe_fini(q);
965	}
966	netmap_krings_delete(na);
967}
968
969static void
970netmap_mem_drop(struct netmap_adapter *na)
971{
972	int last = netmap_mem_deref(na->nm_mem, na);
	/* if the native allocator had been overridden on regif,
974	 * restore it now and drop the temporary one
975	 */
976	if (last && na->nm_mem_prev) {
977		netmap_mem_put(na->nm_mem);
978		na->nm_mem = na->nm_mem_prev;
979		na->nm_mem_prev = NULL;
980	}
981}
982
983/*
984 * Undo everything that was done in netmap_do_regif(). In particular,
 * call na->nm_register(na, 0) to stop netmap mode on the interface and
986 * revert to normal operation.
987 */
988/* call with NMG_LOCK held */
989static void netmap_unset_ringid(struct netmap_priv_d *);
990static void netmap_krings_put(struct netmap_priv_d *);
991void
992netmap_do_unregif(struct netmap_priv_d *priv)
993{
994	struct netmap_adapter *na = priv->np_na;
995
996	NMG_LOCK_ASSERT();
997	na->active_fds--;
998	/* unset nr_pending_mode and possibly release exclusive mode */
999	netmap_krings_put(priv);
1000
1001#ifdef	WITH_MONITOR
1002	/* XXX check whether we have to do something with monitor
1003	 * when rings change nr_mode. */
1004	if (na->active_fds <= 0) {
1005		/* walk through all the rings and tell any monitor
1006		 * that the port is going to exit netmap mode
1007		 */
1008		netmap_monitor_stop(na);
1009	}
1010#endif
1011
1012	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
1013		na->nm_register(na, 0);
1014	}
1015
1016	/* delete rings and buffers that are no longer needed */
1017	netmap_mem_rings_delete(na);
1018
1019	if (na->active_fds <= 0) {	/* last instance */
1020		/*
1021		 * (TO CHECK) We enter here
1022		 * when the last reference to this file descriptor goes
1023		 * away. This means we cannot have any pending poll()
1024		 * or interrupt routine operating on the structure.
1025		 * XXX The file may be closed in a thread while
1026		 * another thread is using it.
1027		 * Linux keeps the file opened until the last reference
1028		 * by any outstanding ioctl/poll or mmap is gone.
1029		 * FreeBSD does not track mmap()s (but we do) and
1030		 * wakes up any sleeping poll(). Need to check what
1031		 * happens if the close() occurs while a concurrent
1032		 * syscall is running.
1033		 */
1034		if (netmap_debug & NM_DEBUG_ON)
1035			nm_prinf("deleting last instance for %s", na->name);
1036
1037		if (nm_netmap_on(na)) {
1038			nm_prerr("BUG: netmap on while going to delete the krings");
1039		}
1040
1041		na->nm_krings_delete(na);
1042
1043		/* restore the default number of host tx and rx rings */
1044		if (na->na_flags & NAF_HOST_RINGS) {
1045			na->num_host_tx_rings = 1;
1046			na->num_host_rx_rings = 1;
1047		} else {
1048			na->num_host_tx_rings = 0;
1049			na->num_host_rx_rings = 0;
1050		}
1051	}
1052
	/* possibly decrement the counter of tx_si/rx_si users */
1054	netmap_unset_ringid(priv);
1055	/* delete the nifp */
1056	netmap_mem_if_delete(na, priv->np_nifp);
1057	/* drop the allocator */
1058	netmap_mem_drop(na);
1059	/* mark the priv as unregistered */
1060	priv->np_na = NULL;
1061	priv->np_nifp = NULL;
1062}
1063
1064struct netmap_priv_d*
1065netmap_priv_new(void)
1066{
1067	struct netmap_priv_d *priv;
1068
1069	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
1070	if (priv == NULL)
1071		return NULL;
1072	priv->np_refs = 1;
1073	nm_os_get_module();
1074	return priv;
1075}
1076
/*
 * Destructor of the netmap_priv_d, called when the fd is closed.
 * Action: undo all the things done by NIOCREGIF.
 * On FreeBSD we need to track whether there are active mmap()s,
 * and we use np_active_mmaps for that. On Linux, the field is always 0.
 */
1085/* call with NMG_LOCK held */
1086void
1087netmap_priv_delete(struct netmap_priv_d *priv)
1088{
1089	struct netmap_adapter *na = priv->np_na;
1090
1091	/* number of active references to this fd */
1092	if (--priv->np_refs > 0) {
1093		return;
1094	}
1095	nm_os_put_module();
1096	if (na) {
1097		netmap_do_unregif(priv);
1098	}
1099	netmap_unget_na(na, priv->np_ifp);
1100	bzero(priv, sizeof(*priv));	/* for safety */
1101	nm_os_free(priv);
1102}
1103
1104
1105/* call with NMG_LOCK *not* held */
1106void
1107netmap_dtor(void *data)
1108{
1109	struct netmap_priv_d *priv = data;
1110
1111	NMG_LOCK();
1112	netmap_priv_delete(priv);
1113	NMG_UNLOCK();
1114}
1115
1116
1117/*
1118 * Handlers for synchronization of the rings from/to the host stack.
1119 * These are associated to a network interface and are just another
1120 * ring pair managed by userspace.
1121 *
1122 * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
1123 * flags):
1124 *
1125 * - Before releasing buffers on hw RX rings, the application can mark
 *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
 *   will be forwarded to the host stack, similarly to what would happen if
 *   the application had moved them to the host TX ring.
1129 *
1130 * - Before releasing buffers on the host RX ring, the application can
1131 *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
1132 *   they will be forwarded to the hw TX rings, saving the application
1133 *   from doing the same task in user-space.
1134 *
 * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
1136 * flag, or globally with the netmap_fwd sysctl.
1137 *
1138 * The transfer NIC --> host is relatively easy, just encapsulate
1139 * into mbufs and we are done. The host --> NIC side is slightly
1140 * harder because there might not be room in the tx ring so it
1141 * might take a while before releasing the buffer.
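 *
 * From the application's point of view this is just a sketch (the exact
 * selection logic is up to userspace):
 *
 *	rxring->flags |= NR_FORWARD;		// enable on this ring
 *	while (!nm_ring_empty(rxring)) {
 *		i = rxring->cur;
 *		slot = &rxring->slot[i];
 *		if (the packet is not interesting for the application)
 *			slot->flags |= NS_FORWARD;   // hand it to the host stack
 *		rxring->head = rxring->cur = nm_ring_next(rxring, i);
 *	}
 *	// the marked buffers are forwarded during the next rxsync/poll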
1142 */
1143
1144
1145/*
1146 * Pass a whole queue of mbufs to the host stack as coming from 'dst'
1147 * We do not need to lock because the queue is private.
1148 * After this call the queue is empty.
1149 */
1150static void
1151netmap_send_up(struct ifnet *dst, struct mbq *q)
1152{
1153	struct mbuf *m;
1154	struct mbuf *head = NULL, *prev = NULL;
1155
1156	/* Send packets up, outside the lock; head/prev machinery
1157	 * is only useful for Windows. */
1158	while ((m = mbq_dequeue(q)) != NULL) {
1159		if (netmap_debug & NM_DEBUG_HOST)
1160			nm_prinf("sending up pkt %p size %d", m, MBUF_LEN(m));
1161		prev = nm_os_send_up(dst, m, prev);
1162		if (head == NULL)
1163			head = prev;
1164	}
1165	if (head)
1166		nm_os_send_up(dst, NULL, head);
1167	mbq_fini(q);
1168}
1169
1170
1171/*
1172 * Scan the buffers from hwcur to ring->head, and put a copy of those
1173 * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
1174 * Drop remaining packets in the unlikely event
1175 * of an mbuf shortage.
1176 */
1177static void
1178netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1179{
1180	u_int const lim = kring->nkr_num_slots - 1;
1181	u_int const head = kring->rhead;
1182	u_int n;
1183	struct netmap_adapter *na = kring->na;
1184
1185	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1186		struct mbuf *m;
1187		struct netmap_slot *slot = &kring->ring->slot[n];
1188
1189		if ((slot->flags & NS_FORWARD) == 0 && !force)
1190			continue;
1191		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1192			nm_prlim(5, "bad pkt at %d len %d", n, slot->len);
1193			continue;
1194		}
1195		slot->flags &= ~NS_FORWARD; // XXX needed ?
1196		/* XXX TODO: adapt to the case of a multisegment packet */
1197		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1198
1199		if (m == NULL)
1200			break;
1201		mbq_enqueue(q, m);
1202	}
1203}
1204
1205static inline int
1206_nm_may_forward(struct netmap_kring *kring)
1207{
1208	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
1209		 kring->na->na_flags & NAF_HOST_RINGS &&
1210		 kring->tx == NR_RX);
1211}
1212
1213static inline int
1214nm_may_forward_up(struct netmap_kring *kring)
1215{
1216	return	_nm_may_forward(kring) &&
1217		 kring->ring_id != kring->na->num_rx_rings;
1218}
1219
1220static inline int
1221nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
1222{
1223	return	_nm_may_forward(kring) &&
1224		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
1225		 kring->ring_id == kring->na->num_rx_rings;
1226}
1227
1228/*
1229 * Send to the NIC rings packets marked NS_FORWARD between
1230 * kring->nr_hwcur and kring->rhead.
1231 * Called under kring->rx_queue.lock on the sw rx ring.
1232 *
1233 * It can only be called if the user opened all the TX hw rings,
1234 * see NAF_CAN_FORWARD_DOWN flag.
1235 * We can touch the TX netmap rings (slots, head and cur) since
1236 * we are in poll/ioctl system call context, and the application
1237 * is not supposed to touch the ring (using a different thread)
1238 * during the execution of the system call.
1239 */
1240static u_int
1241netmap_sw_to_nic(struct netmap_adapter *na)
1242{
1243	struct netmap_kring *kring = na->rx_rings[na->num_rx_rings];
1244	struct netmap_slot *rxslot = kring->ring->slot;
1245	u_int i, rxcur = kring->nr_hwcur;
1246	u_int const head = kring->rhead;
1247	u_int const src_lim = kring->nkr_num_slots - 1;
1248	u_int sent = 0;
1249
1250	/* scan rings to find space, then fill as much as possible */
1251	for (i = 0; i < na->num_tx_rings; i++) {
1252		struct netmap_kring *kdst = na->tx_rings[i];
1253		struct netmap_ring *rdst = kdst->ring;
1254		u_int const dst_lim = kdst->nkr_num_slots - 1;
1255
1256		/* XXX do we trust ring or kring->rcur,rtail ? */
1257		for (; rxcur != head && !nm_ring_empty(rdst);
1258		     rxcur = nm_next(rxcur, src_lim) ) {
1259			struct netmap_slot *src, *dst, tmp;
1260			u_int dst_head = rdst->head;
1261
1262			src = &rxslot[rxcur];
1263			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1264				continue;
1265
1266			sent++;
1267
1268			dst = &rdst->slot[dst_head];
1269
1270			tmp = *src;
1271
1272			src->buf_idx = dst->buf_idx;
1273			src->flags = NS_BUF_CHANGED;
1274
1275			dst->buf_idx = tmp.buf_idx;
1276			dst->len = tmp.len;
1277			dst->flags = NS_BUF_CHANGED;
1278
1279			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
1280		}
1281		/* if (sent) XXX txsync ? it would be just an optimization */
1282	}
1283	return sent;
1284}
1285
1286
1287/*
1288 * netmap_txsync_to_host() passes packets up. We are called from a
1289 * system call in user process context, and the only contention
1290 * can be among multiple user threads erroneously calling
1291 * this routine concurrently.
1292 */
1293static int
1294netmap_txsync_to_host(struct netmap_kring *kring, int flags)
1295{
1296	struct netmap_adapter *na = kring->na;
1297	u_int const lim = kring->nkr_num_slots - 1;
1298	u_int const head = kring->rhead;
1299	struct mbq q;
1300
1301	/* Take packets from hwcur to head and pass them up.
1302	 * Force hwcur = head since netmap_grab_packets() stops at head
1303	 */
1304	mbq_init(&q);
1305	netmap_grab_packets(kring, &q, 1 /* force */);
1306	nm_prdis("have %d pkts in queue", mbq_len(&q));
1307	kring->nr_hwcur = head;
1308	kring->nr_hwtail = head + lim;
1309	if (kring->nr_hwtail > lim)
1310		kring->nr_hwtail -= lim + 1;
1311
1312	netmap_send_up(na->ifp, &q);
1313	return 0;
1314}
1315
1316
1317/*
1318 * rxsync backend for packets coming from the host stack.
1319 * They have been put in kring->rx_queue by netmap_transmit().
1320 * We protect access to the kring using kring->rx_queue.lock
1321 *
1322 * also moves to the nic hw rings any packet the user has marked
1323 * for transparent-mode forwarding, then sets the NR_FORWARD
1324 * flag in the kring to let the caller push them out
1325 */
1326static int
1327netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
1328{
1329	struct netmap_adapter *na = kring->na;
1330	struct netmap_ring *ring = kring->ring;
1331	u_int nm_i, n;
1332	u_int const lim = kring->nkr_num_slots - 1;
1333	u_int const head = kring->rhead;
1334	int ret = 0;
1335	struct mbq *q = &kring->rx_queue, fq;
1336
1337	mbq_init(&fq); /* fq holds packets to be freed */
1338
1339	mbq_lock(q);
1340
1341	/* First part: import newly received packets */
1342	n = mbq_len(q);
1343	if (n) { /* grab packets from the queue */
1344		struct mbuf *m;
1345		uint32_t stop_i;
1346
1347		nm_i = kring->nr_hwtail;
1348		stop_i = nm_prev(kring->nr_hwcur, lim);
1349		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1350			int len = MBUF_LEN(m);
1351			struct netmap_slot *slot = &ring->slot[nm_i];
1352
1353			m_copydata(m, 0, len, NMB(na, slot));
1354			nm_prdis("nm %d len %d", nm_i, len);
1355			if (netmap_debug & NM_DEBUG_HOST)
1356				nm_prinf("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1357
1358			slot->len = len;
1359			slot->flags = 0;
1360			nm_i = nm_next(nm_i, lim);
1361			mbq_enqueue(&fq, m);
1362		}
1363		kring->nr_hwtail = nm_i;
1364	}
1365
1366	/*
1367	 * Second part: skip past packets that userspace has released.
1368	 */
1369	nm_i = kring->nr_hwcur;
1370	if (nm_i != head) { /* something was released */
1371		if (nm_may_forward_down(kring, flags)) {
1372			ret = netmap_sw_to_nic(na);
1373			if (ret > 0) {
1374				kring->nr_kflags |= NR_FORWARD;
1375				ret = 0;
1376			}
1377		}
1378		kring->nr_hwcur = head;
1379	}
1380
1381	mbq_unlock(q);
1382
1383	mbq_purge(&fq);
1384	mbq_fini(&fq);
1385
1386	return ret;
1387}
1388
1389
1390/* Get a netmap adapter for the port.
1391 *
1392 * If it is possible to satisfy the request, return 0
1393 * with *na containing the netmap adapter found.
1394 * Otherwise return an error code, with *na containing NULL.
1395 *
1396 * When the port is attached to a bridge, we always return
1397 * EBUSY.
1398 * Otherwise, if the port is already bound to a file descriptor,
1399 * then we unconditionally return the existing adapter into *na.
1400 * In all the other cases, we return (into *na) either native,
1401 * generic or NULL, according to the following table:
1402 *
1403 *					native_support
1404 * active_fds   dev.netmap.admode         YES     NO
1405 * -------------------------------------------------------
1406 *    >0              *                 NA(ifp) NA(ifp)
1407 *
1408 *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1409 *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1410 *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1411 *
1412 */
1413static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
1414int
1415netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
1416{
1417	/* generic support */
1418	int i = netmap_admode;	/* Take a snapshot. */
1419	struct netmap_adapter *prev_na;
1420	int error = 0;
1421
1422	*na = NULL; /* default */
1423
1424	/* reset in case of invalid value */
1425	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1426		i = netmap_admode = NETMAP_ADMODE_BEST;
1427
1428	if (NM_NA_VALID(ifp)) {
1429		prev_na = NA(ifp);
1430		/* If an adapter already exists, return it if
1431		 * there are active file descriptors or if
1432		 * netmap is not forced to use generic
1433		 * adapters.
1434		 */
1435		if (NETMAP_OWNED_BY_ANY(prev_na)
1436			|| i != NETMAP_ADMODE_GENERIC
1437			|| prev_na->na_flags & NAF_FORCE_NATIVE
1438#ifdef WITH_PIPES
1439			/* ugly, but we cannot allow an adapter switch
1440			 * if some pipe is referring to this one
1441			 */
1442			|| prev_na->na_next_pipe > 0
1443#endif
1444		) {
1445			*na = prev_na;
1446			goto assign_mem;
1447		}
1448	}
1449
1450	/* If there isn't native support and netmap is not allowed
1451	 * to use generic adapters, we cannot satisfy the request.
1452	 */
1453	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
1454		return EOPNOTSUPP;
1455
1456	/* Otherwise, create a generic adapter and return it,
1457	 * saving the previously used netmap adapter, if any.
1458	 *
1459	 * Note that here 'prev_na', if not NULL, MUST be a
1460	 * native adapter, and CANNOT be a generic one. This is
1461	 * true because generic adapters are created on demand, and
1462	 * destroyed when not used anymore. Therefore, if the adapter
1463	 * currently attached to an interface 'ifp' is generic, it
1464	 * must be that
1465	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1466	 * Consequently, if NA(ifp) is generic, we will enter one of
1467	 * the branches above. This ensures that we never override
1468	 * a generic adapter with another generic adapter.
1469	 */
1470	error = generic_netmap_attach(ifp);
1471	if (error)
1472		return error;
1473
1474	*na = NA(ifp);
1475
1476assign_mem:
1477	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
1478	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
1479		(*na)->nm_mem_prev = (*na)->nm_mem;
1480		(*na)->nm_mem = netmap_mem_get(nmd);
1481	}
1482
1483	return 0;
1484}
1485
1486/*
1487 * MUST BE CALLED UNDER NMG_LOCK()
1488 *
1489 * Get a refcounted reference to a netmap adapter attached
1490 * to the interface specified by req.
1491 * This is always called in the execution of an ioctl().
1492 *
1493 * Return ENXIO if the interface specified by the request does
1494 * not exist, ENOTSUP if netmap is not supported by the interface,
1495 * EBUSY if the interface is already attached to a bridge,
1496 * EINVAL if parameters are invalid, ENOMEM if needed resources
1497 * could not be allocated.
1498 * If successful, hold a reference to the netmap adapter.
1499 *
1500 * If the interface specified by req is a system one, also keep
1501 * a reference to it and return a valid *ifp.
1502 */
1503int
1504netmap_get_na(struct nmreq_header *hdr,
1505	      struct netmap_adapter **na, struct ifnet **ifp,
1506	      struct netmap_mem_d *nmd, int create)
1507{
1508	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1509	int error = 0;
1510	struct netmap_adapter *ret = NULL;
1511	int nmd_ref = 0;
1512
1513	*na = NULL;     /* default return value */
1514	*ifp = NULL;
1515
1516	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1517		return EINVAL;
1518	}
1519
1520	if (req->nr_mode == NR_REG_PIPE_MASTER ||
1521			req->nr_mode == NR_REG_PIPE_SLAVE) {
1522		/* Do not accept deprecated pipe modes. */
1523		nm_prerr("Deprecated pipe nr_mode, use xx{yy or xx}yy syntax");
1524		return EINVAL;
1525	}
1526
1527	NMG_LOCK_ASSERT();
1528
	/* if the request contains a memid, try to find the
1530	 * corresponding memory region
1531	 */
1532	if (nmd == NULL && req->nr_mem_id) {
1533		nmd = netmap_mem_find(req->nr_mem_id);
1534		if (nmd == NULL)
1535			return EINVAL;
		/* keep the reference */
1537		nmd_ref = 1;
1538	}
1539
1540	/* We cascade through all possible types of netmap adapter.
1541	 * All netmap_get_*_na() functions return an error and an na,
1542	 * with the following combinations:
1543	 *
1544	 * error    na
1545	 *   0	   NULL		type doesn't match
1546	 *  !0	   NULL		type matches, but na creation/lookup failed
1547	 *   0	  !NULL		type matches and na created/found
1548	 *  !0    !NULL		impossible
1549	 */
1550	error = netmap_get_null_na(hdr, na, nmd, create);
1551	if (error || *na != NULL)
1552		goto out;
1553
1554	/* try to see if this is a monitor port */
1555	error = netmap_get_monitor_na(hdr, na, nmd, create);
1556	if (error || *na != NULL)
1557		goto out;
1558
1559	/* try to see if this is a pipe port */
1560	error = netmap_get_pipe_na(hdr, na, nmd, create);
1561	if (error || *na != NULL)
1562		goto out;
1563
1564	/* try to see if this is a bridge port */
1565	error = netmap_get_vale_na(hdr, na, nmd, create);
1566	if (error)
1567		goto out;
1568
1569	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1570		goto out;
1571
1572	/*
1573	 * This must be a hardware na, lookup the name in the system.
1574	 * Note that by hardware we actually mean "it shows up in ifconfig".
1575	 * This may still be a tap, a veth/epair, or even a
1576	 * persistent VALE port.
1577	 */
1578	*ifp = ifunit_ref(hdr->nr_name);
1579	if (*ifp == NULL) {
1580		error = ENXIO;
1581		goto out;
1582	}
1583
1584	error = netmap_get_hw_na(*ifp, nmd, &ret);
1585	if (error)
1586		goto out;
1587
1588	*na = ret;
1589	netmap_adapter_get(ret);
1590
1591	/*
	 * if the adapter supports the host rings and it is not already open,
1593	 * try to set the number of host rings as requested by the user
1594	 */
1595	if (((*na)->na_flags & NAF_HOST_RINGS) && (*na)->active_fds == 0) {
1596		if (req->nr_host_tx_rings)
1597			(*na)->num_host_tx_rings = req->nr_host_tx_rings;
1598		if (req->nr_host_rx_rings)
1599			(*na)->num_host_rx_rings = req->nr_host_rx_rings;
1600	}
1601	nm_prdis("%s: host tx %d rx %u", (*na)->name, (*na)->num_host_tx_rings,
1602			(*na)->num_host_rx_rings);
1603
1604out:
1605	if (error) {
1606		if (ret)
1607			netmap_adapter_put(ret);
1608		if (*ifp) {
1609			if_rele(*ifp);
1610			*ifp = NULL;
1611		}
1612	}
1613	if (nmd_ref)
1614		netmap_mem_put(nmd);
1615
1616	return error;
1617}
1618
1619/* undo netmap_get_na() */
1620void
1621netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
1622{
1623	if (ifp)
1624		if_rele(ifp);
1625	if (na)
1626		netmap_adapter_put(na);
1627}
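
/*
 * Informal usage sketch (mirroring the NETMAP_REQ_REGISTER path in
 * netmap_ioctl() below): netmap_get_na() and netmap_unget_na() are
 * typically paired under NMG_LOCK(), and the hdr must carry a
 * NETMAP_REQ_REGISTER body:
 *
 *	NMG_LOCK();
 *	error = netmap_get_na(hdr, &na, &ifp, NULL, 1);
 *	if (error == 0) {
 *		... use na (and ifp, if not NULL) ...
 *		netmap_unget_na(na, ifp);
 *	}
 *	NMG_UNLOCK();
 *
 * Callers that keep the references (as the register path does) are
 * responsible for dropping them later.
 */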
1628
1629
1630#define NM_FAIL_ON(t) do {						\
1631	if (unlikely(t)) {						\
1632		nm_prlim(5, "%s: fail '" #t "' "				\
1633			"h %d c %d t %d "				\
1634			"rh %d rc %d rt %d "				\
1635			"hc %d ht %d",					\
1636			kring->name,					\
1637			head, cur, ring->tail,				\
1638			kring->rhead, kring->rcur, kring->rtail,	\
1639			kring->nr_hwcur, kring->nr_hwtail);		\
1640		return kring->nkr_num_slots;				\
1641	}								\
1642} while (0)
1643
1644/*
1645 * validate parameters on entry for *_txsync()
1646 * Returns ring->head if ok, or something >= kring->nkr_num_slots
1647 * in case of error.
1648 *
1649 * rhead, rcur and rtail=hwtail are stored from previous round.
1650 * hwcur is the next packet to send to the ring.
1651 *
1652 * We want
1653 *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1654 *
1655 * hwcur, rhead, rtail and hwtail are reliable
1656 */
1657u_int
1658nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1659{
1660	u_int head = ring->head; /* read only once */
1661	u_int cur = ring->cur; /* read only once */
1662	u_int n = kring->nkr_num_slots;
1663
1664	nm_prdis(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1665		kring->name,
1666		kring->nr_hwcur, kring->nr_hwtail,
1667		ring->head, ring->cur, ring->tail);
1668#if 1 /* kernel sanity checks; but we can trust the kring. */
1669	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
1670	    kring->rtail >= n ||  kring->nr_hwtail >= n);
1671#endif /* kernel sanity checks */
1672	/*
1673	 * user sanity checks. We only use head,
1674	 * A, B, ... are possible positions for head:
1675	 *
1676	 *  0    A  rhead   B  rtail   C  n-1
1677	 *  0    D  rtail   E  rhead   F  n-1
1678	 *
1679	 * B, F, D are valid. A, C, E are wrong
1680	 */
1681	if (kring->rtail >= kring->rhead) {
1682		/* want rhead <= head <= rtail */
1683		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
1684		/* and also head <= cur <= rtail */
1685		NM_FAIL_ON(cur < head || cur > kring->rtail);
1686	} else { /* here rtail < rhead */
1687		/* we need head outside rtail .. rhead */
1688		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
1689
1690		/* two cases now: head <= rtail or head >= rhead  */
1691		if (head <= kring->rtail) {
1692			/* want head <= cur <= rtail */
1693			NM_FAIL_ON(cur < head || cur > kring->rtail);
1694		} else { /* head >= rhead */
1695			/* cur must be outside rtail..head */
1696			NM_FAIL_ON(cur > kring->rtail && cur < head);
1697		}
1698	}
1699	if (ring->tail != kring->rtail) {
1700		nm_prlim(5, "%s tail overwritten was %d need %d", kring->name,
1701			ring->tail, kring->rtail);
1702		ring->tail = kring->rtail;
1703	}
1704	kring->rhead = head;
1705	kring->rcur = cur;
1706	return head;
1707}
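
/*
 * Worked example for the checks above (illustrative numbers): with
 * nkr_num_slots = 1024, rhead = 1000 and rtail = 200 the valid region
 * wraps past the end of the ring, so head may be anywhere in
 * 1000..1023 or 0..200, while e.g. head = 500 falls inside
 * rtail..rhead and makes NM_FAIL_ON() fire. cur is then constrained
 * to lie between head and rtail along the same (wrapped) interval.
 */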
1708
1709
1710/*
1711 * validate parameters on entry for *_rxsync()
1712 * Returns ring->head if ok, kring->nkr_num_slots on error.
1713 *
1714 * For a valid configuration,
1715 * hwcur <= head <= cur <= tail <= hwtail
1716 *
1717 * We only consider head and cur.
1718 * hwcur and hwtail are reliable.
1719 *
1720 */
1721u_int
1722nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1723{
1724	uint32_t const n = kring->nkr_num_slots;
1725	uint32_t head, cur;
1726
1727	nm_prdis(5,"%s kc %d kt %d h %d c %d t %d",
1728		kring->name,
1729		kring->nr_hwcur, kring->nr_hwtail,
1730		ring->head, ring->cur, ring->tail);
1731	/*
1732	 * Before storing the new values, we should check they do not
1733	 * move backwards. However:
1734	 * - head is not an issue because the previous value is hwcur;
1735	 * - cur could in principle go back, however it does not matter
1736	 *   because we are processing a brand new rxsync()
1737	 */
1738	cur = kring->rcur = ring->cur;	/* read only once */
1739	head = kring->rhead = ring->head;	/* read only once */
1740#if 1 /* kernel sanity checks */
1741	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
1742#endif /* kernel sanity checks */
1743	/* user sanity checks */
1744	if (kring->nr_hwtail >= kring->nr_hwcur) {
1745		/* want hwcur <= rhead <= hwtail */
1746		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
1747		/* and also rhead <= rcur <= hwtail */
1748		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1749	} else {
1750		/* we need rhead outside hwtail..hwcur */
1751		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
1752		/* two cases now: head <= hwtail or head >= hwcur  */
1753		if (head <= kring->nr_hwtail) {
1754			/* want head <= cur <= hwtail */
1755			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1756		} else {
1757			/* cur must be outside hwtail..head */
1758			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
1759		}
1760	}
1761	if (ring->tail != kring->rtail) {
1762		nm_prlim(5, "%s tail overwritten was %d need %d",
1763			kring->name,
1764			ring->tail, kring->rtail);
1765		ring->tail = kring->rtail;
1766	}
1767	return head;
1768}
1769
1770
1771/*
1772 * Error routine called when txsync/rxsync detects an error.
1773 * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1774 * Return 1 on reinit.
1775 *
1776 * This routine is only called by the upper half of the kernel.
1777 * It only reads hwcur (which is changed only by the upper half, too)
1778 * and hwtail (which may be changed by the lower half, but only on
1779 * a tx ring and only to increase it, so any error will be recovered
1780 * on the next call). For the above, we don't strictly need to call
1781 * it under lock.
1782 */
1783int
1784netmap_ring_reinit(struct netmap_kring *kring)
1785{
1786	struct netmap_ring *ring = kring->ring;
1787	u_int i, lim = kring->nkr_num_slots - 1;
1788	int errors = 0;
1789
1790	// XXX KASSERT nm_kr_tryget
1791	nm_prlim(10, "called for %s", kring->name);
1792	// XXX probably wrong to trust userspace
1793	kring->rhead = ring->head;
1794	kring->rcur  = ring->cur;
1795	kring->rtail = ring->tail;
1796
1797	if (ring->cur > lim)
1798		errors++;
1799	if (ring->head > lim)
1800		errors++;
1801	if (ring->tail > lim)
1802		errors++;
1803	for (i = 0; i <= lim; i++) {
1804		u_int idx = ring->slot[i].buf_idx;
1805		u_int len = ring->slot[i].len;
1806		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1807			nm_prlim(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1808			ring->slot[i].buf_idx = 0;
1809			ring->slot[i].len = 0;
1810		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1811			ring->slot[i].len = 0;
1812			nm_prlim(5, "bad len at slot %d idx %d len %d", i, idx, len);
1813		}
1814	}
1815	if (errors) {
1816		nm_prlim(10, "total %d errors", errors);
1817		nm_prlim(10, "%s reinit, cur %d -> %d tail %d -> %d",
1818			kring->name,
1819			ring->cur, kring->nr_hwcur,
1820			ring->tail, kring->nr_hwtail);
1821		ring->head = kring->rhead = kring->nr_hwcur;
1822		ring->cur  = kring->rcur  = kring->nr_hwcur;
1823		ring->tail = kring->rtail = kring->nr_hwtail;
1824	}
1825	return (errors ? 1 : 0);
1826}
1827
1828/* interpret the ringid and flags fields of an nmreq, by translating them
1829 * into a pair of intervals of ring indices:
1830 *
1831 * [priv->np_txqfirst, priv->np_txqlast) and
1832 * [priv->np_rxqfirst, priv->np_rxqlast)
1833 *
1834 */
1835int
1836netmap_interp_ringid(struct netmap_priv_d *priv, struct nmreq_header *hdr)
1837{
1838	struct netmap_adapter *na = priv->np_na;
1839	struct nmreq_register *reg = (struct nmreq_register *)hdr->nr_body;
1840	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
1841	enum txrx t;
1842	u_int j;
1843	u_int nr_flags = reg->nr_flags, nr_mode = reg->nr_mode,
1844	      nr_ringid = reg->nr_ringid;
1845
1846	for_rx_tx(t) {
1847		if (nr_flags & excluded_direction[t]) {
1848			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1849			continue;
1850		}
1851		switch (nr_mode) {
1852		case NR_REG_ALL_NIC:
1853		case NR_REG_NULL:
1854			priv->np_qfirst[t] = 0;
1855			priv->np_qlast[t] = nma_get_nrings(na, t);
1856			nm_prdis("ALL/PIPE: %s %d %d", nm_txrx2str(t),
1857				priv->np_qfirst[t], priv->np_qlast[t]);
1858			break;
1859		case NR_REG_SW:
1860		case NR_REG_NIC_SW:
1861			if (!(na->na_flags & NAF_HOST_RINGS)) {
1862				nm_prerr("host rings not supported");
1863				return EINVAL;
1864			}
1865			priv->np_qfirst[t] = (nr_mode == NR_REG_SW ?
1866				nma_get_nrings(na, t) : 0);
1867			priv->np_qlast[t] = netmap_all_rings(na, t);
1868			nm_prdis("%s: %s %d %d", nr_mode == NR_REG_SW ? "SW" : "NIC+SW",
1869				nm_txrx2str(t),
1870				priv->np_qfirst[t], priv->np_qlast[t]);
1871			break;
1872		case NR_REG_ONE_NIC:
1873			if (nr_ringid >= na->num_tx_rings &&
1874					nr_ringid >= na->num_rx_rings) {
1875				nm_prerr("invalid ring id %d", nr_ringid);
1876				return EINVAL;
1877			}
1878			/* if not enough rings, use the first one */
1879			j = nr_ringid;
1880			if (j >= nma_get_nrings(na, t))
1881				j = 0;
1882			priv->np_qfirst[t] = j;
1883			priv->np_qlast[t] = j + 1;
1884			nm_prdis("ONE_NIC: %s %d %d", nm_txrx2str(t),
1885				priv->np_qfirst[t], priv->np_qlast[t]);
1886			break;
1887		case NR_REG_ONE_SW:
1888			if (!(na->na_flags & NAF_HOST_RINGS)) {
1889				nm_prerr("host rings not supported");
1890				return EINVAL;
1891			}
1892			if (nr_ringid >= na->num_host_tx_rings &&
1893					nr_ringid >= na->num_host_rx_rings) {
1894				nm_prerr("invalid ring id %d", nr_ringid);
1895				return EINVAL;
1896			}
1897			/* if not enough rings, use the first one */
1898			j = nr_ringid;
1899			if (j >= nma_get_host_nrings(na, t))
1900				j = 0;
1901			priv->np_qfirst[t] = nma_get_nrings(na, t) + j;
1902			priv->np_qlast[t] = nma_get_nrings(na, t) + j + 1;
1903			nm_prdis("ONE_SW: %s %d %d", nm_txrx2str(t),
1904				priv->np_qfirst[t], priv->np_qlast[t]);
1905			break;
1906		default:
1907			nm_prerr("invalid regif type %d", nr_mode);
1908			return EINVAL;
1909		}
1910	}
1911	priv->np_flags = nr_flags;
1912
1913	/* Allow transparent forwarding mode in the host --> nic
1914	 * direction only if all the TX hw rings have been opened. */
1915	if (priv->np_qfirst[NR_TX] == 0 &&
1916			priv->np_qlast[NR_TX] >= na->num_tx_rings) {
1917		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
1918	}
1919
1920	if (netmap_verbose) {
1921		nm_prinf("%s: tx [%d,%d) rx [%d,%d) id %d",
1922			na->name,
1923			priv->np_qfirst[NR_TX],
1924			priv->np_qlast[NR_TX],
1925			priv->np_qfirst[NR_RX],
1926			priv->np_qlast[NR_RX],
1927			nr_ringid);
1928	}
1929	return 0;
1930}
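
/*
 * Examples of the mapping above, assuming an adapter with 4 hardware
 * rings and 1 host ring per direction:
 *
 *	NR_REG_ALL_NIC:			[0, 4)	(all hw rings)
 *	NR_REG_ONE_NIC, nr_ringid 2:	[2, 3)	(a single hw ring)
 *	NR_REG_SW:			[4, 5)	(host rings only)
 *	NR_REG_NIC_SW:			[0, 5)	(hw + host rings)
 */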
1931
1932
1933/*
1934 * Set the ring ID. For devices with a single queue, a request
1935 * for all rings is the same as a single ring.
1936 */
1937static int
1938netmap_set_ringid(struct netmap_priv_d *priv, struct nmreq_header *hdr)
1939{
1940	struct netmap_adapter *na = priv->np_na;
1941	struct nmreq_register *reg = (struct nmreq_register *)hdr->nr_body;
1942	int error;
1943	enum txrx t;
1944
1945	error = netmap_interp_ringid(priv, hdr);
1946	if (error) {
1947		return error;
1948	}
1949
1950	priv->np_txpoll = (reg->nr_flags & NR_NO_TX_POLL) ? 0 : 1;
1951
1952	/* optimization: count the users registered for more than
1953	 * one ring, which are the ones sleeping on the global queue.
1954	 * The default netmap_notify() callback will then
1955	 * avoid signaling the global queue if nobody is using it
1956	 */
1957	for_rx_tx(t) {
1958		if (nm_si_user(priv, t))
1959			na->si_users[t]++;
1960	}
1961	return 0;
1962}
1963
1964static void
1965netmap_unset_ringid(struct netmap_priv_d *priv)
1966{
1967	struct netmap_adapter *na = priv->np_na;
1968	enum txrx t;
1969
1970	for_rx_tx(t) {
1971		if (nm_si_user(priv, t))
1972			na->si_users[t]--;
1973		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1974	}
1975	priv->np_flags = 0;
1976	priv->np_txpoll = 0;
1977	priv->np_kloop_state = 0;
1978}
1979
1980#define within_sel(p_, t_, i_)					  	  \
1981	((i_) < (p_)->np_qlast[(t_)])
1982#define nonempty_sel(p_, t_)						  \
1983	(within_sel((p_), (t_), (p_)->np_qfirst[(t_)]))
1984#define foreach_selected_ring(p_, t_, i_, kring_)			  \
1985	for ((t_) = nonempty_sel((p_), NR_RX) ? NR_RX : NR_TX,		  \
1986	     (i_) = (p_)->np_qfirst[(t_)];				  \
1987	     (t_ == NR_RX ||						  \
1988	      (t == NR_TX && within_sel((p_), (t_), (i_)))) &&     	  \
1989	      ((kring_) = NMR((p_)->np_na, (t_))[(i_)]); 		  \
1990	     (i_) = within_sel((p_), (t_), (i_) + 1) ? (i_) + 1 :         \
1991		(++(t_) < NR_TXRX ? (p_)->np_qfirst[(t_)] : (i_)))
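
/*
 * The macro above visits all the selected RX krings first and then
 * all the selected TX krings. For example, with rx bound to [0, 2)
 * and tx bound to [1, 3), the visit order is rx0, rx1, tx1, tx2; an
 * empty RX (or TX) selection is simply skipped.
 */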
1992
1993
1994/* Set the nr_pending_mode for the requested rings.
1995 * If requested, also try to get exclusive access to the rings, provided
1996 * the rings we want to bind are not exclusively owned by a previous bind.
1997 */
1998static int
1999netmap_krings_get(struct netmap_priv_d *priv)
2000{
2001	struct netmap_adapter *na = priv->np_na;
2002	u_int i;
2003	struct netmap_kring *kring;
2004	int excl = (priv->np_flags & NR_EXCLUSIVE);
2005	enum txrx t;
2006
2007	if (netmap_debug & NM_DEBUG_ON)
2008		nm_prinf("%s: grabbing tx [%d, %d) rx [%d, %d)",
2009			na->name,
2010			priv->np_qfirst[NR_TX],
2011			priv->np_qlast[NR_TX],
2012			priv->np_qfirst[NR_RX],
2013			priv->np_qlast[NR_RX]);
2014
2015	/* first round: check that none of the requested rings
2016	 * is already exclusively owned, and that we are not asking for
2017	 * exclusive ownership of rings that are already in use
2018	 */
2019	foreach_selected_ring(priv, t, i, kring) {
2020		if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
2021		    (kring->users && excl))
2022		{
2023			nm_prdis("ring %s busy", kring->name);
2024			return EBUSY;
2025		}
2026	}
2027
2028	/* second round: increment usage count (possibly marking them
2029	 * as exclusive) and set the nr_pending_mode
2030	 */
2031	foreach_selected_ring(priv, t, i, kring) {
2032		kring->users++;
2033		if (excl)
2034			kring->nr_kflags |= NKR_EXCLUSIVE;
2035		kring->nr_pending_mode = NKR_NETMAP_ON;
2036	}
2037
2038	return 0;
2039
2040}
2041
2042/* Undo netmap_krings_get(). This is done by clearing the exclusive mode
2043 * if it was requested at regif time, and by unsetting nr_pending_mode
2044 * if we are the last users of the involved rings. */
2045static void
2046netmap_krings_put(struct netmap_priv_d *priv)
2047{
2048	u_int i;
2049	struct netmap_kring *kring;
2050	int excl = (priv->np_flags & NR_EXCLUSIVE);
2051	enum txrx t;
2052
2053	nm_prdis("%s: releasing tx [%d, %d) rx [%d, %d)",
2054			priv->np_na->name,
2055			priv->np_qfirst[NR_TX],
2056			priv->np_qlast[NR_TX],
2057			priv->np_qfirst[NR_RX],
2058			priv->np_qlast[NR_RX]);
2059
2060	foreach_selected_ring(priv, t, i, kring) {
2061		if (excl)
2062			kring->nr_kflags &= ~NKR_EXCLUSIVE;
2063		kring->users--;
2064		if (kring->users == 0)
2065			kring->nr_pending_mode = NKR_NETMAP_OFF;
2066	}
2067}
2068
2069static int
2070nm_priv_rx_enabled(struct netmap_priv_d *priv)
2071{
2072	return (priv->np_qfirst[NR_RX] != priv->np_qlast[NR_RX]);
2073}
2074
2075/* Validate the CSB entries for both directions (atok and ktoa).
2076 * To be called under NMG_LOCK(). */
2077static int
2078netmap_csb_validate(struct netmap_priv_d *priv, struct nmreq_opt_csb *csbo)
2079{
2080	struct nm_csb_atok *csb_atok_base =
2081		(struct nm_csb_atok *)(uintptr_t)csbo->csb_atok;
2082	struct nm_csb_ktoa *csb_ktoa_base =
2083		(struct nm_csb_ktoa *)(uintptr_t)csbo->csb_ktoa;
2084	enum txrx t;
2085	int num_rings[NR_TXRX], tot_rings;
2086	size_t entry_size[2];
2087	void *csb_start[2];
2088	int i;
2089
2090	if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
2091		nm_prerr("Cannot update CSB while kloop is running");
2092		return EBUSY;
2093	}
2094
2095	tot_rings = 0;
2096	for_rx_tx(t) {
2097		num_rings[t] = priv->np_qlast[t] - priv->np_qfirst[t];
2098		tot_rings += num_rings[t];
2099	}
2100	if (tot_rings <= 0)
2101		return 0;
2102
2103	if (!(priv->np_flags & NR_EXCLUSIVE)) {
2104		nm_prerr("CSB mode requires NR_EXCLUSIVE");
2105		return EINVAL;
2106	}
2107
2108	entry_size[0] = sizeof(*csb_atok_base);
2109	entry_size[1] = sizeof(*csb_ktoa_base);
2110	csb_start[0] = (void *)csb_atok_base;
2111	csb_start[1] = (void *)csb_ktoa_base;
2112
2113	for (i = 0; i < 2; i++) {
2114		/* On Linux we could use access_ok() to simplify
2115		 * the validation. However, the advantage of
2116		 * this approach is that it works also on
2117		 * FreeBSD. */
2118		size_t csb_size = tot_rings * entry_size[i];
2119		void *tmp;
2120		int err;
2121
2122		if ((uintptr_t)csb_start[i] & (entry_size[i]-1)) {
2123			nm_prerr("Unaligned CSB address");
2124			return EINVAL;
2125		}
2126
2127		tmp = nm_os_malloc(csb_size);
2128		if (!tmp)
2129			return ENOMEM;
2130		if (i == 0) {
2131			/* Application --> kernel direction. */
2132			err = copyin(csb_start[i], tmp, csb_size);
2133		} else {
2134			/* Kernel --> application direction. */
2135			memset(tmp, 0, csb_size);
2136			err = copyout(tmp, csb_start[i], csb_size);
2137		}
2138		nm_os_free(tmp);
2139		if (err) {
2140			nm_prerr("Invalid CSB address");
2141			return err;
2142		}
2143	}
2144
2145	priv->np_csb_atok_base = csb_atok_base;
2146	priv->np_csb_ktoa_base = csb_ktoa_base;
2147
2148	/* Initialize the CSB. */
2149	for_rx_tx(t) {
2150		for (i = 0; i < num_rings[t]; i++) {
2151			struct netmap_kring *kring =
2152				NMR(priv->np_na, t)[i + priv->np_qfirst[t]];
2153			struct nm_csb_atok *csb_atok = csb_atok_base + i;
2154			struct nm_csb_ktoa *csb_ktoa = csb_ktoa_base + i;
2155
2156			if (t == NR_RX) {
2157				csb_atok += num_rings[NR_TX];
2158				csb_ktoa += num_rings[NR_TX];
2159			}
2160
2161			CSB_WRITE(csb_atok, head, kring->rhead);
2162			CSB_WRITE(csb_atok, cur, kring->rcur);
2163			CSB_WRITE(csb_atok, appl_need_kick, 1);
2164			CSB_WRITE(csb_atok, sync_flags, 1);
2165			CSB_WRITE(csb_ktoa, hwcur, kring->nr_hwcur);
2166			CSB_WRITE(csb_ktoa, hwtail, kring->nr_hwtail);
2167			CSB_WRITE(csb_ktoa, kern_need_kick, 1);
2168
2169			nm_prinf("csb_init for kring %s: head %u, cur %u, "
2170				"hwcur %u, hwtail %u", kring->name,
2171				kring->rhead, kring->rcur, kring->nr_hwcur,
2172				kring->nr_hwtail);
2173		}
2174	}
2175
2176	return 0;
2177}
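
/*
 * Layout of the CSB arrays initialized above (illustrative): both the
 * atok and the ktoa arrays contain one entry per bound ring, with the
 * TX rings first and the RX rings after them. For instance, when an
 * fd is bound to two TX and two RX rings, entries 0..1 refer to the
 * TX krings and entries 2..3 refer to the RX krings.
 */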
2178
2179/* Ensure that the netmap adapter can support the given MTU.
2180 * @return EINVAL if the na cannot be set to mtu, 0 otherwise.
2181 */
2182int
2183netmap_buf_size_validate(const struct netmap_adapter *na, unsigned mtu)
{
2184	unsigned nbs = NETMAP_BUF_SIZE(na);
2185
2186	if (mtu <= na->rx_buf_maxsize) {
2187		/* The MTU fits a single NIC slot. We only
2188		 * need to check that netmap buffers are
2189		 * large enough to hold an MTU. NS_MOREFRAG
2190		 * cannot be used in this case. */
2191		if (nbs < mtu) {
2192			nm_prerr("error: netmap buf size (%u) "
2193				 "< device MTU (%u)", nbs, mtu);
2194			return EINVAL;
2195		}
2196	} else {
2197		/* More NIC slots may be needed to receive
2198		 * or transmit a single packet. Check that
2199		 * the adapter supports NS_MOREFRAG and that
2200		 * netmap buffers are large enough to hold
2201		 * the maximum per-slot size. */
2202		if (!(na->na_flags & NAF_MOREFRAG)) {
2203			nm_prerr("error: large MTU (%d) needed "
2204				 "but %s does not support "
2205				 "NS_MOREFRAG", mtu,
2206				 na->ifp->if_xname);
2207			return EINVAL;
2208		} else if (nbs < na->rx_buf_maxsize) {
2209			nm_prerr("error: using NS_MOREFRAG on "
2210				 "%s requires netmap buf size "
2211				 ">= %u", na->ifp->if_xname,
2212				 na->rx_buf_maxsize);
2213			return EINVAL;
2214		} else {
2215			nm_prinf("info: netmap application on "
2216				 "%s needs to support "
2217				 "NS_MOREFRAG "
2218				 "(MTU=%u,netmap_buf_size=%u)",
2219				 na->ifp->if_xname, mtu, nbs);
2220		}
2221	}
2222	return 0;
2223}
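
/*
 * Numerical example for the checks above, assuming 2048-byte netmap
 * buffers: an MTU of 1500 typically fits a single slot
 * (mtu <= rx_buf_maxsize) and is accepted as long as the netmap
 * buffers are at least MTU-sized; an MTU of 9000 on a NIC whose
 * rx_buf_maxsize is smaller than that requires the adapter to
 * advertise NAF_MOREFRAG (and the application to handle NS_MOREFRAG
 * slots), otherwise the bind fails with EINVAL.
 */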
2224
2225
2226/*
2227 * possibly move the interface to netmap-mode.
2228 * On success it returns a pointer to the netmap_if, otherwise NULL.
2229 * This must be called with NMG_LOCK held.
2230 *
2231 * The following na callbacks are called in the process:
2232 *
2233 * na->nm_config()			[by netmap_update_config]
2234 * (get current number and size of rings)
2235 *
2236 *  	We have a generic one for linux (netmap_linux_config).
2237 *  	The bwrap has to override this, since it has to forward
2238 *  	the request to the wrapped adapter (netmap_bwrap_config).
2239 *
2240 *
2241 * na->nm_krings_create()
2242 * (create and init the krings array)
2243 *
2244 * 	One of the following:
2245 *
2246 *	* netmap_hw_krings_create, 			(hw ports)
2247 *		creates the standard layout for the krings
2248 * 		and adds the mbq (used for the host rings).
2249 *
2250 * 	* netmap_vp_krings_create			(VALE ports)
2251 * 		add leases and scratchpads
2252 *
2253 * 	* netmap_pipe_krings_create			(pipes)
2254 * 		create the krings and rings of both ends and
2255 * 		cross-link them
2256 *
2257 *      * netmap_monitor_krings_create 			(monitors)
2258 *      	avoid allocating the mbq
2259 *
2260 *      * netmap_bwrap_krings_create			(bwraps)
2261 *      	create both the bwrap krings array,
2262 *      	the krings array of the wrapped adapter, and
2263 *      	(if needed) the fake array for the host adapter
2264 *
2265 * na->nm_register(, 1)
2266 * (put the adapter in netmap mode)
2267 *
2268 * 	This may be one of the following:
2269 *
2270 * 	* netmap_hw_reg				        (hw ports)
2271 * 		checks that the ifp is still there, then calls
2272 * 		the hardware specific callback;
2273 *
2274 * 	* netmap_vp_reg					(VALE ports)
2275 *		If the port is connected to a bridge,
2276 *		set the NAF_NETMAP_ON flag under the
2277 *		bridge write lock.
2278 *
2279 *	* netmap_pipe_reg				(pipes)
2280 *		inform the other pipe end that it is no
2281 *		longer responsible for the lifetime of this
2282 *		pipe end
2283 *
2284 *	* netmap_monitor_reg				(monitors)
2285 *		intercept the sync callbacks of the monitored
2286 *		rings
2287 *
2288 *	* netmap_bwrap_reg				(bwraps)
2289 *		cross-link the bwrap and hwna rings,
2290 *		forward the request to the hwna, override
2291 *		the hwna notify callback (to get the frames
2292 *		coming from outside go through the bridge).
2293 *
2294 *
2295 */
2296int
2297netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2298	struct nmreq_header *hdr)
2299{
2300	struct netmap_if *nifp = NULL;
2301	int error;
2302
2303	NMG_LOCK_ASSERT();
2304	priv->np_na = na;     /* store the reference */
2305	error = netmap_mem_finalize(na->nm_mem, na);
2306	if (error)
2307		goto err;
2308
2309	if (na->active_fds == 0) {
2310
2311		/* cache the allocator info in the na */
2312		error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2313		if (error)
2314			goto err_drop_mem;
2315		nm_prdis("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2316					    na->na_lut.objsize);
2317
2318		/* ring configuration may have changed, fetch from the card */
2319		netmap_update_config(na);
2320	}
2321
2322	/* compute the range of tx and rx rings to monitor */
2323	error = netmap_set_ringid(priv, hdr);
2324	if (error)
2325		goto err_put_lut;
2326
2327	if (na->active_fds == 0) {
2328		/*
2329		 * If this is the first registration of the adapter,
2330		 * perform sanity checks and create the in-kernel view
2331		 * of the netmap rings (the netmap krings).
2332		 */
2333		if (na->ifp && nm_priv_rx_enabled(priv)) {
2334			/* This netmap adapter is attached to an ifnet. */
2335			unsigned mtu = nm_os_ifnet_mtu(na->ifp);
2336
2337			nm_prdis("%s: mtu %d rx_buf_maxsize %d netmap_buf_size %d",
2338				na->name, mtu, na->rx_buf_maxsize, NETMAP_BUF_SIZE(na));
2339
2340			if (na->rx_buf_maxsize == 0) {
2341				nm_prerr("%s: error: rx_buf_maxsize == 0", na->name);
2342				error = EIO;
2343				goto err_drop_mem;
2344			}
2345
2346			error = netmap_buf_size_validate(na, mtu);
2347			if (error)
2348				goto err_drop_mem;
2349		}
2350
2351		/*
2352		 * Depending on the adapter, this may also create
2353		 * the netmap rings themselves
2354		 */
2355		error = na->nm_krings_create(na);
2356		if (error)
2357			goto err_put_lut;
2358
2359	}
2360
2361	/* now the krings must exist and we can check whether some
2362	 * previous bind has exclusive ownership on them, and set
2363	 * nr_pending_mode
2364	 */
2365	error = netmap_krings_get(priv);
2366	if (error)
2367		goto err_del_krings;
2368
2369	/* create all needed missing netmap rings */
2370	error = netmap_mem_rings_create(na);
2371	if (error)
2372		goto err_rel_excl;
2373
2374	/* in all cases, create a new netmap if */
2375	nifp = netmap_mem_if_new(na, priv);
2376	if (nifp == NULL) {
2377		error = ENOMEM;
2378		goto err_rel_excl;
2379	}
2380
2381	if (nm_kring_pending(priv)) {
2382		/* Some kring is switching mode, tell the adapter to
2383		 * react on this. */
2384		error = na->nm_register(na, 1);
2385		if (error)
2386			goto err_del_if;
2387	}
2388
2389	/* Commit the reference. */
2390	na->active_fds++;
2391
2392	/*
2393	 * advertise that the interface is ready by setting np_nifp.
2394	 * The barrier is needed because readers (poll, *SYNC and mmap)
2395	 * check for priv->np_nifp != NULL without locking
2396	 */
2397	mb(); /* make sure previous writes are visible to all CPUs */
2398	priv->np_nifp = nifp;
2399
2400	return 0;
2401
2402err_del_if:
2403	netmap_mem_if_delete(na, nifp);
2404err_rel_excl:
2405	netmap_krings_put(priv);
2406	netmap_mem_rings_delete(na);
2407err_del_krings:
2408	if (na->active_fds == 0)
2409		na->nm_krings_delete(na);
2410err_put_lut:
2411	if (na->active_fds == 0)
2412		memset(&na->na_lut, 0, sizeof(na->na_lut));
2413err_drop_mem:
2414	netmap_mem_drop(na);
2415err:
2416	priv->np_na = NULL;
2417	return error;
2418}
2419
2420
2421/*
2422 * update kring and ring at the end of rxsync/txsync.
2423 */
2424static inline void
2425nm_sync_finalize(struct netmap_kring *kring)
2426{
2427	/*
2428	 * Update ring tail to what the kernel knows
2429	 * After txsync: head/rhead/hwcur might be behind cur/rcur
2430	 * if no carrier.
2431	 */
2432	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2433
2434	nm_prdis(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2435		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2436		kring->rhead, kring->rcur, kring->rtail);
2437}
2438
2439/* set ring timestamp */
2440static inline void
2441ring_timestamp_set(struct netmap_ring *ring)
2442{
2443	if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2444		microtime(&ring->ts);
2445	}
2446}
2447
2448static int nmreq_copyin(struct nmreq_header *, int);
2449static int nmreq_copyout(struct nmreq_header *, int);
2450static int nmreq_checkoptions(struct nmreq_header *);
2451
2452/*
2453 * ioctl(2) support for the "netmap" device.
2454 *
2455 * The following is the list of accepted commands:
2456 * - NIOCCTRL		device control API
2457 * - NIOCTXSYNC		sync TX rings
2458 * - NIOCRXSYNC		sync RX rings
2459 * - SIOCGIFADDR	just for convenience
2460 * - NIOCGINFO		deprecated (legacy API)
2461 * - NIOCREGIF		deprecated (legacy API)
2462 *
2463 * Return 0 on success, errno otherwise.
2464 */
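/*
 * For reference, a rough sketch of how a userspace program typically
 * drives the NIOCCTRL path below to register a port ("em0" is just an
 * example name; error handling omitted, see netmap(4) for the
 * authoritative description):
 *
 *	struct nmreq_header hdr;
 *	struct nmreq_register reg;
 *
 *	memset(&hdr, 0, sizeof(hdr));
 *	memset(&reg, 0, sizeof(reg));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_REGISTER;
 *	strlcpy(hdr.nr_name, "em0", sizeof(hdr.nr_name));
 *	hdr.nr_body = (uintptr_t)&reg;
 *	reg.nr_mode = NR_REG_ALL_NIC;
 *
 *	fd = open("/dev/netmap", O_RDWR);
 *	ioctl(fd, NIOCCTRL, &hdr);		(NETMAP_REQ_REGISTER below)
 *	mem = mmap(NULL, reg.nr_memsize, PROT_READ | PROT_WRITE,
 *		MAP_SHARED, fd, 0);
 *	nifp = NETMAP_IF(mem, reg.nr_offset);
 *	...
 *	ioctl(fd, NIOCTXSYNC, NULL);		(or poll() on fd)
 */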
2465int
2466netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
2467		struct thread *td, int nr_body_is_user)
2468{
2469	struct mbq q;	/* packets from RX hw queues to host stack */
2470	struct netmap_adapter *na = NULL;
2471	struct netmap_mem_d *nmd = NULL;
2472	struct ifnet *ifp = NULL;
2473	int error = 0;
2474	u_int i, qfirst, qlast;
2475	struct netmap_kring **krings;
2476	int sync_flags;
2477	enum txrx t;
2478
2479	switch (cmd) {
2480	case NIOCCTRL: {
2481		struct nmreq_header *hdr = (struct nmreq_header *)data;
2482
2483		if (hdr->nr_version < NETMAP_MIN_API ||
2484		    hdr->nr_version > NETMAP_MAX_API) {
2485			nm_prerr("API mismatch: got %d need %d",
2486				hdr->nr_version, NETMAP_API);
2487			return EINVAL;
2488		}
2489
2490		/* Make a kernel-space copy of the user-space nr_body.
2491		 * For convenience, the nr_body pointer and the pointers
2492		 * in the options list will be replaced with their
2493		 * kernel-space counterparts. The original pointers are
2494		 * saved internally and later restored by nmreq_copyout
2495		 */
2496		error = nmreq_copyin(hdr, nr_body_is_user);
2497		if (error) {
2498			return error;
2499		}
2500
2501		/* Sanitize hdr->nr_name. */
2502		hdr->nr_name[sizeof(hdr->nr_name) - 1] = '\0';
2503
2504		switch (hdr->nr_reqtype) {
2505		case NETMAP_REQ_REGISTER: {
2506			struct nmreq_register *req =
2507				(struct nmreq_register *)(uintptr_t)hdr->nr_body;
2508			struct netmap_if *nifp;
2509
2510			/* Protect access to priv from concurrent requests. */
2511			NMG_LOCK();
2512			do {
2513				struct nmreq_option *opt;
2514				u_int memflags;
2515
2516				if (priv->np_nifp != NULL) {	/* thread already registered */
2517					error = EBUSY;
2518					break;
2519				}
2520
2521#ifdef WITH_EXTMEM
2522				opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_EXTMEM);
2523				if (opt != NULL) {
2524					struct nmreq_opt_extmem *e =
2525						(struct nmreq_opt_extmem *)opt;
2526
2527					nmd = netmap_mem_ext_create(e->nro_usrptr,
2528							&e->nro_info, &error);
2529					opt->nro_status = error;
2530					if (nmd == NULL)
2531						break;
2532				}
2533#endif /* WITH_EXTMEM */
2534
2535				if (nmd == NULL && req->nr_mem_id) {
2536					/* find the allocator and get a reference */
2537					nmd = netmap_mem_find(req->nr_mem_id);
2538					if (nmd == NULL) {
2539						if (netmap_verbose) {
2540							nm_prerr("%s: failed to find mem_id %u",
2541									hdr->nr_name, req->nr_mem_id);
2542						}
2543						error = EINVAL;
2544						break;
2545					}
2546				}
2547				/* find the interface and a reference */
2548				error = netmap_get_na(hdr, &na, &ifp, nmd,
2549						      1 /* create */); /* keep reference */
2550				if (error)
2551					break;
2552				if (NETMAP_OWNED_BY_KERN(na)) {
2553					error = EBUSY;
2554					break;
2555				}
2556
2557				if (na->virt_hdr_len && !(req->nr_flags & NR_ACCEPT_VNET_HDR)) {
2558					nm_prerr("virt_hdr_len=%d, but application does "
2559						"not accept it", na->virt_hdr_len);
2560					error = EIO;
2561					break;
2562				}
2563
2564				error = netmap_do_regif(priv, na, hdr);
2565				if (error) {    /* reg. failed, release priv and ref */
2566					break;
2567				}
2568
2569				opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
2570				if (opt != NULL) {
2571					struct nmreq_opt_csb *csbo =
2572						(struct nmreq_opt_csb *)opt;
2573					error = netmap_csb_validate(priv, csbo);
2574					opt->nro_status = error;
2575					if (error) {
2576						netmap_do_unregif(priv);
2577						break;
2578					}
2579				}
2580
2581				nifp = priv->np_nifp;
2582
2583				/* return the offset of the netmap_if object */
2584				req->nr_rx_rings = na->num_rx_rings;
2585				req->nr_tx_rings = na->num_tx_rings;
2586				req->nr_rx_slots = na->num_rx_desc;
2587				req->nr_tx_slots = na->num_tx_desc;
2588				req->nr_host_tx_rings = na->num_host_tx_rings;
2589				req->nr_host_rx_rings = na->num_host_rx_rings;
2590				error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
2591					&req->nr_mem_id);
2592				if (error) {
2593					netmap_do_unregif(priv);
2594					break;
2595				}
2596				if (memflags & NETMAP_MEM_PRIVATE) {
2597					*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2598				}
2599				for_rx_tx(t) {
2600					priv->np_si[t] = nm_si_user(priv, t) ?
2601						&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]]->si;
2602				}
2603
2604				if (req->nr_extra_bufs) {
2605					if (netmap_verbose)
2606						nm_prinf("requested %d extra buffers",
2607							req->nr_extra_bufs);
2608					req->nr_extra_bufs = netmap_extra_alloc(na,
2609						&nifp->ni_bufs_head, req->nr_extra_bufs);
2610					if (netmap_verbose)
2611						nm_prinf("got %d extra buffers", req->nr_extra_bufs);
2612				}
2613				req->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2614
2615				error = nmreq_checkoptions(hdr);
2616				if (error) {
2617					netmap_do_unregif(priv);
2618					break;
2619				}
2620
2621				/* store ifp reference so that priv destructor may release it */
2622				priv->np_ifp = ifp;
2623			} while (0);
2624			if (error) {
2625				netmap_unget_na(na, ifp);
2626			}
2627			/* release the reference from netmap_mem_find() or
2628			 * netmap_mem_ext_create()
2629			 */
2630			if (nmd)
2631				netmap_mem_put(nmd);
2632			NMG_UNLOCK();
2633			break;
2634		}
2635
2636		case NETMAP_REQ_PORT_INFO_GET: {
2637			struct nmreq_port_info_get *req =
2638				(struct nmreq_port_info_get *)(uintptr_t)hdr->nr_body;
2639			int nmd_ref = 0;
2640
2641			NMG_LOCK();
2642			do {
2643				u_int memflags;
2644
2645				if (hdr->nr_name[0] != '\0') {
2646					/* Build a nmreq_register out of the nmreq_port_info_get,
2647					 * so that we can call netmap_get_na(). */
2648					struct nmreq_register regreq;
2649					bzero(&regreq, sizeof(regreq));
2650					regreq.nr_mode = NR_REG_ALL_NIC;
2651					regreq.nr_tx_slots = req->nr_tx_slots;
2652					regreq.nr_rx_slots = req->nr_rx_slots;
2653					regreq.nr_tx_rings = req->nr_tx_rings;
2654					regreq.nr_rx_rings = req->nr_rx_rings;
2655					regreq.nr_host_tx_rings = req->nr_host_tx_rings;
2656					regreq.nr_host_rx_rings = req->nr_host_rx_rings;
2657					regreq.nr_mem_id = req->nr_mem_id;
2658
2659					/* get a refcount */
2660					hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2661					hdr->nr_body = (uintptr_t)&regreq;
2662					error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2663					hdr->nr_reqtype = NETMAP_REQ_PORT_INFO_GET; /* reset type */
2664					hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2665					if (error) {
2666						na = NULL;
2667						ifp = NULL;
2668						break;
2669					}
2670					nmd = na->nm_mem; /* get memory allocator */
2671				} else {
2672					nmd = netmap_mem_find(req->nr_mem_id ? req->nr_mem_id : 1);
2673					if (nmd == NULL) {
2674						if (netmap_verbose)
2675							nm_prerr("%s: failed to find mem_id %u",
2676									hdr->nr_name,
2677									req->nr_mem_id ? req->nr_mem_id : 1);
2678						error = EINVAL;
2679						break;
2680					}
2681					nmd_ref = 1;
2682				}
2683
2684				error = netmap_mem_get_info(nmd, &req->nr_memsize, &memflags,
2685					&req->nr_mem_id);
2686				if (error)
2687					break;
2688				if (na == NULL) /* only memory info */
2689					break;
2690				netmap_update_config(na);
2691				req->nr_rx_rings = na->num_rx_rings;
2692				req->nr_tx_rings = na->num_tx_rings;
2693				req->nr_rx_slots = na->num_rx_desc;
2694				req->nr_tx_slots = na->num_tx_desc;
2695				req->nr_host_tx_rings = na->num_host_tx_rings;
2696				req->nr_host_rx_rings = na->num_host_rx_rings;
2697			} while (0);
2698			netmap_unget_na(na, ifp);
2699			if (nmd_ref)
2700				netmap_mem_put(nmd);
2701			NMG_UNLOCK();
2702			break;
2703		}
2704#ifdef WITH_VALE
2705		case NETMAP_REQ_VALE_ATTACH: {
2706			error = netmap_vale_attach(hdr, NULL /* userspace request */);
2707			break;
2708		}
2709
2710		case NETMAP_REQ_VALE_DETACH: {
2711			error = netmap_vale_detach(hdr, NULL /* userspace request */);
2712			break;
2713		}
2714
2715		case NETMAP_REQ_VALE_LIST: {
2716			error = netmap_vale_list(hdr);
2717			break;
2718		}
2719
2720		case NETMAP_REQ_PORT_HDR_SET: {
2721			struct nmreq_port_hdr *req =
2722				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2723			/* Build a nmreq_register out of the nmreq_port_hdr,
2724			 * so that we can call netmap_get_vale_na(). */
2725			struct nmreq_register regreq;
2726			bzero(&regreq, sizeof(regreq));
2727			regreq.nr_mode = NR_REG_ALL_NIC;
2728
2729			/* For now we only support virtio-net headers, and only for
2730			 * VALE ports, but this may change in future. Valid lengths
2731			 * for the virtio-net header are 0 (no header), 10 and 12. */
2732			if (req->nr_hdr_len != 0 &&
2733				req->nr_hdr_len != sizeof(struct nm_vnet_hdr) &&
2734					req->nr_hdr_len != 12) {
2735				if (netmap_verbose)
2736					nm_prerr("invalid hdr_len %u", req->nr_hdr_len);
2737				error = EINVAL;
2738				break;
2739			}
2740			NMG_LOCK();
2741			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2742			hdr->nr_body = (uintptr_t)&regreq;
2743			error = netmap_get_vale_na(hdr, &na, NULL, 0);
2744			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
2745			hdr->nr_body = (uintptr_t)req;
2746			if (na && !error) {
2747				struct netmap_vp_adapter *vpna =
2748					(struct netmap_vp_adapter *)na;
2749				na->virt_hdr_len = req->nr_hdr_len;
2750				if (na->virt_hdr_len) {
2751					vpna->mfs = NETMAP_BUF_SIZE(na);
2752				}
2753				if (netmap_verbose)
2754					nm_prinf("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
2755				netmap_adapter_put(na);
2756			} else if (!na) {
2757				error = ENXIO;
2758			}
2759			NMG_UNLOCK();
2760			break;
2761		}
2762
2763		case NETMAP_REQ_PORT_HDR_GET: {
2764			/* Get vnet-header length for this netmap port */
2765			struct nmreq_port_hdr *req =
2766				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2767			/* Build a nmreq_register out of the nmreq_port_hdr,
2768			 * so that we can call netmap_get_na(). */
2769			struct nmreq_register regreq;
2770			struct ifnet *ifp;
2771
2772			bzero(&regreq, sizeof(regreq));
2773			regreq.nr_mode = NR_REG_ALL_NIC;
2774			NMG_LOCK();
2775			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2776			hdr->nr_body = (uintptr_t)&regreq;
2777			error = netmap_get_na(hdr, &na, &ifp, NULL, 0);
2778			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
2779			hdr->nr_body = (uintptr_t)req;
2780			if (na && !error) {
2781				req->nr_hdr_len = na->virt_hdr_len;
2782			}
2783			netmap_unget_na(na, ifp);
2784			NMG_UNLOCK();
2785			break;
2786		}
2787
2788		case NETMAP_REQ_VALE_NEWIF: {
2789			error = nm_vi_create(hdr);
2790			break;
2791		}
2792
2793		case NETMAP_REQ_VALE_DELIF: {
2794			error = nm_vi_destroy(hdr->nr_name);
2795			break;
2796		}
2797
2798		case NETMAP_REQ_VALE_POLLING_ENABLE:
2799		case NETMAP_REQ_VALE_POLLING_DISABLE: {
2800			error = nm_bdg_polling(hdr);
2801			break;
2802		}
2803#endif  /* WITH_VALE */
2804		case NETMAP_REQ_POOLS_INFO_GET: {
2805			/* Get information from the memory allocator used for
2806			 * hdr->nr_name. */
2807			struct nmreq_pools_info *req =
2808				(struct nmreq_pools_info *)(uintptr_t)hdr->nr_body;
2809			NMG_LOCK();
2810			do {
2811				/* Build a nmreq_register out of the nmreq_pools_info,
2812				 * so that we can call netmap_get_na(). */
2813				struct nmreq_register regreq;
2814				bzero(&regreq, sizeof(regreq));
2815				regreq.nr_mem_id = req->nr_mem_id;
2816				regreq.nr_mode = NR_REG_ALL_NIC;
2817
2818				hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2819				hdr->nr_body = (uintptr_t)&regreq;
2820				error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2821				hdr->nr_reqtype = NETMAP_REQ_POOLS_INFO_GET; /* reset type */
2822				hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2823				if (error) {
2824					na = NULL;
2825					ifp = NULL;
2826					break;
2827				}
2828				nmd = na->nm_mem; /* grab the memory allocator */
2829				if (nmd == NULL) {
2830					error = EINVAL;
2831					break;
2832				}
2833
2834				/* Finalize the memory allocator, get the pools
2835				 * information and release the allocator. */
2836				error = netmap_mem_finalize(nmd, na);
2837				if (error) {
2838					break;
2839				}
2840				error = netmap_mem_pools_info_get(req, nmd);
2841				netmap_mem_drop(na);
2842			} while (0);
2843			netmap_unget_na(na, ifp);
2844			NMG_UNLOCK();
2845			break;
2846		}
2847
2848		case NETMAP_REQ_CSB_ENABLE: {
2849			struct nmreq_option *opt;
2850
2851			opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
2852			if (opt == NULL) {
2853				error = EINVAL;
2854			} else {
2855				struct nmreq_opt_csb *csbo =
2856					(struct nmreq_opt_csb *)opt;
2857				NMG_LOCK();
2858				error = netmap_csb_validate(priv, csbo);
2859				NMG_UNLOCK();
2860				opt->nro_status = error;
2861			}
2862			break;
2863		}
2864
2865		case NETMAP_REQ_SYNC_KLOOP_START: {
2866			error = netmap_sync_kloop(priv, hdr);
2867			break;
2868		}
2869
2870		case NETMAP_REQ_SYNC_KLOOP_STOP: {
2871			error = netmap_sync_kloop_stop(priv);
2872			break;
2873		}
2874
2875		default: {
2876			error = EINVAL;
2877			break;
2878		}
2879		}
2880		/* Write back request body to userspace and reset the
2881		 * user-space pointer. */
2882		error = nmreq_copyout(hdr, error);
2883		break;
2884	}
2885
2886	case NIOCTXSYNC:
2887	case NIOCRXSYNC: {
2888		if (unlikely(priv->np_nifp == NULL)) {
2889			error = ENXIO;
2890			break;
2891		}
2892		mb(); /* make sure following reads are not from cache */
2893
2894		if (unlikely(priv->np_csb_atok_base)) {
2895			nm_prerr("Invalid sync in CSB mode");
2896			error = EBUSY;
2897			break;
2898		}
2899
2900		na = priv->np_na;      /* we have a reference */
2901
2902		mbq_init(&q);
2903		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2904		krings = NMR(na, t);
2905		qfirst = priv->np_qfirst[t];
2906		qlast = priv->np_qlast[t];
2907		sync_flags = priv->np_sync_flags;
2908
2909		for (i = qfirst; i < qlast; i++) {
2910			struct netmap_kring *kring = krings[i];
2911			struct netmap_ring *ring = kring->ring;
2912
2913			if (unlikely(nm_kr_tryget(kring, 1, &error))) {
2914				error = (error ? EIO : 0);
2915				continue;
2916			}
2917
2918			if (cmd == NIOCTXSYNC) {
2919				if (netmap_debug & NM_DEBUG_TXSYNC)
2920					nm_prinf("pre txsync ring %d cur %d hwcur %d",
2921					    i, ring->cur,
2922					    kring->nr_hwcur);
2923				if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2924					netmap_ring_reinit(kring);
2925				} else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
2926					nm_sync_finalize(kring);
2927				}
2928				if (netmap_debug & NM_DEBUG_TXSYNC)
2929					nm_prinf("post txsync ring %d cur %d hwcur %d",
2930					    i, ring->cur,
2931					    kring->nr_hwcur);
2932			} else {
2933				if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2934					netmap_ring_reinit(kring);
2935				}
2936				if (nm_may_forward_up(kring)) {
2937					/* transparent forwarding, see netmap_poll() */
2938					netmap_grab_packets(kring, &q, netmap_fwd);
2939				}
2940				if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
2941					nm_sync_finalize(kring);
2942				}
2943				ring_timestamp_set(ring);
2944			}
2945			nm_kr_put(kring);
2946		}
2947
2948		if (mbq_peek(&q)) {
2949			netmap_send_up(na->ifp, &q);
2950		}
2951
2952		break;
2953	}
2954
2955	default: {
2956		return netmap_ioctl_legacy(priv, cmd, data, td);
2957		break;
2958	}
2959	}
2960
2961	return (error);
2962}
2963
2964size_t
2965nmreq_size_by_type(uint16_t nr_reqtype)
2966{
2967	switch (nr_reqtype) {
2968	case NETMAP_REQ_REGISTER:
2969		return sizeof(struct nmreq_register);
2970	case NETMAP_REQ_PORT_INFO_GET:
2971		return sizeof(struct nmreq_port_info_get);
2972	case NETMAP_REQ_VALE_ATTACH:
2973		return sizeof(struct nmreq_vale_attach);
2974	case NETMAP_REQ_VALE_DETACH:
2975		return sizeof(struct nmreq_vale_detach);
2976	case NETMAP_REQ_VALE_LIST:
2977		return sizeof(struct nmreq_vale_list);
2978	case NETMAP_REQ_PORT_HDR_SET:
2979	case NETMAP_REQ_PORT_HDR_GET:
2980		return sizeof(struct nmreq_port_hdr);
2981	case NETMAP_REQ_VALE_NEWIF:
2982		return sizeof(struct nmreq_vale_newif);
2983	case NETMAP_REQ_VALE_DELIF:
2984	case NETMAP_REQ_SYNC_KLOOP_STOP:
2985	case NETMAP_REQ_CSB_ENABLE:
2986		return 0;
2987	case NETMAP_REQ_VALE_POLLING_ENABLE:
2988	case NETMAP_REQ_VALE_POLLING_DISABLE:
2989		return sizeof(struct nmreq_vale_polling);
2990	case NETMAP_REQ_POOLS_INFO_GET:
2991		return sizeof(struct nmreq_pools_info);
2992	case NETMAP_REQ_SYNC_KLOOP_START:
2993		return sizeof(struct nmreq_sync_kloop_start);
2994	}
2995	return 0;
2996}
2997
2998static size_t
2999nmreq_opt_size_by_type(uint32_t nro_reqtype, uint64_t nro_size)
3000{
3001	size_t rv = sizeof(struct nmreq_option);
3002#ifdef NETMAP_REQ_OPT_DEBUG
3003	if (nro_reqtype & NETMAP_REQ_OPT_DEBUG)
3004		return (nro_reqtype & ~NETMAP_REQ_OPT_DEBUG);
3005#endif /* NETMAP_REQ_OPT_DEBUG */
3006	switch (nro_reqtype) {
3007#ifdef WITH_EXTMEM
3008	case NETMAP_REQ_OPT_EXTMEM:
3009		rv = sizeof(struct nmreq_opt_extmem);
3010		break;
3011#endif /* WITH_EXTMEM */
3012	case NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS:
3013		if (nro_size >= rv)
3014			rv = nro_size;
3015		break;
3016	case NETMAP_REQ_OPT_CSB:
3017		rv = sizeof(struct nmreq_opt_csb);
3018		break;
3019	case NETMAP_REQ_OPT_SYNC_KLOOP_MODE:
3020		rv = sizeof(struct nmreq_opt_sync_kloop_mode);
3021		break;
3022	}
3023	/* subtract the common header */
3024	return rv - sizeof(struct nmreq_option);
3025}
3026
3027/*
3028 * nmreq_copyin: create an in-kernel version of the request.
3029 *
3030 * We build the following data structure:
3031 *
3032 * hdr -> +-------+                buf
3033 *        |       |          +---------------+
3034 *        +-------+          |usr body ptr   |
3035 *        |options|-.        +---------------+
3036 *        +-------+ |        |usr options ptr|
3037 *        |body   |--------->+---------------+
3038 *        +-------+ |        |               |
3039 *                  |        |  copy of body |
3040 *                  |        |               |
3041 *                  |        +---------------+
3042 *                  |        |    NULL       |
3043 *                  |        +---------------+
3044 *                  |    .---|               |\
3045 *                  |    |   +---------------+ |
3046 *                  | .------|               | |
3047 *                  | |  |   +---------------+  \ option table
3048 *                  | |  |   |      ...      |  / indexed by option
3049 *                  | |  |   +---------------+ |  type
3050 *                  | |  |   |               | |
3051 *                  | |  |   +---------------+/
3052 *                  | |  |   |usr next ptr 1 |
3053 *                  `-|----->+---------------+
3054 *                    |  |   | copy of opt 1 |
3055 *                    |  |   |               |
3056 *                    |  | .-| nro_next      |
3057 *                    |  | | +---------------+
3058 *                    |  | | |usr next ptr 2 |
3059 *                    |  `-`>+---------------+
3060 *                    |      | copy of opt 2 |
3061 *                    |      |               |
3062 *                    |    .-| nro_next      |
3063 *                    |    | +---------------+
3064 *                    |    | |               |
3065 *                    ~    ~ ~      ...      ~
3066 *                    |    .-|               |
3067 *                    `----->+---------------+
3068 *                         | |usr next ptr n |
3069 *                         `>+---------------+
3070 *                           | copy of opt n |
3071 *                           |               |
3072 *                           | nro_next(NULL)|
3073 *                           +---------------+
3074 *
3075 * The options and body fields of the hdr structure are overwritten
3076 * with in-kernel valid pointers inside the buf. The original user
3077 * pointers are saved in the buf and restored on copyout.
3078 * The list of options is copied and the pointers adjusted. The
3079 * original pointers are saved just before the option they belonged to.
3080 *
3081 * The option table has an entry for every available option.  Entries
3082 * for options that have not been passed contain NULL.
3083 *
3084 */
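
/*
 * Size accounting example (illustrative): for a NETMAP_REQ_REGISTER
 * request carrying a single NETMAP_REQ_OPT_CSB option, the buffer
 * holds the two saved user pointers, a copy of struct nmreq_register,
 * the NETMAP_REQ_OPT_MAX-entry option table (one pointer per possible
 * option type), one saved user "next" pointer and a copy of
 * struct nmreq_opt_csb for the option itself.
 */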
3085
3086int
3087nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user)
3088{
3089	size_t rqsz, optsz, bufsz;
3090	int error = 0;
3091	char *ker = NULL, *p;
3092	struct nmreq_option **next, *src, **opt_tab;
3093	struct nmreq_option buf;
3094	uint64_t *ptrs;
3095
3096	if (hdr->nr_reserved) {
3097		if (netmap_verbose)
3098			nm_prerr("nr_reserved must be zero");
3099		return EINVAL;
3100	}
3101
3102	if (!nr_body_is_user)
3103		return 0;
3104
3105	hdr->nr_reserved = nr_body_is_user;
3106
3107	/* compute the total size of the buffer */
3108	rqsz = nmreq_size_by_type(hdr->nr_reqtype);
3109	if (rqsz > NETMAP_REQ_MAXSIZE) {
3110		error = EMSGSIZE;
3111		goto out_err;
3112	}
3113	if ((rqsz && hdr->nr_body == (uintptr_t)NULL) ||
3114		(!rqsz && hdr->nr_body != (uintptr_t)NULL)) {
3115		/* Request body expected, but not found; or
3116		 * request body found but unexpected. */
3117		if (netmap_verbose)
3118			nm_prerr("nr_body expected but not found, or vice versa");
3119		error = EINVAL;
3120		goto out_err;
3121	}
3122
3123	bufsz = 2 * sizeof(void *) + rqsz +
3124		NETMAP_REQ_OPT_MAX * sizeof(opt_tab);
3125	/* compute the size of the buf below the option table.
3126	 * It must contain a copy of every received option structure.
3127	 * For every option we also need to store a copy of the user
3128	 * list pointer.
3129	 */
3130	optsz = 0;
3131	for (src = (struct nmreq_option *)(uintptr_t)hdr->nr_options; src;
3132	     src = (struct nmreq_option *)(uintptr_t)buf.nro_next)
3133	{
3134		error = copyin(src, &buf, sizeof(*src));
3135		if (error)
3136			goto out_err;
3137		optsz += sizeof(*src);
3138		optsz += nmreq_opt_size_by_type(buf.nro_reqtype, buf.nro_size);
3139		if (rqsz + optsz > NETMAP_REQ_MAXSIZE) {
3140			error = EMSGSIZE;
3141			goto out_err;
3142		}
3143		bufsz += sizeof(void *);
3144	}
3145	bufsz += optsz;
3146
3147	ker = nm_os_malloc(bufsz);
3148	if (ker == NULL) {
3149		error = ENOMEM;
3150		goto out_err;
3151	}
3152	p = ker;	/* write pointer into the buffer */
3153
3154	/* make a copy of the user pointers */
3155	ptrs = (uint64_t*)p;
3156	*ptrs++ = hdr->nr_body;
3157	*ptrs++ = hdr->nr_options;
3158	p = (char *)ptrs;
3159
3160	/* copy the body */
3161	error = copyin((void *)(uintptr_t)hdr->nr_body, p, rqsz);
3162	if (error)
3163		goto out_restore;
3164	/* overwrite the user pointer with the in-kernel one */
3165	hdr->nr_body = (uintptr_t)p;
3166	p += rqsz;
3167	/* start of the options table */
3168	opt_tab = (struct nmreq_option **)p;
3169	p += sizeof(opt_tab) * NETMAP_REQ_OPT_MAX;
3170
3171	/* copy the options */
3172	next = (struct nmreq_option **)&hdr->nr_options;
3173	src = *next;
3174	while (src) {
3175		struct nmreq_option *opt;
3176
3177		/* copy the option header */
3178		ptrs = (uint64_t *)p;
3179		opt = (struct nmreq_option *)(ptrs + 1);
3180		error = copyin(src, opt, sizeof(*src));
3181		if (error)
3182			goto out_restore;
3183		/* make a copy of the user next pointer */
3184		*ptrs = opt->nro_next;
3185		/* overwrite the user pointer with the in-kernel one */
3186		*next = opt;
3187
3188		/* initialize the option as not supported.
3189		 * Recognized options will update this field.
3190		 */
3191		opt->nro_status = EOPNOTSUPP;
3192
3193		/* check for invalid types */
3194		if (opt->nro_reqtype < 1) {
3195			if (netmap_verbose)
3196				nm_prinf("invalid option type: %u", opt->nro_reqtype);
3197			opt->nro_status = EINVAL;
3198			error = EINVAL;
3199			goto next;
3200		}
3201
3202		if (opt->nro_reqtype >= NETMAP_REQ_OPT_MAX) {
3203			/* opt->nro_status is already EOPNOTSUPP */
3204			error = EOPNOTSUPP;
3205			goto next;
3206		}
3207
3208		/* if the type is valid, index the option in the table
3209		 * unless it is a duplicate.
3210		 */
3211		if (opt_tab[opt->nro_reqtype] != NULL) {
3212			if (netmap_verbose)
3213				nm_prinf("duplicate option: %u", opt->nro_reqtype);
3214			opt->nro_status = EINVAL;
3215			opt_tab[opt->nro_reqtype]->nro_status = EINVAL;
3216			error = EINVAL;
3217			goto next;
3218		}
3219		opt_tab[opt->nro_reqtype] = opt;
3220
3221		p = (char *)(opt + 1);
3222
3223		/* copy the option body */
3224		optsz = nmreq_opt_size_by_type(opt->nro_reqtype,
3225						opt->nro_size);
3226		if (optsz) {
3227			/* the option body follows the option header */
3228			error = copyin(src + 1, p, optsz);
3229			if (error)
3230				goto out_restore;
3231			p += optsz;
3232		}
3233
3234	next:
3235		/* move to next option */
3236		next = (struct nmreq_option **)&opt->nro_next;
3237		src = *next;
3238	}
3239	if (error)
3240		nmreq_copyout(hdr, error);
3241	return error;
3242
3243out_restore:
3244	ptrs = (uint64_t *)ker;
3245	hdr->nr_body = *ptrs++;
3246	hdr->nr_options = *ptrs++;
3247	hdr->nr_reserved = 0;
3248	nm_os_free(ker);
3249out_err:
3250	return error;
3251}
3252
3253static int
3254nmreq_copyout(struct nmreq_header *hdr, int rerror)
3255{
3256	struct nmreq_option *src, *dst;
3257	void *ker = (void *)(uintptr_t)hdr->nr_body, *bufstart;
3258	uint64_t *ptrs;
3259	size_t bodysz;
3260	int error;
3261
3262	if (!hdr->nr_reserved)
3263		return rerror;
3264
3265	/* restore the user pointers in the header */
3266	ptrs = (uint64_t *)ker - 2;
3267	bufstart = ptrs;
3268	hdr->nr_body = *ptrs++;
3269	src = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3270	hdr->nr_options = *ptrs;
3271
3272	if (!rerror) {
3273		/* copy the body */
3274		bodysz = nmreq_size_by_type(hdr->nr_reqtype);
3275		error = copyout(ker, (void *)(uintptr_t)hdr->nr_body, bodysz);
3276		if (error) {
3277			rerror = error;
3278			goto out;
3279		}
3280	}
3281
3282	/* copy the options */
3283	dst = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3284	while (src) {
3285		size_t optsz;
3286		uint64_t next;
3287
3288		/* restore the user pointer */
3289		next = src->nro_next;
3290		ptrs = (uint64_t *)src - 1;
3291		src->nro_next = *ptrs;
3292
3293		/* always copy the option header */
3294		error = copyout(src, dst, sizeof(*src));
3295		if (error) {
3296			rerror = error;
3297			goto out;
3298		}
3299
3300		/* copy the option body only if there was no error */
3301		if (!rerror && !src->nro_status) {
3302			optsz = nmreq_opt_size_by_type(src->nro_reqtype,
3303							src->nro_size);
3304			if (optsz) {
3305				error = copyout(src + 1, dst + 1, optsz);
3306				if (error) {
3307					rerror = error;
3308					goto out;
3309				}
3310			}
3311		}
3312		src = (struct nmreq_option *)(uintptr_t)next;
3313		dst = (struct nmreq_option *)(uintptr_t)*ptrs;
3314	}
3315
3316
3317out:
3318	hdr->nr_reserved = 0;
3319	nm_os_free(bufstart);
3320	return rerror;
3321}
3322
3323struct nmreq_option *
3324nmreq_getoption(struct nmreq_header *hdr, uint16_t reqtype)
3325{
3326	struct nmreq_option **opt_tab;
3327
3328	if (!hdr->nr_options)
3329		return NULL;
3330
3331	opt_tab = (struct nmreq_option **)((uintptr_t)hdr->nr_options) -
3332	    (NETMAP_REQ_OPT_MAX + 1);
3333	return opt_tab[reqtype];
3334}
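
/*
 * Typical in-kernel usage sketch (see the NETMAP_REQ_CSB_ENABLE case
 * above): after nmreq_copyin() has built the option table,
 *
 *	opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
 *	if (opt != NULL) {
 *		... handle the option ...
 *		opt->nro_status = error;
 *	}
 *
 * and nmreq_checkoptions() later reports any option whose nro_status
 * is still EOPNOTSUPP.
 */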
3335
3336static int
3337nmreq_checkoptions(struct nmreq_header *hdr)
3338{
3339	struct nmreq_option *opt;
3340	/* return error if there is still any option
3341	 * marked as not supported
3342	 */
3343
3344	for (opt = (struct nmreq_option *)(uintptr_t)hdr->nr_options; opt;
3345	     opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3346		if (opt->nro_status == EOPNOTSUPP)
3347			return EOPNOTSUPP;
3348
3349	return 0;
3350}
3351
3352/*
3353 * select(2) and poll(2) handlers for the "netmap" device.
3354 *
3355 * Can be called for one or more queues.
3356 * Return the event mask corresponding to ready events.
3357 * If there are no ready events (and 'sr' is not NULL), do a
3358 * selrecord on either individual selinfo or on the global one.
3359 * Device-dependent parts (locking and sync of tx/rx rings)
3360 * are done through callbacks.
3361 *
3362 * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
3363 * The first one is remapped to pwait as selrecord() uses the name as a
3364 * hidden argument.
3365 */
3366int
3367netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
3368{
3369	struct netmap_adapter *na;
3370	struct netmap_kring *kring;
3371	struct netmap_ring *ring;
3372	u_int i, want[NR_TXRX], revents = 0;
3373	NM_SELINFO_T *si[NR_TXRX];
3374#define want_tx want[NR_TX]
3375#define want_rx want[NR_RX]
3376	struct mbq q;	/* packets from RX hw queues to host stack */
3377
3378	/*
3379	 * In order to avoid nested locks, we need to "double check"
3380	 * txsync and rxsync if we decide to do a selrecord().
3381	 * retry_tx (and retry_rx, later) prevent looping forever.
3382	 */
3383	int retry_tx = 1, retry_rx = 1;
3384
3385	/* Transparent mode: send_down is 1 if we have found some
3386	 * packets to forward (host RX ring --> NIC) during the rx
3387	 * scan and we have not sent them down to the NIC yet.
3388	 * Transparent mode requires to bind all rings to a single
3389	 * file descriptor.
3390	 */
3391	int send_down = 0;
3392	int sync_flags = priv->np_sync_flags;
3393
3394	mbq_init(&q);
3395
3396	if (unlikely(priv->np_nifp == NULL)) {
3397		return POLLERR;
3398	}
3399	mb(); /* make sure following reads are not from cache */
3400
3401	na = priv->np_na;
3402
3403	if (unlikely(!nm_netmap_on(na)))
3404		return POLLERR;
3405
3406	if (unlikely(priv->np_csb_atok_base)) {
3407		nm_prerr("Invalid poll in CSB mode");
3408		return POLLERR;
3409	}
3410
3411	if (netmap_debug & NM_DEBUG_ON)
3412		nm_prinf("device %s events 0x%x", na->name, events);
3413	want_tx = events & (POLLOUT | POLLWRNORM);
3414	want_rx = events & (POLLIN | POLLRDNORM);
3415
3416	/*
3417	 * If the card has more than one queue AND the file descriptor is
3418	 * bound to all of them, we sleep on the "global" selinfo, otherwise
3419	 * we sleep on individual selinfo (FreeBSD only allows two selinfo's
3420	 * per file descriptor).
	 * The interrupt routine in the driver wakes one or the other
	 * (or both) depending on which clients are active.
	 *
	 * rxsync() is only called if we run out of buffers on a POLLIN.
	 * txsync() is called if we run out of buffers on POLLOUT, or
	 * there are pending packets to send. The latter can be disabled
	 * by passing NETMAP_NO_TX_POLL at registration time (NIOCREGIF).
3428	 */
3429	si[NR_RX] = priv->np_si[NR_RX];
3430	si[NR_TX] = priv->np_si[NR_TX];
3431
3432#ifdef __FreeBSD__
3433	/*
	 * We start with a lock-free round, which is cheap if we have
3435	 * slots available. If this fails, then lock and call the sync
3436	 * routines. We can't do this on Linux, as the contract says
3437	 * that we must call nm_os_selrecord() unconditionally.
3438	 */
3439	if (want_tx) {
3440		const enum txrx t = NR_TX;
3441		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3442			kring = NMR(na, t)[i];
3443			if (kring->ring->cur != kring->ring->tail) {
				/* Some unseen TX space is available, so
				 * we do not need to run txsync. */
3446				revents |= want[t];
3447				want[t] = 0;
3448				break;
3449			}
3450		}
3451	}
3452	if (want_rx) {
3453		const enum txrx t = NR_RX;
3454		int rxsync_needed = 0;
3455
3456		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3457			kring = NMR(na, t)[i];
3458			if (kring->ring->cur == kring->ring->tail
3459				|| kring->rhead != kring->ring->head) {
3460				/* There are no unseen packets on this ring,
3461				 * or there are some buffers to be returned
3462				 * to the netmap port. We therefore go ahead
3463				 * and run rxsync. */
3464				rxsync_needed = 1;
3465				break;
3466			}
3467		}
3468		if (!rxsync_needed) {
3469			revents |= want_rx;
3470			want_rx = 0;
3471		}
3472	}
3473#endif
3474
3475#ifdef linux
3476	/* The selrecord must be unconditional on linux. */
3477	nm_os_selrecord(sr, si[NR_RX]);
3478	nm_os_selrecord(sr, si[NR_TX]);
3479#endif /* linux */
3480
3481	/*
3482	 * If we want to push packets out (priv->np_txpoll) or
3483	 * want_tx is still set, we must issue txsync calls
	 * (on all rings, to avoid stalling the tx rings).
3485	 * Fortunately, normal tx mode has np_txpoll set.
3486	 */
3487	if (priv->np_txpoll || want_tx) {
3488		/*
		 * The first round checks if anyone is ready; if not,
		 * do a selrecord and a second round to handle races.
3491		 * want_tx goes to 0 if any space is found, and is
3492		 * used to skip rings with no pending transmissions.
3493		 */
3494flush_tx:
3495		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
3496			int found = 0;
3497
3498			kring = na->tx_rings[i];
3499			ring = kring->ring;
3500
3501			/*
3502			 * Don't try to txsync this TX ring if we already found some
3503			 * space in some of the TX rings (want_tx == 0) and there are no
3504			 * TX slots in this ring that need to be flushed to the NIC
3505			 * (head == hwcur).
3506			 */
3507			if (!send_down && !want_tx && ring->head == kring->nr_hwcur)
3508				continue;
3509
3510			if (nm_kr_tryget(kring, 1, &revents))
3511				continue;
3512
3513			if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3514				netmap_ring_reinit(kring);
3515				revents |= POLLERR;
3516			} else {
3517				if (kring->nm_sync(kring, sync_flags))
3518					revents |= POLLERR;
3519				else
3520					nm_sync_finalize(kring);
3521			}
3522
3523			/*
3524			 * If we found new slots, notify potential
3525			 * listeners on the same ring.
3526			 * Since we just did a txsync, look at the copies
3527			 * of cur,tail in the kring.
3528			 */
3529			found = kring->rcur != kring->rtail;
3530			nm_kr_put(kring);
3531			if (found) { /* notify other listeners */
3532				revents |= want_tx;
3533				want_tx = 0;
3534#ifndef linux
3535				kring->nm_notify(kring, 0);
#endif /* !linux */
3537			}
3538		}
		/* if there were any packets to forward, we must have handled them by now */
3540		send_down = 0;
3541		if (want_tx && retry_tx && sr) {
3542#ifndef linux
3543			nm_os_selrecord(sr, si[NR_TX]);
3544#endif /* !linux */
3545			retry_tx = 0;
3546			goto flush_tx;
3547		}
3548	}
3549
3550	/*
3551	 * If want_rx is still set scan receive rings.
3552	 * Do it on all rings because otherwise we starve.
3553	 */
3554	if (want_rx) {
3555		/* two rounds here for race avoidance */
3556do_retry_rx:
3557		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
3558			int found = 0;
3559
3560			kring = na->rx_rings[i];
3561			ring = kring->ring;
3562
3563			if (unlikely(nm_kr_tryget(kring, 1, &revents)))
3564				continue;
3565
3566			if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3567				netmap_ring_reinit(kring);
3568				revents |= POLLERR;
3569			}
3570			/* now we can use kring->rcur, rtail */
3571
3572			/*
3573			 * transparent mode support: collect packets from
3574			 * hw rxring(s) that have been released by the user
3575			 */
3576			if (nm_may_forward_up(kring)) {
3577				netmap_grab_packets(kring, &q, netmap_fwd);
3578			}
3579
3580			/* Clear the NR_FORWARD flag anyway, it may be set by
			 * the nm_sync() below only for the host RX ring (see
3582			 * netmap_rxsync_from_host()). */
3583			kring->nr_kflags &= ~NR_FORWARD;
3584			if (kring->nm_sync(kring, sync_flags))
3585				revents |= POLLERR;
3586			else
3587				nm_sync_finalize(kring);
3588			send_down |= (kring->nr_kflags & NR_FORWARD);
3589			ring_timestamp_set(ring);
3590			found = kring->rcur != kring->rtail;
3591			nm_kr_put(kring);
3592			if (found) {
3593				revents |= want_rx;
3594				retry_rx = 0;
3595#ifndef linux
3596				kring->nm_notify(kring, 0);
#endif /* !linux */
3598			}
3599		}
3600
3601#ifndef linux
3602		if (retry_rx && sr) {
3603			nm_os_selrecord(sr, si[NR_RX]);
3604		}
3605#endif /* !linux */
3606		if (send_down || retry_rx) {
3607			retry_rx = 0;
3608			if (send_down)
3609				goto flush_tx; /* and retry_rx */
3610			else
3611				goto do_retry_rx;
3612		}
3613	}
3614
3615	/*
3616	 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
3617	 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
3618	 * to the host stack.
3619	 */
3620
3621	if (mbq_peek(&q)) {
3622		netmap_send_up(na->ifp, &q);
3623	}
3624
3625	return (revents);
3626#undef want_tx
3627#undef want_rx
3628}
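
/*
 * Illustrative userspace sketch (assumes the convenience API from
 * netmap_user.h and an example interface "em0"): the typical event loop
 * that ends up blocking in netmap_poll() above.
 *
 *	struct nm_desc *d = nm_open("netmap:em0", NULL, 0, NULL);
 *	struct pollfd pfd = { .fd = NETMAP_FD(d), .events = POLLIN };
 *
 *	for (;;) {
 *		poll(&pfd, 1, 2500);		// sleeps in netmap_poll()
 *		for (u_int ri = d->first_rx_ring; ri <= d->last_rx_ring; ri++) {
 *			struct netmap_ring *ring = NETMAP_RXRING(d->nifp, ri);
 *
 *			while (!nm_ring_empty(ring)) {
 *				// consume the slot at ring->head, then release it
 *				ring->head = ring->cur = nm_ring_next(ring, ring->head);
 *			}
 *		}
 *	}
 *	nm_close(d);
 */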
3629
3630int
3631nma_intr_enable(struct netmap_adapter *na, int onoff)
3632{
3633	bool changed = false;
3634	enum txrx t;
3635	int i;
3636
3637	for_rx_tx(t) {
3638		for (i = 0; i < nma_get_nrings(na, t); i++) {
3639			struct netmap_kring *kring = NMR(na, t)[i];
3640			int on = !(kring->nr_kflags & NKR_NOINTR);
3641
3642			if (!!onoff != !!on) {
3643				changed = true;
3644			}
3645			if (onoff) {
3646				kring->nr_kflags &= ~NKR_NOINTR;
3647			} else {
3648				kring->nr_kflags |= NKR_NOINTR;
3649			}
3650		}
3651	}
3652
3653	if (!changed) {
3654		return 0; /* nothing to do */
3655	}
3656
3657	if (!na->nm_intr) {
3658		nm_prerr("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
3659		  na->name);
3660		return -1;
3661	}
3662
3663	na->nm_intr(na, onoff);
3664
3665	return 0;
3666}
3667
3668
3669/*-------------------- driver support routines -------------------*/
3670
3671/* default notify callback */
3672static int
3673netmap_notify(struct netmap_kring *kring, int flags)
3674{
3675	struct netmap_adapter *na = kring->notify_na;
3676	enum txrx t = kring->tx;
3677
3678	nm_os_selwakeup(&kring->si);
3679	/* optimization: avoid a wake up on the global
3680	 * queue if nobody has registered for more
3681	 * than one ring
3682	 */
3683	if (na->si_users[t] > 0)
3684		nm_os_selwakeup(&na->si[t]);
3685
3686	return NM_IRQ_COMPLETED;
3687}
3688
3689/* called by all routines that create netmap_adapters.
3690 * provide some defaults and get a reference to the
3691 * memory allocator
3692 */
3693int
3694netmap_attach_common(struct netmap_adapter *na)
3695{
3696	if (!na->rx_buf_maxsize) {
3697		/* Set a conservative default (larger is safer). */
3698		na->rx_buf_maxsize = PAGE_SIZE;
3699	}
3700
3701#ifdef __FreeBSD__
3702	if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
3703		na->if_input = na->ifp->if_input; /* for netmap_send_up */
3704	}
3705	na->pdev = na; /* make sure netmap_mem_map() is called */
3706#endif /* __FreeBSD__ */
3707	if (na->na_flags & NAF_HOST_RINGS) {
3708		if (na->num_host_rx_rings == 0)
3709			na->num_host_rx_rings = 1;
3710		if (na->num_host_tx_rings == 0)
3711			na->num_host_tx_rings = 1;
3712	}
3713	if (na->nm_krings_create == NULL) {
3714		/* we assume that we have been called by a driver,
3715		 * since other port types all provide their own
3716		 * nm_krings_create
3717		 */
3718		na->nm_krings_create = netmap_hw_krings_create;
3719		na->nm_krings_delete = netmap_hw_krings_delete;
3720	}
3721	if (na->nm_notify == NULL)
3722		na->nm_notify = netmap_notify;
3723	na->active_fds = 0;
3724
3725	if (na->nm_mem == NULL) {
3726		/* use the global allocator */
3727		na->nm_mem = netmap_mem_get(&nm_mem);
3728	}
3729#ifdef WITH_VALE
3730	if (na->nm_bdg_attach == NULL)
3731		/* no special nm_bdg_attach callback. On VALE
3732		 * attach, we need to interpose a bwrap
3733		 */
3734		na->nm_bdg_attach = netmap_default_bdg_attach;
3735#endif
3736
3737	return 0;
3738}
3739
/* Wrapper for the register callback provided by netmap-enabled
3741 * hardware drivers.
3742 * nm_iszombie(na) means that the driver module has been
3743 * unloaded, so we cannot call into it.
3744 * nm_os_ifnet_lock() must guarantee mutual exclusion with
3745 * module unloading.
3746 */
3747static int
3748netmap_hw_reg(struct netmap_adapter *na, int onoff)
3749{
3750	struct netmap_hw_adapter *hwna =
3751		(struct netmap_hw_adapter*)na;
3752	int error = 0;
3753
3754	nm_os_ifnet_lock();
3755
3756	if (nm_iszombie(na)) {
3757		if (onoff) {
3758			error = ENXIO;
3759		} else if (na != NULL) {
3760			na->na_flags &= ~NAF_NETMAP_ON;
3761		}
3762		goto out;
3763	}
3764
3765	error = hwna->nm_hw_register(na, onoff);
3766
3767out:
3768	nm_os_ifnet_unlock();
3769
3770	return error;
3771}
3772
3773static void
3774netmap_hw_dtor(struct netmap_adapter *na)
3775{
3776	if (na->ifp == NULL)
3777		return;
3778
3779	NM_DETACH_NA(na->ifp);
3780}
3781
3782
3783/*
3784 * Allocate a netmap_adapter object, and initialize it from the
3785 * 'arg' passed by the driver on attach.
3786 * We allocate a block of memory of 'size' bytes, which has room
3787 * for struct netmap_adapter plus additional room private to
3788 * the caller.
 * Return 0 on success, or an error code (EINVAL, EBUSY or ENOMEM) otherwise.
3790 */
3791int
3792netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
3793{
3794	struct netmap_hw_adapter *hwna = NULL;
3795	struct ifnet *ifp = NULL;
3796
3797	if (size < sizeof(struct netmap_hw_adapter)) {
3798		if (netmap_debug & NM_DEBUG_ON)
3799			nm_prerr("Invalid netmap adapter size %d", (int)size);
3800		return EINVAL;
3801	}
3802
3803	if (arg == NULL || arg->ifp == NULL) {
3804		if (netmap_debug & NM_DEBUG_ON)
3805			nm_prerr("either arg or arg->ifp is NULL");
3806		return EINVAL;
3807	}
3808
3809	if (arg->num_tx_rings == 0 || arg->num_rx_rings == 0) {
3810		if (netmap_debug & NM_DEBUG_ON)
3811			nm_prerr("%s: invalid rings tx %d rx %d",
3812				arg->name, arg->num_tx_rings, arg->num_rx_rings);
3813		return EINVAL;
3814	}
3815
3816	ifp = arg->ifp;
3817	if (NM_NA_CLASH(ifp)) {
3818		/* If NA(ifp) is not null but there is no valid netmap
3819		 * adapter it means that someone else is using the same
3820		 * pointer (e.g. ax25_ptr on linux). This happens for
3821		 * instance when also PF_RING is in use. */
3822		nm_prerr("Error: netmap adapter hook is busy");
3823		return EBUSY;
3824	}
3825
3826	hwna = nm_os_malloc(size);
3827	if (hwna == NULL)
3828		goto fail;
3829	hwna->up = *arg;
3830	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
3831	strlcpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
3832	if (override_reg) {
3833		hwna->nm_hw_register = hwna->up.nm_register;
3834		hwna->up.nm_register = netmap_hw_reg;
3835	}
3836	if (netmap_attach_common(&hwna->up)) {
3837		nm_os_free(hwna);
3838		goto fail;
3839	}
3840	netmap_adapter_get(&hwna->up);
3841
3842	NM_ATTACH_NA(ifp, &hwna->up);
3843
3844	nm_os_onattach(ifp);
3845
3846	if (arg->nm_dtor == NULL) {
3847		hwna->up.nm_dtor = netmap_hw_dtor;
3848	}
3849
3850	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
3851	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
3852	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
3853	return 0;
3854
3855fail:
3856	nm_prerr("fail, arg %p ifp %p na %p", arg, ifp, hwna);
3857	return (hwna ? EINVAL : ENOMEM);
3858}
3859
3860
3861int
3862netmap_attach(struct netmap_adapter *arg)
3863{
3864	return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
3865			1 /* override nm_reg */);
3866}
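
/*
 * Illustrative sketch (hypothetical driver "foo"; the softc fields and
 * the foo_netmap_* callbacks are placeholders): how a native driver
 * typically fills the 'arg' structure and attaches to netmap at the end
 * of its own attach routine.
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = sc->ifp;
 *	na.num_tx_desc = sc->num_tx_desc;
 *	na.num_rx_desc = sc->num_rx_desc;
 *	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *	na.rx_buf_maxsize = sc->rx_buf_size;
 *	na.nm_register = foo_netmap_reg;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	netmap_attach(&na);
 */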
3867
3868
3869void
3870NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
3871{
3872	if (!na) {
3873		return;
3874	}
3875
3876	refcount_acquire(&na->na_refcount);
3877}
3878
3879
3880/* returns 1 iff the netmap_adapter is destroyed */
3881int
3882NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
3883{
3884	if (!na)
3885		return 1;
3886
3887	if (!refcount_release(&na->na_refcount))
3888		return 0;
3889
3890	if (na->nm_dtor)
3891		na->nm_dtor(na);
3892
3893	if (na->tx_rings) { /* XXX should not happen */
3894		if (netmap_debug & NM_DEBUG_ON)
3895			nm_prerr("freeing leftover tx_rings");
3896		na->nm_krings_delete(na);
3897	}
3898	netmap_pipe_dealloc(na);
3899	if (na->nm_mem)
3900		netmap_mem_put(na->nm_mem);
3901	bzero(na, sizeof(*na));
3902	nm_os_free(na);
3903
3904	return 1;
3905}
3906
3907/* nm_krings_create callback for all hardware native adapters */
3908int
3909netmap_hw_krings_create(struct netmap_adapter *na)
3910{
3911	int ret = netmap_krings_create(na, 0);
3912	if (ret == 0) {
3913		/* initialize the mbq for the sw rx ring */
3914		u_int lim = netmap_real_rings(na, NR_RX), i;
3915		for (i = na->num_rx_rings; i < lim; i++) {
3916			mbq_safe_init(&NMR(na, NR_RX)[i]->rx_queue);
3917		}
3918		nm_prdis("initialized sw rx queue %d", na->num_rx_rings);
3919	}
3920	return ret;
3921}
3922
3923
3924
3925/*
3926 * Called on module unload by the netmap-enabled drivers
3927 */
3928void
3929netmap_detach(struct ifnet *ifp)
3930{
3931	struct netmap_adapter *na = NA(ifp);
3932
3933	if (!na)
3934		return;
3935
3936	NMG_LOCK();
3937	netmap_set_all_rings(na, NM_KR_LOCKED);
3938	/*
3939	 * if the netmap adapter is not native, somebody
	 * changed it, so we cannot release it here.
3941	 * The NAF_ZOMBIE flag will notify the new owner that
3942	 * the driver is gone.
3943	 */
3944	if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
3945		na->na_flags |= NAF_ZOMBIE;
3946	}
3947	/* give active users a chance to notice that NAF_ZOMBIE has been
3948	 * turned on, so that they can stop and return an error to userspace.
3949	 * Note that this becomes a NOP if there are no active users and,
3950	 * therefore, the put() above has deleted the na, since now NA(ifp) is
3951	 * NULL.
3952	 */
3953	netmap_enable_all_rings(ifp);
3954	NMG_UNLOCK();
3955}
3956
3957
3958/*
3959 * Intercept packets from the network stack and pass them
3960 * to netmap as incoming packets on the 'software' ring.
3961 *
3962 * We only store packets in a bounded mbq and then copy them
3963 * in the relevant rxsync routine.
3964 *
3965 * We rely on the OS to make sure that the ifp and na do not go
3966 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3967 * In nm_register() or whenever there is a reinitialization,
3968 * we make sure to make the mode change visible here.
3969 */
3970int
3971netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3972{
3973	struct netmap_adapter *na = NA(ifp);
3974	struct netmap_kring *kring, *tx_kring;
3975	u_int len = MBUF_LEN(m);
3976	u_int error = ENOBUFS;
3977	unsigned int txr;
3978	struct mbq *q;
3979	int busy;
3980	u_int i;
3981
3982	i = MBUF_TXQ(m);
3983	if (i >= na->num_host_rx_rings) {
3984		i = i % na->num_host_rx_rings;
3985	}
3986	kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + i];
3987
3988	// XXX [Linux] we do not need this lock
3989	// if we follow the down/configure/up protocol -gl
3990	// mtx_lock(&na->core_lock);
3991
3992	if (!nm_netmap_on(na)) {
3993		nm_prerr("%s not in netmap mode anymore", na->name);
3994		error = ENXIO;
3995		goto done;
3996	}
3997
3998	txr = MBUF_TXQ(m);
3999	if (txr >= na->num_tx_rings) {
4000		txr %= na->num_tx_rings;
4001	}
4002	tx_kring = NMR(na, NR_TX)[txr];
4003
4004	if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
4005		return MBUF_TRANSMIT(na, ifp, m);
4006	}
4007
4008	q = &kring->rx_queue;
4009
4010	// XXX reconsider long packets if we handle fragments
4011	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
4012		nm_prerr("%s from_host, drop packet size %d > %d", na->name,
4013			len, NETMAP_BUF_SIZE(na));
4014		goto done;
4015	}
4016
4017	if (!netmap_generic_hwcsum) {
4018		if (nm_os_mbuf_has_csum_offld(m)) {
4019			nm_prlim(1, "%s drop mbuf that needs checksum offload", na->name);
4020			goto done;
4021		}
4022	}
4023
4024	if (nm_os_mbuf_has_seg_offld(m)) {
4025		nm_prlim(1, "%s drop mbuf that needs generic segmentation offload", na->name);
4026		goto done;
4027	}
4028
4029#ifdef __FreeBSD__
4030	ETHER_BPF_MTAP(ifp, m);
4031#endif /* __FreeBSD__ */
4032
4033	/* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
4034	 * and maybe other instances of netmap_transmit (the latter
4035	 * not possible on Linux).
4036	 * We enqueue the mbuf only if we are sure there is going to be
4037	 * enough room in the host RX ring, otherwise we drop it.
4038	 */
4039	mbq_lock(q);
4040
4041	busy = kring->nr_hwtail - kring->nr_hwcur;
4042	if (busy < 0)
4043		busy += kring->nkr_num_slots;
4044	if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
4045		nm_prlim(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
4046			kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
4047	} else {
4048		mbq_enqueue(q, m);
4049		nm_prdis(2, "%s %d bufs in queue", na->name, mbq_len(q));
4050		/* notify outside the lock */
4051		m = NULL;
4052		error = 0;
4053	}
4054	mbq_unlock(q);
4055
4056done:
4057	if (m)
4058		m_freem(m);
4059	/* unconditionally wake up listeners */
4060	kring->nm_notify(kring, 0);
4061	/* this is normally netmap_notify(), but for nics
4062	 * connected to a bridge it is netmap_bwrap_intr_notify(),
	 * which possibly forwards the frames through the switch
4064	 */
4065
4066	return (error);
4067}
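
/*
 * For reference, the diversion that makes host-stack packets reach
 * netmap_transmit() is installed at registration time. A simplified
 * FreeBSD sketch (see nm_os_onenter() in netmap_freebsd.c; details may
 * differ across versions):
 *
 *	na->if_transmit = ifp->if_transmit;	// save the driver method
 *	ifp->if_transmit = netmap_transmit;	// divert the host TX path
 *
 * MBUF_TRANSMIT() above then hands the mbuf back to the saved driver
 * method when the selected TX ring is not in netmap mode.
 */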
4068
4069
4070/*
4071 * Reset function to be called by the driver routines when reinitializing
4072 * a hardware ring. The driver is in charge of locking to protect the kring
4073 * while this operation is being performed. This is normally achieved by
4074 * calling netmap_disable_all_rings() before triggering a reset.
4075 * If the kring is not in netmap mode, return NULL to inform the caller
4076 * that this is the case.
4077 * If the kring is in netmap mode, set hwofs so that the netmap indices
 * seen by userspace (head/cur/tail) do not change, although the internal
4079 * NIC indices have been reset to 0.
4080 * In any case, adjust kring->nr_mode.
4081 */
4082struct netmap_slot *
4083netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
4084	u_int new_cur)
4085{
4086	struct netmap_kring *kring;
4087	u_int new_hwtail, new_hwofs;
4088
4089	if (!nm_native_on(na)) {
4090		nm_prdis("interface not in native netmap mode");
4091		return NULL;	/* nothing to reinitialize */
4092	}
4093
4094	if (tx == NR_TX) {
4095		if (n >= na->num_tx_rings)
4096			return NULL;
4097		kring = na->tx_rings[n];
4098		/*
4099		 * Set hwofs to rhead, so that slots[rhead] is mapped to
4100		 * the NIC internal slot 0, and thus the netmap buffer
4101		 * at rhead is the next to be transmitted. Transmissions
4102		 * that were pending before the reset are considered as
4103		 * sent, so that we can have hwcur = rhead. All the slots
4104		 * are now owned by the user, so we can also reinit hwtail.
4105		 */
4106		new_hwofs = kring->rhead;
4107		new_hwtail = nm_prev(kring->rhead, kring->nkr_num_slots - 1);
4108	} else {
4109		if (n >= na->num_rx_rings)
4110			return NULL;
4111		kring = na->rx_rings[n];
4112		/*
4113		 * Set hwofs to hwtail, so that slots[hwtail] is mapped to
4114		 * the NIC internal slot 0, and thus the netmap buffer
4115		 * at hwtail is the next to be given to the NIC.
4116		 * Unread slots (the ones in [rhead,hwtail[) are owned by
4117		 * the user, and thus the caller cannot give them
4118		 * to the NIC right now.
4119		 */
4120		new_hwofs = kring->nr_hwtail;
4121		new_hwtail = kring->nr_hwtail;
4122	}
4123	if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
4124		kring->nr_mode = NKR_NETMAP_OFF;
4125		return NULL;
4126	}
	if (netmap_verbose) {
		nm_prinf("%s, hc %u->%u, ht %u->%u, ho %u->%u", kring->name,
		    kring->nr_hwcur, kring->rhead,
		    kring->nr_hwtail, new_hwtail,
		    kring->nkr_hwofs, new_hwofs);
	}
4133	kring->nr_hwcur = kring->rhead;
4134	kring->nr_hwtail = new_hwtail;
4135	kring->nkr_hwofs = new_hwofs;
4136
4137	/*
	 * Wakeup on the individual and global selwait.
4139	 * We do the wakeup here, but the ring is not yet reconfigured.
4140	 * However, we are under lock so there are no races.
4141	 */
4142	kring->nr_mode = NKR_NETMAP_ON;
4143	kring->nm_notify(kring, 0);
4144	return kring->ring->slot;
4145}
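
/*
 * Illustrative sketch (hypothetical driver; descriptor programming
 * omitted): how a TX ring (re)initialization routine typically uses
 * netmap_reset(). netmap_idx_n2k() and PNMB() are the helpers from
 * netmap_kern.h.
 *
 *	struct netmap_adapter *na = NA(sc->ifp);
 *	struct netmap_slot *slot = netmap_reset(na, NR_TX, ring_nr, 0);
 *	u_int l;
 *
 *	if (slot != NULL) {	// the ring is in netmap mode
 *		for (l = 0; l < sc->num_tx_desc; l++) {
 *			uint64_t paddr;
 *			int si = netmap_idx_n2k(na->tx_rings[ring_nr], l);
 *
 *			(void)PNMB(na, slot + si, &paddr);
 *			// program NIC descriptor 'l' with the buffer at 'paddr'
 *		}
 *	}
 */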
4146
4147
4148/*
4149 * Dispatch rx/tx interrupts to the netmap rings.
4150 *
4151 * "work_done" is non-null on the RX path, NULL for the TX path.
4152 * We rely on the OS to make sure that there is only one active
4153 * instance per queue, and that there is appropriate locking.
4154 *
4155 * The 'notify' routine depends on what the ring is attached to.
4156 * - for a netmap file descriptor, do a selwakeup on the individual
4157 *   waitqueue, plus one on the global one if needed
4158 *   (see netmap_notify)
4159 * - for a nic connected to a switch, call the proper forwarding routine
4160 *   (see netmap_bwrap_intr_notify)
4161 */
4162int
4163netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
4164{
4165	struct netmap_kring *kring;
4166	enum txrx t = (work_done ? NR_RX : NR_TX);
4167
4168	q &= NETMAP_RING_MASK;
4169
4170	if (netmap_debug & (NM_DEBUG_RXINTR|NM_DEBUG_TXINTR)) {
		nm_prlim(5, "received %s queue %d", work_done ? "RX" : "TX", q);
4172	}
4173
4174	if (q >= nma_get_nrings(na, t))
4175		return NM_IRQ_PASS; // not a physical queue
4176
4177	kring = NMR(na, t)[q];
4178
4179	if (kring->nr_mode == NKR_NETMAP_OFF) {
4180		return NM_IRQ_PASS;
4181	}
4182
4183	if (t == NR_RX) {
4184		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
4185		*work_done = 1; /* do not fire napi again */
4186	}
4187
4188	return kring->nm_notify(kring, 0);
4189}
4190
4191
4192/*
4193 * Default functions to handle rx/tx interrupts from a physical device.
4194 * "work_done" is non-null on the RX path, NULL for the TX path.
4195 *
4196 * If the card is not in netmap mode, simply return NM_IRQ_PASS,
4197 * so that the caller proceeds with regular processing.
4198 * Otherwise call netmap_common_irq().
4199 *
4200 * If the card is connected to a netmap file descriptor,
4201 * do a selwakeup on the individual queue, plus one on the global one
4202 * if needed (multiqueue card _and_ there are multiqueue listeners),
 * and return NM_IRQ_COMPLETED.
4204 *
4205 * Finally, if called on rx from an interface connected to a switch,
4206 * calls the proper forwarding routine.
4207 */
4208int
4209netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
4210{
4211	struct netmap_adapter *na = NA(ifp);
4212
4213	/*
4214	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
4215	 * we still use the regular driver even though the previous
4216	 * check fails. It is unclear whether we should use
4217	 * nm_native_on() here.
4218	 */
4219	if (!nm_netmap_on(na))
4220		return NM_IRQ_PASS;
4221
4222	if (na->na_flags & NAF_SKIP_INTR) {
4223		nm_prdis("use regular interrupt");
4224		return NM_IRQ_PASS;
4225	}
4226
4227	return netmap_common_irq(na, q, work_done);
4228}
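
/*
 * Illustrative sketch (hypothetical driver "foo"): an RX interrupt or
 * taskqueue handler giving netmap the first chance to service the queue,
 * and falling back to the regular path otherwise.
 *
 *	u_int work_done = 0;
 *
 *	if (netmap_rx_irq(sc->ifp, rxr->me, &work_done) != NM_IRQ_PASS)
 *		return;			// netmap listeners have been notified
 *	foo_rxeof(rxr);			// regular (non-netmap) RX processing
 */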
4229
4230/* set/clear native flags and if_transmit/netdev_ops */
4231void
4232nm_set_native_flags(struct netmap_adapter *na)
4233{
4234	struct ifnet *ifp = na->ifp;
4235
4236	/* We do the setup for intercepting packets only if we are the
4237	 * first user of this adapter. */
4238	if (na->active_fds > 0) {
4239		return;
4240	}
4241
4242	na->na_flags |= NAF_NETMAP_ON;
4243	nm_os_onenter(ifp);
4244	nm_update_hostrings_mode(na);
4245}
4246
4247void
4248nm_clear_native_flags(struct netmap_adapter *na)
4249{
4250	struct ifnet *ifp = na->ifp;
4251
4252	/* We undo the setup for intercepting packets only if we are the
4253	 * last user of this adapter. */
4254	if (na->active_fds > 0) {
4255		return;
4256	}
4257
4258	nm_update_hostrings_mode(na);
4259	nm_os_onexit(ifp);
4260
4261	na->na_flags &= ~NAF_NETMAP_ON;
4262}
4263
4264void
4265netmap_krings_mode_commit(struct netmap_adapter *na, int onoff)
4266{
4267	enum txrx t;
4268
4269	for_rx_tx(t) {
4270		int i;
4271
4272		for (i = 0; i < netmap_real_rings(na, t); i++) {
4273			struct netmap_kring *kring = NMR(na, t)[i];
4274
4275			if (onoff && nm_kring_pending_on(kring))
4276				kring->nr_mode = NKR_NETMAP_ON;
4277			else if (!onoff && nm_kring_pending_off(kring))
4278				kring->nr_mode = NKR_NETMAP_OFF;
4279		}
4280	}
4281}
4282
4283/*
4284 * Module loader and unloader
4285 *
4286 * netmap_init() creates the /dev/netmap device and initializes
4287 * all global variables. Returns 0 on success, errno on failure
 * (though failure is not expected in practice).
4289 *
4290 * netmap_fini() destroys everything.
4291 */
4292
4293static struct cdev *netmap_dev; /* /dev/netmap character device. */
4294extern struct cdevsw netmap_cdevsw;
4295
4296
4297void
4298netmap_fini(void)
4299{
4300	if (netmap_dev)
4301		destroy_dev(netmap_dev);
4302	/* we assume that there are no longer netmap users */
4303	nm_os_ifnet_fini();
4304	netmap_uninit_bridges();
4305	netmap_mem_fini();
4306	NMG_LOCK_DESTROY();
4307	nm_prinf("netmap: unloaded module.");
4308}
4309
4310
4311int
4312netmap_init(void)
4313{
4314	int error;
4315
4316	NMG_LOCK_INIT();
4317
4318	error = netmap_mem_init();
4319	if (error != 0)
4320		goto fail;
4321	/*
4322	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
4323	 * when the module is compiled in.
4324	 * XXX could use make_dev_credv() to get error number
4325	 */
4326	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
4327		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
4328			      "netmap");
4329	if (!netmap_dev)
4330		goto fail;
4331
4332	error = netmap_init_bridges();
4333	if (error)
4334		goto fail;
4335
4336#ifdef __FreeBSD__
4337	nm_os_vi_init_index();
4338#endif
4339
4340	error = nm_os_ifnet_init();
4341	if (error)
4342		goto fail;
4343
4344	nm_prinf("netmap: loaded module");
4345	return (0);
4346fail:
4347	netmap_fini();
4348	return (EINVAL); /* may be incorrect */
4349}
4350