netmap_vale.c revision 267164
/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
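
/*
 * Illustrative sketch of the reserve/copy/publish pattern described
 * above, condensed from nm_bdg_flush() below (a summary, not a
 * compiled code path; names are those of the actual function):
 *
 *	mtx_lock(&kring->q_lock);
 *	my_start = j = kring->nkr_hwlease;	// first reserved slot
 *	howmany = nm_kr_space(kring, 1);	// slots we may claim
 *	lease_idx = nm_kr_lease(kring, howmany, 1);
 *	mtx_unlock(&kring->q_lock);
 *
 *	... copy packets into slots j .. j+howmany-1, lock not held ...
 *
 *	mtx_lock(&kring->q_lock);
 *	kring->nkr_leases[lease_idx] = j;	// report completion
 *	// if all earlier leases are complete, advance nr_hwtail
 *	mtx_unlock(&kring->q_lock);
 */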

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD: head/sys/dev/netmap/netmap_vale.c 267164 2014-06-06 14:57:40Z luigi $");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct rwlock

#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_NAME	prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES	max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue locks instead of the core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
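
/*
 * Example usage from userspace (assuming the vale-ctl and pkt-gen tools
 * shipped with the netmap distribution; names are illustrative):
 *
 *	vale-ctl -a vale0:em0	# attach NIC em0 to switch vale0
 *	vale-ctl -h vale0:em0	# attach em0 together with its host stack
 *	vale-ctl -d vale0:em0	# detach em0 from vale0
 *	pkt-gen -i vale0:p1 -f tx	# open (and create) virtual port p1
 */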
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
#define	NM_BRIDGES		8	/* number of bridges */


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 */
int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");
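
/*
 * For instance, the batch size can be tuned at runtime with
 * (the value is only an example):
 *
 *	sysctl dev.netmap.bridge_batch=512
 */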


static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp);
static int bdg_netmap_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_attach(struct ifnet *, struct ifnet *);
static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
int kern_netmap_regif(struct nmreq *nmr);

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};
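
/*
 * bq_head/bq_tail are indices into the ft[] array of packet
 * descriptors: each queue is a singly linked list chained through the
 * ft_next field, built in the first pass of nm_bdg_flush() below, and
 * NM_FT_NULL marks an empty queue.
 */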

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size, an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT
	 * not to forward this packet.  ring_nr is the source ring index,
	 * and the function may overwrite this value to forward this
	 * packet to a different ring index.
	 * This function must be set by netmap_bdg_ctl().
	 */
	bdg_lookup_fn_t nm_bdg_lookup;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];
};
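
/*
 * Minimal sketch of a custom lookup function (illustrative only; a
 * kernel module would install it with netmap_bdg_ctl() and the
 * NETMAP_BDG_LOOKUP_REG command, handled below):
 *
 *	static u_int
 *	my_flood_lookup(char *buf, u_int buf_len, uint8_t *dst_ring,
 *		struct netmap_vp_adapter *na)
 *	{
 *		(void)buf; (void)buf_len; (void)na;
 *		*dst_ring = 0;
 *		return NM_BDG_BROADCAST;	// flood every packet
 *	}
 */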


/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge nm_bridges[NM_BRIDGES];


/*
 * this is a slightly optimized copy routine which rounds
 * to a multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
        uint64_t *src = _src;
        uint64_t *dst = _dst;
        if (unlikely(l >= 1024)) {
                memcpy(dst, src, l);
                return;
        }
        for (; likely(l > 0); l-=64) {
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
        }
}
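
/*
 * Note for callers: lengths are rounded up before calling pkt_copy(),
 * as nm_bdg_flush() does with
 *
 *	copy_len = (copy_len + 63) & ~63;
 */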


/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
 * We assume that this is called with a name of at least NM_NAME chars.
 */
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, l, namelen;
	struct nm_bridge *b = NULL;

	NMG_LOCK_ASSERT();

	namelen = strlen(NM_NAME);	/* base length */
	l = name ? strlen(name) : 0;		/* actual length */
	if (l < namelen) {
		D("invalid bridge name %s", name ? name : "(null)");
		return NULL;
	}
	for (i = namelen + 1; i < l; i++) {
		if (name[i] == ':') {
			namelen = i;
			break;
		}
	}
	if (namelen >= IFNAMSIZ)
		namelen = IFNAMSIZ;
	ND("--- prefix is '%.*s' ---", namelen, name);

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < NM_BRIDGES; i++) {
		struct nm_bridge *x = nm_bridges + i;

		if (x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
		/* initialize the bridge */
		strncpy(b->bdg_basename, name, namelen);
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->nm_bdg_lookup = netmap_bdg_learning;
		/* reset the MAC address table */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
	}
	return b;
}


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i].nkr_ft) {
			free(kring[i].nkr_ft, M_DEVBUF);
			kring[i].nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

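	/*
	 * Layout of each per-ring allocation (the same offsets are
	 * recomputed by the reader in nm_bdg_flush()):
	 *
	 *	ft[NM_BDG_BATCH_MAX]	packet descriptors (work area)
	 *	dstq[num_dstq]		one queue per port:ring + broadcast
	 *	dsts[NM_BDG_BATCH_MAX]	indices of the active destinations
	 */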
	nrings = netmap_real_tx_rings(na);
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i].nkr_ft = ft;
	}
	return 0;
}


static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	New algorithm:
	make a copy of bdg_port_index;
	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	in the array of bdg_port_index, replacing them with
	entries from the bottom of the array;
	decrement bdg_active_ports;
	acquire BDG_WLOCK() and copy back the array.
	 */
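
	/*
	 * Worked example (illustrative numbers): lim = 4,
	 * tmp = { 5, 2, 7, 4 }, detaching hw = 2: the entry is found at
	 * position 1, lim drops to 3 and the array becomes
	 * { 5, 4, 7, | 2 }, with port 2 now in the inactive tail.
	 */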

	if (netmap_verbose)
		D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--; /* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		b->nm_bdg_lookup = NULL;
	}
}


static void
netmap_adapter_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;
	struct ifnet *ifp = na->ifp;

	ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}

	bzero(ifp, sizeof(*ifp));
	free(ifp, M_DEVBUF);
	na->ifp = NULL;
}


/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a VALE prefix
 * mismatch. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	const char *name = nmr->nr_name;
	struct ifnet *ifp;
	int error = 0;
	struct netmap_adapter *ret;
	struct netmap_vp_adapter *vpna;
	struct nm_bridge *b;
	int i, j, cand = -1, cand2 = -1;
	int needed;

	*na = NULL;     /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
		return 0;  /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(name, create);
	if (b == NULL) {
		D("no bridges available for '%s'", name);
		return (create ? ENOMEM : ENXIO);
	}

	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		// KASSERT(na != NULL);
		ifp = vpna->up.ifp;
		/* XXX make sure the name only contains one : */
		if (!strcmp(NM_IFPNAME(ifp), name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", name,
				vpna->na_bdg_refcount);
			*na = (struct netmap_adapter *)vpna;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return ENOMEM;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, name, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifp = ifunit_ref(name + b->bdg_namelen + 1);
	if (!ifp) { /* this is a virtual port */
		if (nmr->nr_cmd) {
			/* nr_cmd must be 0 for a virtual port */
			return EINVAL;
		}

		/* create a struct ifnet for the new port.
		 * need M_NOWAIT as we are under nma_lock
		 */
		ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ifp)
			return ENOMEM;

		strcpy(ifp->if_xname, name);
		/* bdg_netmap_attach creates a struct netmap_adapter */
		error = bdg_netmap_attach(nmr, ifp);
		if (error) {
			D("error %d", error);
			free(ifp, M_DEVBUF);
			return error;
		}
		ret = NA(ifp);
		cand2 = -1;	/* only need one port */
	} else {  /* this is a NIC */
		struct ifnet *fake_ifp;

		error = netmap_get_hw_na(ifp, &ret);
		if (error || ret == NULL)
			goto out;

		/* make sure the NIC is not already in use */
		if (NETMAP_OWNED_BY_ANY(ret)) {
			D("NIC %s busy, cannot attach to bridge",
				NM_IFPNAME(ifp));
			error = EBUSY;
			goto out;
		}
		/* create a fake interface */
		fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!fake_ifp) {
			error = ENOMEM;
			goto out;
		}
		strcpy(fake_ifp->if_xname, name);
		error = netmap_bwrap_attach(fake_ifp, ifp);
		if (error) {
			free(fake_ifp, M_DEVBUF);
			goto out;
		}
		ret = NA(fake_ifp);
		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
			cand2 = -1; /* only need one port */
		if_rele(ifp);
	}
	vpna = (struct netmap_vp_adapter *)ret;

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC  %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (cand2 >= 0) {
		struct netmap_vp_adapter *hostna = vpna + 1;
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", name, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = ret;
	netmap_adapter_get(ret);
	return 0;

out:
	if_rele(ifp);

	return error;
}


/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
static int
nm_bdg_attach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	struct netmap_if *nifp;
	struct netmap_priv_d *npriv;
	struct netmap_bwrap_adapter *bna;
	int error;

	npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
	if (npriv == NULL)
		return ENOMEM;

	NMG_LOCK();

	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
	if (error) /* no device, or another bridge or user owns the device */
		goto unlock_exit;

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (na->active_fds > 0) { /* already registered */
		error = EBUSY;
		goto unref_exit;
	}

	nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error);
	if (!nifp) {
		goto unref_exit;
	}

	bna = (struct netmap_bwrap_adapter*)na;
	bna->na_kpriv = npriv;
	NMG_UNLOCK();
	ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp));
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	bzero(npriv, sizeof(*npriv));
	free(npriv, M_DEVBUF);
	return error;
}


static int
nm_bdg_detach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;
	struct netmap_bwrap_adapter *bna;
	int last_instance;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	bna = (struct netmap_bwrap_adapter *)na;

	if (na->active_fds == 0) { /* not registered */
		error = EINVAL;
		goto unref_exit;
	}

	last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */
	if (!last_instance) {
		D("--- error, trying to detach an entry with active mmaps");
		error = EINVAL;
	} else {
		struct netmap_priv_d *npriv = bna->na_kpriv;

		bna->na_kpriv = NULL;
		D("deleting priv");

		bzero(npriv, sizeof(*npriv));
		free(npriv, M_DEVBUF);
	}

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}


/* exported to kernel callers, e.g. OVS ?
 * Entry point.
 * Called without NMG_LOCK.
 */
int
netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
{
	struct nm_bridge *b;
	struct netmap_adapter *na;
	struct netmap_vp_adapter *vpna;
	struct ifnet *iter;
	char *name = nmr->nr_name;
	int cmd = nmr->nr_cmd, namelen = strlen(name);
	int error = 0, i, j;

	switch (cmd) {
	case NETMAP_BDG_ATTACH:
		error = nm_bdg_attach(nmr);
		break;

	case NETMAP_BDG_DETACH:
		error = nm_bdg_detach(nmr);
		break;

	case NETMAP_BDG_LIST:
		/* this is used to enumerate bridges and ports */
		if (namelen) { /* look up indexes of bridge and port */
			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
				error = EINVAL;
				break;
			}
			NMG_LOCK();
			b = nm_find_bridge(name, 0 /* don't create */);
			if (!b) {
				error = ENOENT;
				NMG_UNLOCK();
				break;
			}

			error = ENOENT;
			for (j = 0; j < b->bdg_active_ports; j++) {
				i = b->bdg_port_index[j];
				vpna = b->bdg_ports[i];
				if (vpna == NULL) {
					D("---AAAAAAAAARGH-------");
					continue;
				}
				iter = vpna->up.ifp;
				/* the former and the latter identify a
				 * virtual port and a NIC, respectively
				 */
				if (!strcmp(iter->if_xname, name)) {
					/* bridge index */
					nmr->nr_arg1 = b - nm_bridges;
					nmr->nr_arg2 = i; /* port index */
					error = 0;
					break;
				}
			}
			NMG_UNLOCK();
		} else {
			/* return the first non-empty entry starting from
			 * bridge nr_arg1 and port nr_arg2.
			 *
			 * Users can detect the end of the same bridge by
			 * seeing the new and old value of nr_arg1, and can
			 * detect the end of all the bridges by error != 0
			 */
			i = nmr->nr_arg1;
			j = nmr->nr_arg2;

			NMG_LOCK();
			for (error = ENOENT; i < NM_BRIDGES; i++) {
				b = nm_bridges + i;
				if (j >= b->bdg_active_ports) {
					j = 0; /* following bridges scan from 0 */
					continue;
				}
				nmr->nr_arg1 = i;
				nmr->nr_arg2 = j;
				j = b->bdg_port_index[j];
				vpna = b->bdg_ports[j];
				iter = vpna->up.ifp;
				strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
				error = 0;
				break;
			}
			NMG_UNLOCK();
		}
		break;

	case NETMAP_BDG_LOOKUP_REG:
		/* register a lookup function to the given bridge.
		 * nmr->nr_name may be just the bridge's name (including ':'
		 * if it is not just NM_NAME).
		 */
		if (!func) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		b = nm_find_bridge(name, 0 /* don't create */);
		if (!b) {
			error = EINVAL;
		} else {
			b->nm_bdg_lookup = func;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_VNET_HDR:
		/* Valid lengths for the virtio-net header are 0 (no header),
		   10 and 12. */
		if (nmr->nr_arg1 != 0 &&
			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
				nmr->nr_arg1 != 12) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, 0);
		if (na && !error) {
			vpna = (struct netmap_vp_adapter *)na;
			vpna->virt_hdr_len = nmr->nr_arg1;
			if (vpna->virt_hdr_len)
				vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem);
			D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
			netmap_adapter_put(na);
		}
		NMG_UNLOCK();
		break;

	default:
		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
		error = EINVAL;
		break;
	}
	return error;
}

static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int tailroom;
	int error, i;
	uint32_t *leases;
	u_int nrx = netmap_real_rx_rings(na);

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i].nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * Grab packets from a kring, move them into the ft structure
 * associated to the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
	struct netmap_kring *kring, u_int end)
{
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return 0;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;

		ND("flags is 0x%x", slot->flags);
		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
		if (unlikely(buf == NULL)) {
			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
				kring->name, j, ft[ft_i].ft_len);
			buf = ft[ft_i].ft_buf = NMB_VA(0); /* the 'null' buffer */
			ft[ft_i].ft_len = 0;
			ft[ft_i].ft_flags = 0;
		}
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			RD(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
		// ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags - 1;
	}
	if (ft_i)
		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)                                                    \
do {                                                                    \
        a -= b; a -= c; a ^= (c >> 13);                                 \
        b -= c; b -= a; b ^= (a << 8);                                  \
        c -= a; c -= b; c ^= (b >> 13);                                 \
        a -= b; a -= c; a ^= (c >> 12);                                 \
        b -= c; b -= a; b ^= (a << 16);                                 \
        c -= a; c -= b; c ^= (b >> 5);                                  \
        a -= b; a -= c; a ^= (c >> 3);                                  \
        b -= c; b -= a; b ^= (a << 10);                                 \
        c -= a; c -= b; c ^= (b >> 15);                                 \
} while (/*CONSTCOND*/0)


static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
        uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

        b += addr[5] << 8;
        b += addr[4];
        a += addr[3] << 24;
        a += addr[2] << 16;
        a += addr[1] << 8;
        a += addr[0];

        mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
        return (c & BRIDGE_RTHASH_MASK);
}

#undef mix


static int
bdg_netmap_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_vp_adapter *vpna =
		(struct netmap_vp_adapter*)na;
	struct ifnet *ifp = na->ifp;

	/* the interface is already attached to the bridge,
	 * so we only need to toggle IFCAP_NETMAP.
	 */
	BDG_WLOCK(vpna->na_bdg);
	if (onoff) {
		ifp->if_capenable |= IFCAP_NETMAP;
	} else {
		ifp->if_capenable &= ~IFCAP_NETMAP;
	}
	BDG_WUNLOCK(vpna->na_bdg);
	return 0;
}


/*
 * Lookup function for a learning bridge.
 * Update the hash table with the source address,
 * and then return the destination port index, and the
 * ring in *dst_ring (at the moment we always use ring 0).
 */
u_int
netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
		struct netmap_vp_adapter *na)
{
	struct nm_hash_ent *ht = na->na_bdg->ht;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;

	if (buf_len < 14) {
		RD(5, "invalid buf length %d", buf_len);
		return NM_BDG_NOPORT;
	}
	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive, there might be some
	 * worthwhile optimizations here.
	 */
	if ((buf[6] & 1) == 0) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); // XXX hash of source
		/* update source port forwarding entry */
		ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); // XXX hash of dst
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
		/* XXX otherwise return NM_BDG_UNKNOWN ? */
	}
	*dst_ring = 0;
	return dst;
}


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}
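
/*
 * Worked example (illustrative numbers): with nkr_num_slots = 1024,
 * nr_hwcur = 10 and nkr_hwlease = 1000, busy = 990 and the available
 * space is 1024 - 1 - 990 = 33 slots.
 */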


/* make a lease on the kring for N positions. return the
 * lease index
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

	if (n > nm_kr_space(k, is_rx)) {
		D("invalid request for %d slots", n);
		panic("x");
	}
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->ifp->if_xname,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
	return lease_idx;
}

/*
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
		u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, j, me = na->bdg_port;

	/*
	 * The work area (pointed to by ft) is followed by an array of
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;
		uint8_t *buf = ft[i].ft_buf;
		u_int len = ft[i].ft_len;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		/* Drop the packet if the virtio-net header is neither in the
		   first fragment nor at the very beginning of the second. */
		if (unlikely(na->virt_hdr_len > len))
			continue;
		if (len == na->virt_hdr_len) {
			buf = ft[i+1].ft_buf;
			len = ft[i+1].ft_len;
		} else {
			buf += na->virt_hdr_len;
			len -= na->virt_hdr_len;
		}
		dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port == NM_BDG_NOPORT)
			continue; /* this packet is to be dropped */
		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
			continue;
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations (XXX will be modular somehow) */
	for (i = 0; i < num_dsts; i++) {
		struct ifnet *dst_ifp;
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		dst_ifp = dst_na->up.ifp;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a chance
		 * that we may not use all of the slots we have claimed, and
		 * we will need to handle the leftover ones when we regain
		 * the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
			RD(3, "virt_hdr_mismatch, src %d len %d", na->virt_hdr_len, dst_na->virt_hdr_len);
			/* There is a virtio-net header/offloadings mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * na->mfs
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				needed = (needed * na->mfs) /
						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}

		ND(5, "pass 2 dst %d is %x %s",
			i, d_i, is_vp ? "virtual" : "nic/host");
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = &dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		lim = kring->nkr_num_slots - 1;

retry:

		if (dst_na->retry && retry) {
			/* try to get some free slot from the previous run */
			dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
		}
		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;

			/* find the queue from which we pick the next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
			    break; /* no more space */
			if (netmap_verbose && cnt > 1)
				RD(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			if (unlikely(virt_hdr_mismatch)) {
				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
			} else {
				howmany -= cnt;
				do {
					char *dst, *src = ft_p->ft_buf;
					size_t copy_len = ft_p->ft_len, dst_len = copy_len;

					slot = &ring->slot[j];
					dst = BDG_NMB(&dst_na->up, slot);

					ND("send [%d] %d(%d) bytes at %s:%d",
							i, (int)copy_len, (int)dst_len,
							NM_IFPNAME(dst_ifp), j);
					/* round to a multiple of 64 */
					copy_len = (copy_len + 63) & ~63;

					if (unlikely(copy_len > NETMAP_BUF_SIZE)) {
						RD(5, "invalid len %d, down to 64", (int)copy_len);
						copy_len = dst_len = 64; // XXX
					}
					if (ft_p->ft_flags & NS_INDIRECT) {
						if (copyin(src, dst, copy_len)) {
							// invalid user pointer, pretend len is 0
							dst_len = 0;
						}
					} else {
						//memcpy(dst, src, copy_len);
						pkt_copy(src, dst, (int)copy_len);
					}
					slot->len = dst_len;
					slot->flags = (cnt << 8) | NS_MOREFRAG;
					j = nm_next(j, lim);
					needed--;
					ft_p++;
				} while (ft_p != ft_end);
				slot->flags = (cnt << 8); /* clear flag on last entry */
			}
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
		    /* current position */
		    uint32_t *p = kring->nkr_leases; /* shorthand */
		    uint32_t update_pos;
		    int still_locked = 1;

		    mtx_lock(&kring->q_lock);
		    if (unlikely(howmany > 0)) {
			/* we have not used all the bufs. If I am the last
			 * one I can recover the slots, otherwise I must
			 * fill them with 0 to mark empty packets.
			 */
			ND("leftover %d bufs", howmany);
			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
			    /* yes i am the last one */
			    ND("roll back nkr_hwlease to %d", j);
			    kring->nkr_hwlease = j;
			} else {
			    while (howmany-- > 0) {
				ring->slot[j].len = 0;
				ring->slot[j].flags = 0;
				j = nm_next(j, lim);
			    }
			}
		    }
		    p[lease_idx] = j; /* report I am done */

		    update_pos = kring->nr_hwtail;

		    if (my_start == update_pos) {
			/* all slots before my_start have been reported,
			 * so scan subsequent leases to see if other ranges
			 * have been completed, and do a selwakeup or txsync.
		         */
			while (lease_idx != kring->nkr_lease_idx &&
				p[lease_idx] != NR_NOSLOT) {
			    j = p[lease_idx];
			    p[lease_idx] = NR_NOSLOT;
			    lease_idx = nm_next(lease_idx, lim);
			}
			/* j is the new 'write' position. j != my_start
			 * means there are new buffers to report
			 */
			if (likely(j != my_start)) {
				kring->nr_hwtail = j;
				still_locked = 0;
				mtx_unlock(&kring->q_lock);
				dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
				if (dst_na->retry && retry--)
					goto retry;
			}
		    }
		    if (still_locked)
			mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}


static int
netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const cur = kring->rcur;

	if (bridge_batch <= 0) { /* testing only */
		done = cur; // used all
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_bdg_preflush(na, ring_nr, kring, cur);
done:
	if (done != cur)
		D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'cur' are left unsent.
	 */
	kring->nr_hwcur = done;
	kring->nr_hwtail = nm_prev(done, lim);
	nm_txsync_finalize(kring);
	if (netmap_verbose)
		D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
	return 0;
}


/*
 * main dispatch routine for the bridge.
 * We already know that only one thread is running this.
 * We must run nm_bdg_preflush without lock.
 */
static int
bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	return netmap_vp_txsync(vpna, ring_nr, flags);
}

static int
netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, lim = kring->nkr_num_slots - 1;
	u_int head = nm_rxsync_prologue(kring);
	int n;

	if (head > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* First part, import newly received packets. */
	/* actually nothing to do here, they are already in the kring */

	/* Second part, skip past packets that userspace has released. */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		/* consistency check, but nothing really important here */
		for (n = 0; likely(nm_i != head); n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			void *addr = BDG_NMB(na, slot);

			if (addr == netmap_buffer_base) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwcur = head;
	}

	/* tell userspace that there are new packets */
	nm_rxsync_finalize(kring);
	n = 0;
done:
	return n;
}

/*
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	int n;

	mtx_lock(&kring->q_lock);
	n = netmap_vp_rxsync(na, ring_nr, flags);
	mtx_unlock(&kring->q_lock);
	return n;
}


static int
bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
{
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	int error;
	u_int npipes = 0;

	vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (vpna == NULL)
		return ENOMEM;

	na = &vpna->up;

	na->ifp = ifp;

	/* bound checking */
	na->num_tx_rings = nmr->nr_tx_rings;
	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_tx_rings = na->num_tx_rings; // write back
	na->num_rx_rings = nmr->nr_rx_rings;
	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_rx_rings = na->num_rx_rings; // write back
	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	na->num_tx_desc = nmr->nr_tx_slots;
	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	/* validate number of pipes. We want at least 1,
	 * but probably can do with some more.
	 * So let's use 2 as default (when 0 is supplied)
	 */
	npipes = nmr->nr_arg1;
	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
	nmr->nr_arg1 = npipes;	/* write back */
	/* validate extra bufs */
	nm_bound_var(&nmr->nr_arg3, 0, 0,
			128*NM_BDG_MAXSLOTS, NULL);
	na->num_rx_desc = nmr->nr_rx_slots;
	vpna->virt_hdr_len = 0;
	vpna->mfs = 1514;
	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
		vpna->mfs = netmap_buf_size; */
	if (netmap_verbose)
		D("max frame size %u", vpna->mfs);

	na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
	na->nm_txsync = bdg_netmap_txsync;
	na->nm_rxsync = bdg_netmap_rxsync;
	na->nm_register = bdg_netmap_reg;
	na->nm_dtor = netmap_adapter_vp_dtor;
	na->nm_krings_create = netmap_vp_krings_create;
	na->nm_krings_delete = netmap_vp_krings_delete;
	na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp),
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc,
			nmr->nr_arg3, npipes, &error);
	if (na->nm_mem == NULL)
		goto err;
	/* other nmd fields are set in the common routine */
	error = netmap_attach_common(na);
	if (error)
		goto err;
	return 0;

err:
	if (na->nm_mem != NULL)
		netmap_mem_private_delete(na->nm_mem);
	free(vpna, M_DEVBUF);
	return error;
}


static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct nm_bridge *b = bna->up.na_bdg,
		*bh = bna->host.na_bdg;
	struct ifnet *ifp = na->ifp;

	ND("na %p", na);

	if (b) {
		netmap_bdg_detach_common(b, bna->up.bdg_port,
			(bh ? bna->host.bdg_port : -1));
	}

	hwna->na_private = NULL;
	netmap_adapter_put(hwna);

	bzero(ifp, sizeof(*ifp));
	free(ifp, M_DEVBUF);
	na->ifp = NULL;
}


/*
 * Intr callback for NICs connected to a bridge.
 * Simply ignore tx interrupts (maybe we could try to recover space ?)
 * and pass received packets from the nic to the bridge.
 *
 * XXX TODO check locking: this is called from the interrupt
 * handler so we should make sure that the interface is not
 * disconnected while passing down an interrupt.
 *
 * Note, no user process can access this NIC or the host stack.
 * The only significant part of the ring is the slots,
 * and head/cur/tail are set from the kring as needed
 * (part as a receive ring, part as a transmit ring).
 *
 * callback that overwrites the hwna notify callback.
 * Packets come from the outside or from the host stack and are put on an hwna rx ring.
 * The bridge wrapper then sends the packets through the bridge.
 */
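
/*
 * Flow sketch of the function below (a summary, not additional logic):
 *	1. overwrite ring head/cur/tail with the rx kring state;
 *	2. kring->nm_sync() to fetch the packets that just arrived;
 *	3. advance head/cur to tail and call netmap_vp_txsync() to
 *	   push everything into the switch;
 *	4. a final nm_sync() (or an hwcur update for the host ring)
 *	   releases the buffers.
 */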
static int
netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
{
	struct ifnet *ifp = na->ifp;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_vp_adapter *hostna = &bna->host;
	struct netmap_kring *kring, *bkring;
	struct netmap_ring *ring;
	int is_host_ring = ring_nr == na->num_rx_rings;
	struct netmap_vp_adapter *vpna = &bna->up;
	int error = 0;

	if (netmap_verbose)
	    D("%s %s%d 0x%x", NM_IFPNAME(ifp),
		(tx == NR_TX ? "TX" : "RX"), ring_nr, flags);

	if (flags & NAF_DISABLE_NOTIFY) {
		kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
		bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
		if (kring[ring_nr].nkr_stopped)
			netmap_disable_ring(&bkring[ring_nr]);
		else
			bkring[ring_nr].nkr_stopped = 0;
		return 0;
	}

	if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
		return 0;

	/* we only care about receive interrupts */
	if (tx == NR_TX)
		return 0;

	kring = &na->rx_rings[ring_nr];
	ring = kring->ring;

	/* make sure the ring is not disabled */
	if (nm_kr_tryget(kring))
		return 0;

	if (is_host_ring && hostna->na_bdg == NULL) {
		error = bna->save_notify(na, ring_nr, tx, flags);
		goto put_out;
	}

	/* Here we expect ring->head = ring->cur = ring->tail
	 * because everything has been released from the previous round.
	 * However the ring is shared and we might have info from
	 * the wrong side (the tx ring). Hence we overwrite with
	 * the info from the rx kring.
	 */
	if (netmap_verbose)
	    D("%s head %d cur %d tail %d (kring %d %d %d)",  NM_IFPNAME(ifp),
		ring->head, ring->cur, ring->tail,
		kring->rhead, kring->rcur, kring->rtail);

	ring->head = kring->rhead;
	ring->cur = kring->rcur;
	ring->tail = kring->rtail;

	if (is_host_ring) {
		vpna = hostna;
		ring_nr = 0;
	}
	/* simulate a user wakeup on the rx ring */
	/* fetch packets that have arrived.
	 * XXX maybe do this in a loop ?
	 */
	error = kring->nm_sync(kring, 0);
	if (error)
		goto put_out;
	if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
		D("how strange, interrupt with no packets on %s",
			NM_IFPNAME(ifp));
		goto put_out;
	}

	/* new packets are ring->cur to ring->tail, and the bkring
	 * had hwcur == ring->cur. So advance ring->cur to ring->tail
	 * to push all packets out.
	 */
	ring->head = ring->cur = ring->tail;

	/* also set tail to what the bwrap expects */
	bkring = &vpna->up.tx_rings[ring_nr];
	ring->tail = bkring->nr_hwtail; // rtail too ?

	/* pass packets to the switch */
	nm_txsync_prologue(bkring); // XXX error checking ?
	netmap_vp_txsync(vpna, ring_nr, flags);

	/* mark all buffers as released on this ring */
	ring->head = ring->cur = kring->nr_hwtail;
	ring->tail = kring->rtail;
	/* another call to actually release the buffers */
	if (!is_host_ring) {
		error = kring->nm_sync(kring, 0);
	} else {
		/* mark all packets as released, as in the
		 * second part of netmap_rxsync_from_host()
		 */
		kring->nr_hwcur = kring->nr_hwtail;
		nm_rxsync_finalize(kring);
	}

put_out:
	nm_kr_put(kring);
	return error;
}
1846
1847
1848static int
1849netmap_bwrap_register(struct netmap_adapter *na, int onoff)
1850{
1851	struct netmap_bwrap_adapter *bna =
1852		(struct netmap_bwrap_adapter *)na;
1853	struct netmap_adapter *hwna = bna->hwna;
1854	struct netmap_vp_adapter *hostna = &bna->host;
1855	int error;
1856
1857	ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off");
1858
1859	if (onoff) {
1860		int i;
1861
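		/* the hw adapter and the host port share the buffer
		 * lookup table of the bwrap, so that slot buffer
		 * indices resolve to the same addresses everywhere.
		 */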
1862		hwna->na_lut = na->na_lut;
1863		hwna->na_lut_objtotal = na->na_lut_objtotal;
1864
1865		if (hostna->na_bdg) {
1866			hostna->up.na_lut = na->na_lut;
1867			hostna->up.na_lut_objtotal = na->na_lut_objtotal;
1868		}
1869
1870		/* cross-link the netmap rings
1871		 * The original number of rings comes from hwna,
1872		 * rx rings on one side equals tx rings on the other.
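		 * The loops run to num_rings + 1 because the krings
		 * arrays carry one extra entry for the host ring,
		 * which must be cross-linked as well.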
1873		 */
1874		for (i = 0; i < na->num_rx_rings + 1; i++) {
1875			hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
1876			hwna->tx_rings[i].ring = na->rx_rings[i].ring;
1877		}
1878		for (i = 0; i < na->num_tx_rings + 1; i++) {
1879			hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
1880			hwna->rx_rings[i].ring = na->tx_rings[i].ring;
1881		}
1882	}
1883
1884	if (hwna->ifp) {
1885		error = hwna->nm_register(hwna, onoff);
1886		if (error)
1887			return error;
1888	}
1889
1890	bdg_netmap_reg(na, onoff);
1891
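	/* while attached to the bridge, hw interrupts are intercepted
	 * by netmap_bwrap_intr_notify(); the original callback is
	 * saved here and restored on unregister.
	 */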
1892	if (onoff) {
1893		bna->save_notify = hwna->nm_notify;
1894		hwna->nm_notify = netmap_bwrap_intr_notify;
1895	} else {
1896		hwna->nm_notify = bna->save_notify;
1897		hwna->na_lut = NULL;
1898		hwna->na_lut_objtotal = 0;
1899	}
1900
1901	return 0;
1902}
1903
1904
1905static int
1906netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
1907				    u_int *rxr, u_int *rxd)
1908{
1909	struct netmap_bwrap_adapter *bna =
1910		(struct netmap_bwrap_adapter *)na;
1911	struct netmap_adapter *hwna = bna->hwna;
1912
1913	/* forward the request */
1914	netmap_update_config(hwna);
1915	/* swap the results */
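	/* the bwrap faces the bridge, so what the bridge transmits the
	 * hw adapter receives and vice versa: tx parameters are taken
	 * from the hwna rx side, rx parameters from the tx side.
	 */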
1916	*txr = hwna->num_rx_rings;
1917	*txd = hwna->num_rx_desc;
1918	*rxr = hwna->num_tx_rings;
1919	*rxd = hwna->num_tx_desc;
1920
1921	return 0;
1922}
1923
1924
1925static int
1926netmap_bwrap_krings_create(struct netmap_adapter *na)
1927{
1928	struct netmap_bwrap_adapter *bna =
1929		(struct netmap_bwrap_adapter *)na;
1930	struct netmap_adapter *hwna = bna->hwna;
1931	struct netmap_adapter *hostna = &bna->host.up;
1932	int error;
1933
1934	ND("%s", NM_IFPNAME(na->ifp));
1935
1936	error = netmap_vp_krings_create(na);
1937	if (error)
1938		return error;
1939
1940	error = hwna->nm_krings_create(hwna);
1941	if (error) {
1942		netmap_vp_krings_delete(na);
1943		return error;
1944	}
1945
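	/* the host rings of the bwrap reuse the extra krings that
	 * netmap_vp_krings_create() allocated one past the hw rings,
	 * so no separate allocation is needed for the host port.
	 */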
1946	if (na->na_flags & NAF_HOST_RINGS) {
1947		hostna->tx_rings = na->tx_rings + na->num_tx_rings;
1948		hostna->rx_rings = na->rx_rings + na->num_rx_rings;
1949	}
1950
1951	return 0;
1952}
1953
1954
1955static void
1956netmap_bwrap_krings_delete(struct netmap_adapter *na)
1957{
1958	struct netmap_bwrap_adapter *bna =
1959		(struct netmap_bwrap_adapter *)na;
1960	struct netmap_adapter *hwna = bna->hwna;
1961
1962	ND("%s", NM_IFPNAME(na->ifp));
1963
1964	hwna->nm_krings_delete(hwna);
1965	netmap_vp_krings_delete(na);
1966}
1967
1968
1969/* notify method for the bridge-->hwna direction */
1970static int
1971netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
1972{
1973	struct netmap_bwrap_adapter *bna =
1974		(struct netmap_bwrap_adapter *)na;
1975	struct netmap_adapter *hwna = bna->hwna;
1976	struct netmap_kring *kring, *hw_kring;
1977	struct netmap_ring *ring;
1978	u_int lim;
1979	int error = 0;
1980
1981	if (tx == NR_TX)
1982	        return EINVAL;
1983
1984	kring = &na->rx_rings[ring_n];
1985	hw_kring = &hwna->tx_rings[ring_n];
1986	ring = kring->ring;
1987	lim = kring->nkr_num_slots - 1;
1988
1989	if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
1990		return 0;
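	/* the five steps below emulate a user process doing a full
	 * cycle on the shared ring: collect the packets the bridge
	 * has queued (rxsync), consume them, push them onto the hw
	 * tx ring (txsync), reclaim the buffers, then rxsync again
	 * to leave the kring in a consistent state.
	 */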
1991	mtx_lock(&kring->q_lock);
1992	/* first step: simulate a user wakeup on the rx ring */
1993	netmap_vp_rxsync(na, ring_n, flags);
1994	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1995		NM_IFPNAME(na->ifp), ring_n,
1996		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1997		ring->head, ring->cur, ring->tail,
1998		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1999	/* second step: the simulated user consumes all new packets */
2000	ring->head = ring->cur = ring->tail;
2001
2002	/* third step: the new packets are sent on the tx ring
2003	 * (which is actually the same ring)
2004	 */
2005	/* set tail to what the hw expects */
2006	ring->tail = hw_kring->rtail;
2007	nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
2008	error = hw_kring->nm_sync(hw_kring, flags);
2009
2010	/* fourth step: now we are back on the rx ring */
2011	/* claim ownership on all hw owned bufs */
2012	ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
2013	ring->tail = kring->rtail; /* restore saved value of tail, for safety */
2014
2015	/* fifth step: the user goes to sleep again, causing another rxsync */
2016	netmap_vp_rxsync(na, ring_n, flags);
2017	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2018		NM_IFPNAME(na->ifp), ring_n,
2019		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2020		ring->head, ring->cur, ring->tail,
2021		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2022	mtx_unlock(&kring->q_lock);
2023	return error;
2024}
2025
2026
2027static int
2028netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
2029{
2030	struct netmap_bwrap_adapter *bna = na->na_private;
2031	struct netmap_adapter *port_na = &bna->up.up;
2032	if (tx == NR_TX || ring_n != 0)
2033		return EINVAL;
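	/* the host ring of the bwrap lives one index past the hw rx rings */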
2034	return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
2035}
2036
2037
2038/* attach a bridge wrapper to the 'real' device */
2039static int
2040netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
2041{
2042	struct netmap_bwrap_adapter *bna;
2043	struct netmap_adapter *na;
2044	struct netmap_adapter *hwna = NA(real);
2045	struct netmap_adapter *hostna;
2046	int error;
2047
2048
2049	bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
2050	if (bna == NULL)
2051		return ENOMEM;
2052
2053	na = &bna->up.up;
2054	na->ifp = fake;
2055	/* fill the ring data for the bwrap adapter with rx/tx meanings
2056	 * swapped. The real cross-linking will be done during register,
2057	 * when all the krings have been created.
2058	 */
2059	na->num_rx_rings = hwna->num_tx_rings;
2060	na->num_tx_rings = hwna->num_rx_rings;
2061	na->num_tx_desc = hwna->num_rx_desc;
2062	na->num_rx_desc = hwna->num_tx_desc;
2063	na->nm_dtor = netmap_bwrap_dtor;
2064	na->nm_register = netmap_bwrap_register;
2065	// na->nm_txsync = netmap_bwrap_txsync;
2066	// na->nm_rxsync = netmap_bwrap_rxsync;
2067	na->nm_config = netmap_bwrap_config;
2068	na->nm_krings_create = netmap_bwrap_krings_create;
2069	na->nm_krings_delete = netmap_bwrap_krings_delete;
2070	na->nm_notify = netmap_bwrap_notify;
2071	na->nm_mem = hwna->nm_mem;
2072	na->na_private = na; /* prevent NIOCREGIF */
2073	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
2074
2075	bna->hwna = hwna;
2076	netmap_adapter_get(hwna);
2077	hwna->na_private = bna; /* weak reference */
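	/* bna already holds a strong reference on hwna (taken just
	 * above); the back pointer is kept weak to avoid a cycle.
	 */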
2078
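	/* if the hwna has host rings, give the bwrap a host port too:
	 * a single ring pair, again with tx/rx meanings swapped.
	 */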
2079	if (hwna->na_flags & NAF_HOST_RINGS) {
2080		na->na_flags |= NAF_HOST_RINGS;
2081		hostna = &bna->host.up;
2082		hostna->ifp = hwna->ifp;
2083		hostna->num_tx_rings = 1;
2084		hostna->num_tx_desc = hwna->num_rx_desc;
2085		hostna->num_rx_rings = 1;
2086		hostna->num_rx_desc = hwna->num_tx_desc;
2087		// hostna->nm_txsync = netmap_bwrap_host_txsync;
2088		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2089		hostna->nm_notify = netmap_bwrap_host_notify;
2090		hostna->nm_mem = na->nm_mem;
2091		hostna->na_private = bna;
2092	}
2093
2094	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2095		fake->if_xname, real->if_xname,
2096		na->num_tx_rings, na->num_tx_desc,
2097		na->num_rx_rings, na->num_rx_desc);
2098
2099	error = netmap_attach_common(na);
2100	if (error) {
2101		netmap_adapter_put(hwna);
2102		free(bna, M_DEVBUF);
2103		return error;
2104	}
2105	return 0;
2106}
2107
2108
2109void
2110netmap_init_bridges(void)
2111{
2112	int i;
2113	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
2114	for (i = 0; i < NM_BRIDGES; i++)
2115		BDG_RWINIT(&nm_bridges[i]);
2116}
2117#endif /* WITH_VALE */
2118