/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When a port is configured or deleted, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
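
/*
 * A condensed sketch of one forwarding cycle, as seen from a source
 * port (illustration only; the real code is nm_bdg_preflush() and
 * nm_bdg_flush() below, and error handling is omitted):
 *
 *	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
 *		BDG_RLOCK(b);		// may sleep (user-attached port)
 *	else if (!BDG_RTRYLOCK(b))
 *		return 0;		// NIC/interrupt context: never block
 *	// collect a batch of packets into the forwarding table (ft),
 *	// look up destinations, reserve slots on each destination ring,
 *	// copy the payloads, then update the receive rings
 *	BDG_RUNLOCK(b);
 */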

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD: head/sys/dev/netmap/netmap_vale.c 260368 2014-01-06 12:53:15Z luigi $");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct rwlock

#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_NAME	prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES	max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where valeX is the switch name and Y
 * is the port name. If Y matches the name of a physical interface, the
 * port is connected to that physical device (see the examples below).
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue locks instead of the core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
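
/* Examples of the naming convention above (names are illustrative):
 * "vale0:p1" is a virtual port named "p1" on switch "vale0";
 * "vale0:em0" attaches the physical interface em0 to the same switch.
 */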
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
#define	NM_BRIDGES		8	/* number of bridges */


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 */
int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");


static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp);
static int bdg_netmap_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_attach(struct ifnet *, struct ifnet *);
static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
int kern_netmap_regif(struct nmreq *nmr);

/*
 * Each transmit queue accumulates a batch of packets into
 * a structure before forwarding. Packets to the same
 * destination are put in a list using ft_next as a link field.
 * ft_frags and ft_next are valid only on the first fragment.
 */
struct nm_bdg_fwd {	/* forwarding entry for a bridge */
	void *ft_buf;		/* netmap or indirect buffer */
	uint8_t ft_frags;	/* how many fragments (only on 1st frag) */
	uint8_t _ft_port;	/* dst port (unused) */
	uint16_t ft_flags;	/* flags, e.g. indirect */
	uint16_t ft_len;	/* src fragment len */
	uint16_t ft_next;	/* next packet to same destination */
};

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size; an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non-blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT
	 * to drop it.  ring_nr is the source ring index, and the
	 * function may overwrite this value to forward this packet to a
	 * different ring index.
	 * This function must be set by netmap_bdgctl().
	 */
	bdg_lookup_fn_t nm_bdg_lookup;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];
};


/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge nm_bridges[NM_BRIDGES];


/*
 * this is a slightly optimized copy routine which rounds
 * to a multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
        uint64_t *src = _src;
        uint64_t *dst = _dst;
        if (unlikely(l >= 1024)) {
                memcpy(dst, src, l);
                return;
        }
        for (; likely(l > 0); l -= 64) {
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
        }
}
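
/*
 * Note: since the loop above copies 64 bytes per iteration, callers
 * round the length up to a multiple of 64 before invoking pkt_copy(),
 * as nm_bdg_flush() does below:
 *
 *	copy_len = (copy_len + 63) & ~63;
 *	pkt_copy(src, dst, (int)copy_len);
 */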


/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name; otherwise the bridge
 * name is just the NM_NAME prefix (e.g. for "vale1:port2" the bridge
 * name is "vale1").
 * We assume that this is called with a name of at least NM_NAME chars.
 */
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, l, namelen;
	struct nm_bridge *b = NULL;

	NMG_LOCK_ASSERT();

	namelen = strlen(NM_NAME);	/* base length */
	l = name ? strlen(name) : 0;	/* actual length */
	if (l < namelen) {
		D("invalid bridge name %s", name ? name : "(null)");
		return NULL;
	}
	for (i = namelen + 1; i < l; i++) {
		if (name[i] == ':') {
			namelen = i;
			break;
		}
	}
	if (namelen >= IFNAMSIZ)
		namelen = IFNAMSIZ;
	ND("--- prefix is '%.*s' ---", namelen, name);

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < NM_BRIDGES; i++) {
		struct nm_bridge *x = nm_bridges + i;

		if (x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
		/* initialize the bridge */
		strncpy(b->bdg_basename, name, namelen);
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->nm_bdg_lookup = netmap_bdg_learning;
		/* reset the MAC address table */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
	}
	return b;
}


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i].nkr_ft) {
			free(kring[i].nkr_ft, M_DEVBUF);
			kring[i].nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = na->num_tx_rings + 1;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i].nkr_ft = ft;
	}
	return 0;
}


static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	New algorithm:
	make a copy of bdg_port_index;
	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	in the array of bdg_port_index, replacing them with
	entries from the bottom of the array;
	decrement bdg_active_ports;
	acquire BDG_WLOCK() and copy back the array.
	 */

	D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--; /* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		b->nm_bdg_lookup = NULL;
	}
}


static void
netmap_adapter_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;
	struct ifnet *ifp = na->ifp;

	ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}

	bzero(ifp, sizeof(*ifp));
	free(ifp, M_DEVBUF);
	na->ifp = NULL;
}


/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a VALE prefix
 * mismatch. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	const char *name = nmr->nr_name;
	struct ifnet *ifp;
	int error = 0;
	struct netmap_adapter *ret;
	struct netmap_vp_adapter *vpna;
	struct nm_bridge *b;
	int i, j, cand = -1, cand2 = -1;
	int needed;

	*na = NULL;     /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
		return 0;  /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(name, create);
	if (b == NULL) {
		D("no bridges available for '%s'", name);
		return (ENXIO);
	}

	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		// KASSERT(na != NULL);
		ifp = vpna->up.ifp;
		/* XXX make sure the name only contains one : */
		if (!strcmp(NM_IFPNAME(ifp), name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", name,
				vpna->na_bdg_refcount);
			*na = (struct netmap_adapter *)vpna;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return EINVAL;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, name, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifp = ifunit_ref(name + b->bdg_namelen + 1);
	if (!ifp) { /* this is a virtual port */
		if (nmr->nr_cmd) {
			/* nr_cmd must be 0 for a virtual port */
			return EINVAL;
		}

		/* create a struct ifnet for the new port.
		 * need M_NOWAIT as we are under nma_lock
		 */
		ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ifp)
			return ENOMEM;

		strcpy(ifp->if_xname, name);
		/* bdg_netmap_attach creates a struct netmap_adapter */
		error = bdg_netmap_attach(nmr, ifp);
		if (error) {
			D("error %d", error);
			free(ifp, M_DEVBUF);
			return error;
		}
		ret = NA(ifp);
		cand2 = -1;	/* only need one port */
	} else {  /* this is a NIC */
		struct ifnet *fake_ifp;

		error = netmap_get_hw_na(ifp, &ret);
		if (error || ret == NULL)
			goto out;

		/* make sure the NIC is not already in use */
		if (NETMAP_OWNED_BY_ANY(ret)) {
			D("NIC %s busy, cannot attach to bridge",
				NM_IFPNAME(ifp));
			error = EINVAL;
			goto out;
		}
		/* create a fake interface */
		fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!fake_ifp) {
			error = ENOMEM;
			goto out;
		}
		strcpy(fake_ifp->if_xname, name);
		error = netmap_bwrap_attach(fake_ifp, ifp);
		if (error) {
			free(fake_ifp, M_DEVBUF);
			goto out;
		}
		ret = NA(fake_ifp);
		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
			cand2 = -1; /* only need one port */
		if_rele(ifp);
	}
	vpna = (struct netmap_vp_adapter *)ret;

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC  %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (cand2 >= 0) {
		struct netmap_vp_adapter *hostna = vpna + 1;
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", name, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = ret;
	netmap_adapter_get(ret);
	return 0;

out:
	if_rele(ifp);

	return error;
}

/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
static int
nm_bdg_attach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	struct netmap_if *nifp;
	struct netmap_priv_d *npriv;
	struct netmap_bwrap_adapter *bna;
	int error;

	npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
	if (npriv == NULL)
		return ENOMEM;
	NMG_LOCK();
	/* XXX probably netmap_get_bdg_na() */
	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
	if (error) /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (na->active_fds > 0) { /* already registered */
		error = EBUSY;
		goto unref_exit;
	}

	nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, &error);
	if (!nifp) {
		goto unref_exit;
	}

	bna = (struct netmap_bwrap_adapter*)na;
	bna->na_kpriv = npriv;
	NMG_UNLOCK();
	ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp));
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	bzero(npriv, sizeof(*npriv));
	free(npriv, M_DEVBUF);
	return error;
}

static int
nm_bdg_detach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;
	struct netmap_bwrap_adapter *bna;
	int last_instance;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}
	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	bna = (struct netmap_bwrap_adapter *)na;

	if (na->active_fds == 0) { /* not registered */
		error = EINVAL;
		goto unref_exit;
	}

	last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */
	if (!last_instance) {
		D("--- error, trying to detach an entry with active mmaps");
		error = EINVAL;
	} else {
		struct netmap_priv_d *npriv = bna->na_kpriv;

		bna->na_kpriv = NULL;
		D("deleting priv");

		bzero(npriv, sizeof(*npriv));
		free(npriv, M_DEVBUF);
	}

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}


/* exported to kernel callers, e.g. OVS ?
 * Entry point.
 * Called without NMG_LOCK.
 */
int
netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
{
	struct nm_bridge *b;
	struct netmap_adapter *na;
	struct netmap_vp_adapter *vpna;
	struct ifnet *iter;
	char *name = nmr->nr_name;
	int cmd = nmr->nr_cmd, namelen = strlen(name);
	int error = 0, i, j;

	switch (cmd) {
	case NETMAP_BDG_ATTACH:
		error = nm_bdg_attach(nmr);
		break;

	case NETMAP_BDG_DETACH:
		error = nm_bdg_detach(nmr);
		break;

	case NETMAP_BDG_LIST:
		/* this is used to enumerate bridges and ports */
		if (namelen) { /* look up indexes of bridge and port */
			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
				error = EINVAL;
				break;
			}
			NMG_LOCK();
			b = nm_find_bridge(name, 0 /* don't create */);
			if (!b) {
				error = ENOENT;
				NMG_UNLOCK();
				break;
			}

			error = ENOENT;
			for (j = 0; j < b->bdg_active_ports; j++) {
				i = b->bdg_port_index[j];
				vpna = b->bdg_ports[i];
				if (vpna == NULL) {
					D("---AAAAAAAAARGH-------");
					continue;
				}
				iter = vpna->up.ifp;
				/* the name identifies either a virtual
				 * port or a NIC attached to the switch
				 */
				if (!strcmp(iter->if_xname, name)) {
					/* bridge index */
					nmr->nr_arg1 = b - nm_bridges;
					nmr->nr_arg2 = i; /* port index */
					error = 0;
					break;
				}
			}
			NMG_UNLOCK();
		} else {
			/* return the first non-empty entry starting from
			 * bridge nr_arg1 and port nr_arg2.
			 *
			 * Users can detect the end of the same bridge by
			 * comparing the new and old values of nr_arg1, and can
			 * detect the end of all the bridges by error != 0
			 */
			i = nmr->nr_arg1;
			j = nmr->nr_arg2;

			NMG_LOCK();
			for (error = ENOENT; i < NM_BRIDGES; i++) {
				b = nm_bridges + i;
				if (j >= b->bdg_active_ports) {
					j = 0; /* following bridges scan from 0 */
					continue;
				}
				nmr->nr_arg1 = i;
				nmr->nr_arg2 = j;
				j = b->bdg_port_index[j];
				vpna = b->bdg_ports[j];
				iter = vpna->up.ifp;
				strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
				error = 0;
				break;
			}
			NMG_UNLOCK();
		}
		break;

	case NETMAP_BDG_LOOKUP_REG:
		/* register a lookup function for the given bridge.
		 * nmr->nr_name may be just the bridge's name (including ':'
		 * if it is not just NM_NAME); see the example sketch
		 * after this function.
		 */
		if (!func) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		b = nm_find_bridge(name, 0 /* don't create */);
		if (!b) {
			error = EINVAL;
		} else {
			b->nm_bdg_lookup = func;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_OFFSET:
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, 0);
		if (na && !error) {
			vpna = (struct netmap_vp_adapter *)na;
			if (nmr->nr_arg1 > NETMAP_BDG_MAX_OFFSET)
				nmr->nr_arg1 = NETMAP_BDG_MAX_OFFSET;
			vpna->offset = nmr->nr_arg1;
			D("Using offset %d for %p", vpna->offset, vpna);
			netmap_adapter_put(na);
		}
		NMG_UNLOCK();
		break;

	default:
		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
		error = EINVAL;
		break;
	}
	return error;
}
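
/*
 * Example (a sketch only): a kernel client could replace the default
 * learning function on switch "vale1:" as follows; my_lookup is a
 * hypothetical function with the bdg_lookup_fn_t signature.
 *
 *	struct nmreq nmr;
 *	int error;
 *
 *	bzero(&nmr, sizeof(nmr));
 *	strncpy(nmr.nr_name, "vale1:", sizeof(nmr.nr_name));
 *	nmr.nr_cmd = NETMAP_BDG_LOOKUP_REG;
 *	error = netmap_bdg_ctl(&nmr, my_lookup);
 */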


static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int ntx, nrx, tailroom;
	int error, i;
	uint32_t *leases;

	/* XXX vps do not need host rings,
	 * but we crash if we don't have one
	 */
	ntx = na->num_tx_rings + 1;
	nrx = na->num_rx_rings + 1;

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, ntx, nrx, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i].nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * Grab packets from a kring, move them into the ft structure
 * associated to the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
	struct netmap_kring *kring, u_int end)
{
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return 0;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;

		ND("flags is 0x%x", slot->flags);
		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			RD(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
		/* ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG set */
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags - 1;
	}
	if (ft_i)
		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)                                                    \
do {                                                                    \
        a -= b; a -= c; a ^= (c >> 13);                                 \
        b -= c; b -= a; b ^= (a << 8);                                  \
        c -= a; c -= b; c ^= (b >> 13);                                 \
        a -= b; a -= c; a ^= (c >> 12);                                 \
        b -= c; b -= a; b ^= (a << 16);                                 \
        c -= a; c -= b; c ^= (b >> 5);                                  \
        a -= b; a -= c; a ^= (c >> 3);                                  \
        b -= c; b -= a; b ^= (a << 10);                                 \
        c -= a; c -= b; c ^= (b >> 15);                                 \
} while (/*CONSTCOND*/0)


static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
        uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

        b += addr[5] << 8;
        b += addr[4];
        a += addr[3] << 24;
        a += addr[2] << 16;
        a += addr[1] << 8;
        a += addr[0];

        mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
        return (c & BRIDGE_RTHASH_MASK);
}

#undef mix


static int
bdg_netmap_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_vp_adapter *vpna =
		(struct netmap_vp_adapter*)na;
	struct ifnet *ifp = na->ifp;

	/* the interface is already attached to the bridge,
	 * so we only need to toggle IFCAP_NETMAP.
	 */
	BDG_WLOCK(vpna->na_bdg);
	if (onoff) {
		ifp->if_capenable |= IFCAP_NETMAP;
	} else {
		ifp->if_capenable &= ~IFCAP_NETMAP;
	}
	BDG_WUNLOCK(vpna->na_bdg);
	return 0;
}


/*
 * Lookup function for a learning bridge.
 * Update the hash table with the source address,
 * and then return the destination port index, and the
 * ring in *dst_ring (at the moment, always use ring 0).
 */
u_int
netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
		struct netmap_vp_adapter *na)
{
	struct nm_hash_ent *ht = na->na_bdg->ht;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;

	if (buf_len < 14) {
		D("invalid buf length %d", buf_len);
		return NM_BDG_NOPORT;
	}
	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive, there might be some
	 * worthwhile optimizations here.
	 */
	if ((buf[6] & 1) == 0) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); // XXX hash of source
		/* update source port forwarding entry */
		ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); // XXX hash of dst
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
		/* XXX otherwise return NM_BDG_UNKNOWN ? */
	}
	*dst_ring = 0;
	return dst;
}


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_tail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}


/* make a lease on the kring for N positions. return the
 * lease index
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

	if (n > nm_kr_space(k, is_rx)) {
		D("invalid request for %d slots", n);
		panic("x");
	}
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->ifp->if_xname,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
	return lease_idx;
}
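
/*
 * Typical use of the lease mechanism (a condensed sketch of what
 * nm_bdg_flush() does below; error and retry handling omitted):
 *
 *	mtx_lock(&kring->q_lock);
 *	j = kring->nkr_hwlease;			// first slot we may write
 *	howmany = nm_kr_space(kring, 1);	// slots we can reserve
 *	lease_idx = nm_kr_lease(kring, howmany, 1);
 *	mtx_unlock(&kring->q_lock);
 *
 *	// copy packets into slots j .. j+howmany-1 (mod ring size)
 *	// without holding the lock
 *
 *	mtx_lock(&kring->q_lock);
 *	kring->nkr_leases[lease_idx] = j;	// report final write position
 *	// if all earlier leases have completed, advance nr_hwtail
 *	mtx_unlock(&kring->q_lock);
 */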

/*
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
		u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, j, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;
		uint8_t *buf = ft[i].ft_buf;
		u_int len = ft[i].ft_len;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		/* Drop the packet if the offset is not into the first
		   fragment nor at the very beginning of the second. */
		if (unlikely(na->offset > len))
			continue;
		if (len == na->offset) {
			buf = ft[i+1].ft_buf;
			len = ft[i+1].ft_len;
		} else {
			buf += na->offset;
			len -= na->offset;
		}
		dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port == NM_BDG_NOPORT)
			continue; /* this packet is identified to be dropped */
		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
			continue;
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations (XXX will be modular somehow) */
	for (i = 0; i < num_dsts; i++) {
		struct ifnet *dst_ifp;
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, sent = 0, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int offset_mismatch;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		dst_ifp = dst_na->up.ifp;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		offset_mismatch = (dst_na->offset != na->offset);

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a chance
		 * that we may not use all of the slots we have claimed,
		 * so we will need to handle the leftover ones when we
		 * regain the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		ND(5, "pass 2 dst %d is %x %s",
			i, d_i, is_vp ? "virtual" : "nic/host");
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = &dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		lim = kring->nkr_num_slots - 1;

retry:

		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		if (dst_na->retry) {
			dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;
			int fix_mismatch = offset_mismatch;

			/* find the queue from which we pick next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
			    break; /* no more space */
			howmany -= cnt;
			if (netmap_verbose && cnt > 1)
				RD(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			do {
			    char *dst, *src = ft_p->ft_buf;
			    size_t copy_len = ft_p->ft_len, dst_len = copy_len;

			    slot = &ring->slot[j];
			    dst = BDG_NMB(&dst_na->up, slot);

			    if (unlikely(fix_mismatch)) {
				    /* We are processing the first fragment
				     * and there is a mismatch between source
				     * and destination offsets. Create a zeroed
				     * header for the destination, independently
				     * of the source header length and content.
				     */
				    src += na->offset;
				    copy_len -= na->offset;
				    bzero(dst, dst_na->offset);
				    dst += dst_na->offset;
				    dst_len = dst_na->offset + copy_len;
				    /* fix the first fragment only */
				    fix_mismatch = 0;
				    /* Here it could be copy_len == dst_len == 0,
				     * and so a zero length fragment is passed.
				     */
			    }

			    ND("send [%d] %d(%d) bytes at %s:%d",
				i, (int)copy_len, (int)dst_len,
				NM_IFPNAME(dst_ifp), j);
			    /* round to a multiple of 64 */
			    copy_len = (copy_len + 63) & ~63;

			    if (ft_p->ft_flags & NS_INDIRECT) {
				if (copyin(src, dst, copy_len)) {
					// invalid user pointer, pretend len is 0
					dst_len = 0;
				}
			    } else {
				//memcpy(dst, src, copy_len);
				pkt_copy(src, dst, (int)copy_len);
			    }
			    slot->len = dst_len;
			    slot->flags = (cnt << 8) | NS_MOREFRAG;
			    j = nm_next(j, lim);
			    ft_p++;
			    sent++;
			} while (ft_p != ft_end);
			slot->flags = (cnt << 8); /* clear flag on last entry */
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
		    /* current position */
		    uint32_t *p = kring->nkr_leases; /* shorthand */
		    uint32_t update_pos;
		    int still_locked = 1;

		    mtx_lock(&kring->q_lock);
		    if (unlikely(howmany > 0)) {
			/* we have not used all the bufs. If I am the last
			 * one I can recover the slots, otherwise I must
			 * fill them with 0 to mark empty packets.
			 */
			ND("leftover %d bufs", howmany);
			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
			    /* yes, I am the last one */
			    ND("roll back nkr_hwlease to %d", j);
			    kring->nkr_hwlease = j;
			} else {
			    while (howmany-- > 0) {
				ring->slot[j].len = 0;
				ring->slot[j].flags = 0;
				j = nm_next(j, lim);
			    }
			}
		    }
		    p[lease_idx] = j; /* report I am done */

		    update_pos = kring->nr_hwtail;

		    if (my_start == update_pos) {
			/* all slots before my_start have been reported,
			 * so scan subsequent leases to see if other ranges
			 * have been completed, and do a selwakeup or txsync.
			 */
			while (lease_idx != kring->nkr_lease_idx &&
				p[lease_idx] != NR_NOSLOT) {
			    j = p[lease_idx];
			    p[lease_idx] = NR_NOSLOT;
			    lease_idx = nm_next(lease_idx, lim);
			}
			/* j is the new 'write' position. j != my_start
			 * means there are new buffers to report
			 */
			if (likely(j != my_start)) {
				kring->nr_hwtail = j;
				dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
				still_locked = 0;
				mtx_unlock(&kring->q_lock);
				if (dst_na->retry && retry--)
					goto retry;
			}
		    }
		    if (still_locked)
			mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}


static int
netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const cur = kring->rcur;

	if (bridge_batch <= 0) { /* testing only */
		done = cur; // used all
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_bdg_preflush(na, ring_nr, kring, cur);
done:
	if (done != cur)
		D("early break at %d/%d, tail %d", done, cur, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'cur' are left unsent.
	 */
	kring->nr_hwcur = done;
	kring->nr_hwtail = nm_prev(done, lim);
	nm_txsync_finalize(kring);
	if (netmap_verbose)
		D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
	return 0;
}


/*
 * main dispatch routine for the bridge.
 * We already know that only one thread is running this.
 * We must run nm_bdg_preflush without lock.
 */
static int
bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	return netmap_vp_txsync(vpna, ring_nr, flags);
}

static int
netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, lim = kring->nkr_num_slots - 1;
	u_int head = nm_rxsync_prologue(kring);
	int n;

	if (head > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* First part, import newly received packets. */
	/* actually nothing to do here, they are already in the kring */

	/* Second part, skip past packets that userspace has released. */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		/* consistency check, but nothing really important here */
		for (n = 0; likely(nm_i != head); n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			void *addr = BDG_NMB(na, slot);

			if (addr == netmap_buffer_base) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwcur = head;
	}

	/* tell userspace that there are new packets */
	nm_rxsync_finalize(kring);
	n = 0;
done:
	return n;
}

/*
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	int n;

	mtx_lock(&kring->q_lock);
	n = netmap_vp_rxsync(na, ring_nr, flags);
	mtx_unlock(&kring->q_lock);
	return n;
}


static int
bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
{
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	int error;

	vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (vpna == NULL)
		return ENOMEM;

	na = &vpna->up;

	na->ifp = ifp;

	/* bound checking */
	na->num_tx_rings = nmr->nr_tx_rings;
	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_tx_rings = na->num_tx_rings; // write back
	na->num_rx_rings = nmr->nr_rx_rings;
	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_rx_rings = na->num_rx_rings; // write back
	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	na->num_tx_desc = nmr->nr_tx_slots;
	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	na->num_rx_desc = nmr->nr_rx_slots;
	vpna->offset = 0;

	na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
	na->nm_txsync = bdg_netmap_txsync;
	na->nm_rxsync = bdg_netmap_rxsync;
	na->nm_register = bdg_netmap_reg;
	na->nm_dtor = netmap_adapter_vp_dtor;
	na->nm_krings_create = netmap_vp_krings_create;
	na->nm_krings_delete = netmap_vp_krings_delete;
	na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp),
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc);
	/* other nmd fields are set in the common routine */
	error = netmap_attach_common(na);
	if (error) {
		free(vpna, M_DEVBUF);
		return error;
	}
	return 0;
}


static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct nm_bridge *b = bna->up.na_bdg,
		*bh = bna->host.na_bdg;
	struct ifnet *ifp = na->ifp;

	ND("na %p", na);

	if (b) {
		netmap_bdg_detach_common(b, bna->up.bdg_port,
			(bh ? bna->host.bdg_port : -1));
	}

	hwna->na_private = NULL;
	netmap_adapter_put(hwna);

	bzero(ifp, sizeof(*ifp));
	free(ifp, M_DEVBUF);
	na->ifp = NULL;
}


/*
 * Intr callback for NICs connected to a bridge.
 * Simply ignore tx interrupts (maybe we could try to recover space ?)
 * and pass received packets from nic to the bridge.
 *
 * XXX TODO check locking: this is called from the interrupt
 * handler so we should make sure that the interface is not
 * disconnected while passing down an interrupt.
 *
 * Note, no user process can access this NIC or the host stack.
 * The only significant part of the ring is the slots;
 * head/cur/tail are set from the kring as needed
 * (part as a receive ring, part as a transmit ring).
 *
 * This callback overwrites the hwna notify callback.
 * Packets come from the outside or from the host stack and are put on an hwna rx ring.
 * The bridge wrapper then sends the packets through the bridge.
 */
static int
netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
{
	struct ifnet *ifp = na->ifp;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_vp_adapter *hostna = &bna->host;
	struct netmap_kring *kring, *bkring;
	struct netmap_ring *ring;
	int is_host_ring = ring_nr == na->num_rx_rings;
	struct netmap_vp_adapter *vpna = &bna->up;
	int error = 0;

	if (netmap_verbose)
	    D("%s %s%d 0x%x", NM_IFPNAME(ifp),
		(tx == NR_TX ? "TX" : "RX"), ring_nr, flags);

	if (flags & NAF_DISABLE_NOTIFY) {
		kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
		bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
		if (kring[ring_nr].nkr_stopped)
			netmap_disable_ring(&bkring[ring_nr]);
		else
			bkring[ring_nr].nkr_stopped = 0;
		return 0;
	}

	if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
		return 0;

	/* we only care about receive interrupts */
	if (tx == NR_TX)
		return 0;

	kring = &na->rx_rings[ring_nr];
	ring = kring->ring;

	/* make sure the ring is not disabled */
	if (nm_kr_tryget(kring))
		return 0;

	if (is_host_ring && hostna->na_bdg == NULL) {
		error = bna->save_notify(na, ring_nr, tx, flags);
		goto put_out;
	}

	/* Here we expect ring->head = ring->cur = ring->tail
	 * because everything has been released from the previous round.
	 * However the ring is shared and we might have info from
	 * the wrong side (the tx ring). Hence we overwrite with
	 * the info from the rx kring.
	 */
	if (netmap_verbose)
	    D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp),
		ring->head, ring->cur, ring->tail,
		kring->rhead, kring->rcur, kring->rtail);

	ring->head = kring->rhead;
	ring->cur = kring->rcur;
	ring->tail = kring->rtail;

	/* simulate a user wakeup on the rx ring */
	if (is_host_ring) {
		netmap_rxsync_from_host(na, NULL, NULL);
		vpna = hostna;
		ring_nr = 0;
	} else {
		/* fetch packets that have arrived.
		 * XXX maybe do this in a loop ?
		 */
		error = na->nm_rxsync(na, ring_nr, 0);
		if (error)
			goto put_out;
	}
	if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
		D("how strange, interrupt with no packets on %s",
			NM_IFPNAME(ifp));
		goto put_out;
	}

	/* new packets are ring->cur to ring->tail, and the bkring
	 * had hwcur == ring->cur. So advance ring->cur to ring->tail
	 * to push all packets out.
	 */
	ring->head = ring->cur = ring->tail;

	/* also set tail to what the bwrap expects */
	bkring = &vpna->up.tx_rings[ring_nr];
	ring->tail = bkring->nr_hwtail; // rtail too ?

	/* pass packets to the switch */
	nm_txsync_prologue(bkring); // XXX error checking ?
	netmap_vp_txsync(vpna, ring_nr, flags);

	/* mark all buffers as released on this ring */
	ring->head = ring->cur = kring->nr_hwtail;
	ring->tail = kring->rtail;
	/* another call to actually release the buffers */
	if (!is_host_ring) {
		error = na->nm_rxsync(na, ring_nr, 0);
	} else {
		/* mark all packets as released, as in the
		 * second part of netmap_rxsync_from_host()
		 */
		kring->nr_hwcur = kring->nr_hwtail;
		nm_rxsync_finalize(kring);
	}

put_out:
	nm_kr_put(kring);
	return error;
}
1814
1815
1816static int
1817netmap_bwrap_register(struct netmap_adapter *na, int onoff)
1818{
1819	struct netmap_bwrap_adapter *bna =
1820		(struct netmap_bwrap_adapter *)na;
1821	struct netmap_adapter *hwna = bna->hwna;
1822	struct netmap_vp_adapter *hostna = &bna->host;
1823	int error;
1824
1825	ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off");
1826
1827	if (onoff) {
1828		int i;
1829
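		/* share our buffer lookup table with the hw adapter
		 * (and with the host port, if attached to a switch),
		 * so that slot indices refer to the same buffers.
		 */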
1830		hwna->na_lut = na->na_lut;
1831		hwna->na_lut_objtotal = na->na_lut_objtotal;
1832
1833		if (hostna->na_bdg) {
1834			hostna->up.na_lut = na->na_lut;
1835			hostna->up.na_lut_objtotal = na->na_lut_objtotal;
1836		}
1837
1838		/* cross-link the netmap rings (the <= bound also covers the host rings) */
1839		for (i = 0; i <= na->num_tx_rings; i++) {
1840			hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
1841			hwna->tx_rings[i].ring = na->rx_rings[i].ring;
1842		}
1843		for (i = 0; i <= na->num_rx_rings; i++) {
1844			hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
1845			hwna->rx_rings[i].ring = na->tx_rings[i].ring;
1846		}
1847	}
1848
1849	if (hwna->ifp) {
1850		error = hwna->nm_register(hwna, onoff);
1851		if (error)
1852			return error;
1853	}
1854
1855	bdg_netmap_reg(na, onoff);
1856
1857	if (onoff) {
1858		bna->save_notify = hwna->nm_notify;
1859		hwna->nm_notify = netmap_bwrap_intr_notify;
1860	} else {
1861		hwna->nm_notify = bna->save_notify;
1862		hwna->na_lut = NULL;
1863		hwna->na_lut_objtotal = 0;
1864	}
1865
1866	return 0;
1867}
1868
1869
1870static int
1871netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
1872				    u_int *rxr, u_int *rxd)
1873{
1874	struct netmap_bwrap_adapter *bna =
1875		(struct netmap_bwrap_adapter *)na;
1876	struct netmap_adapter *hwna = bna->hwna;
1877
1878	/* forward the request */
1879	netmap_update_config(hwna);
1880	/* swap the results */
1881	*txr = hwna->num_rx_rings;
1882	*txd = hwna->num_rx_desc;
1883	*rxr = hwna->num_tx_rings;
1884	*rxd = hwna->num_tx_desc;
1885
1886	return 0;
1887}
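
/* Example (hypothetical numbers): a hw adapter with 4 tx and 2 rx
 * rings is exposed by the bwrap as a port with 2 tx and 4 rx rings,
 * since a transmission on the bwrap is a reception on the hw adapter
 * and vice versa.
 */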
1888
1889
1890static int
1891netmap_bwrap_krings_create(struct netmap_adapter *na)
1892{
1893	struct netmap_bwrap_adapter *bna =
1894		(struct netmap_bwrap_adapter *)na;
1895	struct netmap_adapter *hwna = bna->hwna;
1896	struct netmap_adapter *hostna = &bna->host.up;
1897	int error;
1898
1899	ND("%s", NM_IFPNAME(na->ifp));
1900
1901	error = netmap_vp_krings_create(na);
1902	if (error)
1903		return error;
1904
1905	error = hwna->nm_krings_create(hwna);
1906	if (error) {
1907		netmap_vp_krings_delete(na);
1908		return error;
1909	}
1910
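	/* the host rings are the last entries in the kring arrays,
	 * right past the hw rings, so point the host adapter there.
	 */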
1911	hostna->tx_rings = na->tx_rings + na->num_tx_rings;
1912	hostna->rx_rings = na->rx_rings + na->num_rx_rings;
1913
1914	return 0;
1915}
1916
1917
1918static void
1919netmap_bwrap_krings_delete(struct netmap_adapter *na)
1920{
1921	struct netmap_bwrap_adapter *bna =
1922		(struct netmap_bwrap_adapter *)na;
1923	struct netmap_adapter *hwna = bna->hwna;
1924
1925	ND("%s", NM_IFPNAME(na->ifp));
1926
1927	hwna->nm_krings_delete(hwna);
1928	netmap_vp_krings_delete(na);
1929}
1930
1931
1932/* notify method for the bridge-->hwna direction */
1933static int
1934netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
1935{
1936	struct netmap_bwrap_adapter *bna =
1937		(struct netmap_bwrap_adapter *)na;
1938	struct netmap_adapter *hwna = bna->hwna;
1939	struct netmap_kring *kring, *hw_kring;
1940	struct netmap_ring *ring;
1941	u_int lim;
1942	int error = 0;
1943
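	/* only rx rings make sense here: a notification on a bwrap
	 * rx ring means there are packets to push to the hw adapter.
	 */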
1944	if (tx == NR_TX)
1945	return ENXIO;
1946
1947	kring = &na->rx_rings[ring_n];
1948	hw_kring = &hwna->tx_rings[ring_n];
1949	ring = kring->ring;
1950	lim = kring->nkr_num_slots - 1;
1951
1952	if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
1953		return 0;
1954	/* first step: simulate a user wakeup on the rx ring */
1955	netmap_vp_rxsync(na, ring_n, flags);
1956	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1957		NM_IFPNAME(na->ifp), ring_n,
1958		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1959		ring->head, ring->cur, ring->tail,
1960	hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1961	/* second step: the simulated user consumes all new packets */
1962	ring->head = ring->cur = ring->tail;
1963
1964	/* third step: the new packets are sent on the tx ring
1965	 * (which is actually the same ring)
1966	 */
1967	/* set tail to what the hw expects */
1968	ring->tail = hw_kring->rtail;
1969	if (ring_n == na->num_rx_rings) {
1970		netmap_txsync_to_host(hwna);
1971	} else {
1972		nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
1973		error = hwna->nm_txsync(hwna, ring_n, flags);
1974	}
1975
1976	/* fourth step: now we are back on the rx ring */
1977	/* claim ownership on all hw owned bufs */
1978	ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
1979	ring->tail = kring->rtail; /* restore saved value of tail, for safety */
1980
1981	/* fifth step: the user goes to sleep again, causing another rxsync */
1982	netmap_vp_rxsync(na, ring_n, flags);
1983	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1984		NM_IFPNAME(na->ifp), ring_n,
1985		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1986		ring->head, ring->cur, ring->tail,
1987		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1988
1989	return error;
1990}
1991
1992
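/* notify method for the bridge-->host-stack direction: forward the
 * notification to the main bwrap notify method, passing the index
 * of the host ring (which sits right past the hw rx rings).
 */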
1993static int
1994netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
1995{
1996	struct netmap_bwrap_adapter *bna = na->na_private;
1997	struct netmap_adapter *port_na = &bna->up.up;
1998	if (tx == NR_TX || ring_n != 0)
1999		return ENXIO;
2000	return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
2001}
2002
2003
2004/* attach a bridge wrapper to the 'real' device */
2005static int
2006netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
2007{
2008	struct netmap_bwrap_adapter *bna;
2009	struct netmap_adapter *na;
2010	struct netmap_adapter *hwna = NA(real);
2011	struct netmap_adapter *hostna;
2012	int error;
2013
2014
2015	bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
2016	if (bna == NULL)
2017		return ENOMEM;
2018
2019	na = &bna->up.up;
2020	na->ifp = fake;
2021	/* fill the ring data for the bwrap adapter with rx/tx meanings
2022	 * swapped. The real cross-linking will be done during register,
2023	 * when all the krings have been created.
2024	 */
2025	na->num_rx_rings = hwna->num_tx_rings;
2026	na->num_tx_rings = hwna->num_rx_rings;
2027	na->num_tx_desc = hwna->num_rx_desc;
2028	na->num_rx_desc = hwna->num_tx_desc;
2029	na->nm_dtor = netmap_bwrap_dtor;
2030	na->nm_register = netmap_bwrap_register;
2031	// na->nm_txsync = netmap_bwrap_txsync;
2032	// na->nm_rxsync = netmap_bwrap_rxsync;
2033	na->nm_config = netmap_bwrap_config;
2034	na->nm_krings_create = netmap_bwrap_krings_create;
2035	na->nm_krings_delete = netmap_bwrap_krings_delete;
2036	na->nm_notify = netmap_bwrap_notify;
2037	na->nm_mem = hwna->nm_mem;
2038	na->na_private = na; /* prevent NIOCREGIF */
2039	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
2040
2041	bna->hwna = hwna;
2042	netmap_adapter_get(hwna);
2043	hwna->na_private = bna; /* weak reference */
2044
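	/* set up the host port adapter: a single ring pair attached
	 * to the host stack, again with tx/rx meanings swapped with
	 * respect to the hw adapter.
	 */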
2045	hostna = &bna->host.up;
2046	hostna->ifp = hwna->ifp;
2047	hostna->num_tx_rings = 1;
2048	hostna->num_tx_desc = hwna->num_rx_desc;
2049	hostna->num_rx_rings = 1;
2050	hostna->num_rx_desc = hwna->num_tx_desc;
2051	// hostna->nm_txsync = netmap_bwrap_host_txsync;
2052	// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2053	hostna->nm_notify = netmap_bwrap_host_notify;
2054	hostna->nm_mem = na->nm_mem;
2055	hostna->na_private = bna;
2056
2057	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2058		fake->if_xname, real->if_xname,
2059		na->num_tx_rings, na->num_tx_desc,
2060		na->num_rx_rings, na->num_rx_desc);
2061
2062	error = netmap_attach_common(na);
2063	if (error) {
2064		netmap_adapter_put(hwna);
2065		free(bna, M_DEVBUF);
2066		return error;
2067	}
2068	return 0;
2069}
2070
2071
2072void
2073netmap_init_bridges(void)
2074{
2075	int i;
2076	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
2077	for (i = 0; i < NM_BRIDGES; i++)
2078		BDG_RWINIT(&nm_bridges[i]);
2079}
2080#endif /* WITH_VALE */
2081