netmap_vale.c revision 302408
/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When a new port is configured or an existing
one is deleted, the lock is acquired in exclusive mode (after
holding NMG_LOCK). When forwarding, the lock is acquired in
shared mode (without NMG_LOCK). The lock is held throughout the
entire forwarding cycle, during which the thread may incur
a page fault. Hence it is important that sleepable shared locks
are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
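
/*
 * For illustration only (not part of the driver): the forwarding cycle
 * described above follows a reserve/copy/publish pattern. A minimal
 * sketch, with hypothetical helper names (reserve_slots(), publish(),
 * copy_packets() are made up; the real code is in nm_kr_lease() and
 * nm_bdg_flush() below):
 *
 *	BDG_RLOCK(b);			// shared: forwarders may run in parallel
 *	mtx_lock(&kring->q_lock);
 *	first = reserve_slots(kring, n); // grab a lease on n rx slots
 *	mtx_unlock(&kring->q_lock);	// copies proceed without the queue lock
 *	copy_packets(src, kring, first, n); // may sleep on a page fault
 *	mtx_lock(&kring->q_lock);
 *	publish(kring, first, n);	// advance hwtail when contiguous
 *	mtx_unlock(&kring->q_lock);
 *	BDG_RUNLOCK(b);
 *
 * The real code must additionally handle out-of-order completion of
 * concurrent leases on the same ring.
 */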

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD: stable/11/sys/dev/netmap/netmap_vale.c 285698 2015-07-19 18:06:30Z luigi $");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct rwlock

#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_NAME	prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES	max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue locks instead of the core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
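/*
 * Naming examples (illustrative): "vale0:p0" creates/opens ephemeral port
 * "p0" on switch "vale0", while "vale0:em1" would attach the physical
 * interface em1. A minimal userspace sketch using the nm_open() helper
 * from netmap_user.h (switch and port names are made up):
 *
 *	#define NETMAP_WITH_LIBS
 *	#include <net/netmap_user.h>
 *
 *	struct nm_desc *d = nm_open("vale0:p0", NULL, 0, NULL);
 *	if (d == NULL)
 *		err(1, "nm_open");	// port creation/attach failed
 *	// ... poll() on d->fd, NIOCTXSYNC/NIOCRXSYNC, then nm_close(d);
 */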
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
#define	NM_BRIDGES		8	/* number of bridges */


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 */
int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
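/*
 * For example (from a root shell; values are illustrative), the batch
 * size can be inspected and tuned at runtime:
 *
 *	# sysctl dev.netmap.bridge_batch
 *	dev.netmap.bridge_batch: 1024
 *	# sysctl dev.netmap.bridge_batch=256
 *
 * Smaller batches reduce latency; larger ones amortize the per-batch
 * locking and lookup costs (see nm_bdg_preflush()/nm_bdg_flush() below).
 */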


static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **);
static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_register(struct netmap_adapter *, int onoff);

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size, an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT
	 * to drop it.  ring_nr is the source ring index, and the
	 * function may overwrite this value to forward this packet to a
	 * different ring index.
	 * This function must be set by netmap_bdg_ctl().
	 */
	struct netmap_bdg_ops bdg_ops;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];

#ifdef CONFIG_NET_NS
	struct net *ns;
#endif /* CONFIG_NET_NS */
};

const char*
netmap_bdg_name(struct netmap_vp_adapter *vp)
{
	struct nm_bridge *b = vp->na_bdg;
	if (b == NULL)
		return NULL;
	return b->bdg_basename;
}


#ifndef CONFIG_NET_NS
/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge *nm_bridges;
#endif /* !CONFIG_NET_NS */


/*
 * this is a slightly optimized copy routine which rounds
 * to multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
        uint64_t *src = _src;
        uint64_t *dst = _dst;
        if (unlikely(l >= 1024)) {
                memcpy(dst, src, l);
                return;
        }
        for (; likely(l > 0); l-=64) {
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
        }
}
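
/*
 * Usage note (illustrative): the caller rounds the length up, so both
 * buffers must have room for the rounded size. E.g. a 60-byte frame is
 * copied as 64 bytes:
 *
 *	len = (60 + 63) & ~63;		// -> 64, one loop iteration
 *	pkt_copy(src, dst, len);	// copies 64 bytes
 *
 * This is exactly the rounding that nm_bdg_flush() performs before
 * calling pkt_copy().
 */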

/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
 * We assume that this is called with a name of at least NM_NAME chars.
 */
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, l, namelen;
	struct nm_bridge *b = NULL, *bridges;
	u_int num_bridges;

	NMG_LOCK_ASSERT();

	netmap_bns_getbridges(&bridges, &num_bridges);

	namelen = strlen(NM_NAME);	/* base length */
	l = name ? strlen(name) : 0;		/* actual length */
	if (l < namelen) {
		D("invalid bridge name %s", name ? name : "(null)");
		return NULL;
	}
	for (i = namelen + 1; i < l; i++) {
		if (name[i] == ':') {
			namelen = i;
			break;
		}
	}
	if (namelen >= IFNAMSIZ)
		namelen = IFNAMSIZ;
	ND("--- prefix is '%.*s' ---", namelen, name);

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < num_bridges; i++) {
		struct nm_bridge *x = bridges + i;

		if (x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == num_bridges && b) { /* name not found, can create entry */
		/* initialize the bridge */
		strncpy(b->bdg_basename, name, namelen);
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->bdg_ops.lookup = netmap_bdg_learning;
		/* reset the MAC address table */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
		NM_BNS_GET(b);
	}
	return b;
}


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i].nkr_ft) {
			free(kring[i].nkr_ft, M_DEVBUF);
			kring[i].nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = netmap_real_rings(na, NR_TX);
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i].nkr_ft = ft;
	}
	return 0;
}


/* remove from bridge b the ports in slots hw and sw
 * (sw can be -1 if not needed)
 */
static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	New algorithm:
	make a copy of bdg_port_index;
	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	in the array of bdg_port_index, replacing them with
	entries from the bottom of the array;
	decrement bdg_active_ports;
	acquire BDG_WLOCK() and copy back the array.
	 */

	if (netmap_verbose)
		D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--; /* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	if (b->bdg_ops.dtor)
		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		bzero(&b->bdg_ops, sizeof(b->bdg_ops));
		NM_BNS_PUT(b);
	}
}

/* nm_bdg_ctl callback for VALE ports */
static int
netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
	struct nm_bridge *b = vpna->na_bdg;

	if (attach)
		return 0; /* nothing to do */
	if (b) {
		netmap_set_all_rings(na, 0 /* disable */);
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
		vpna->na_bdg = NULL;
		netmap_set_all_rings(na, 1 /* enable */);
	}
	/* we took a reference just for the attach */
	netmap_adapter_put(na);
	return 0;
}

/* nm_dtor callback for ephemeral VALE ports */
static void
netmap_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;

	ND("%s has %d references", na->name, na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}
}

/* remove a persistent VALE port from the system */
static int
nm_vi_destroy(const char *name)
{
	struct ifnet *ifp;
	int error;

	ifp = ifunit_ref(name);
	if (!ifp)
		return ENXIO;
	NMG_LOCK();
	/* make sure this is actually a VALE port */
	if (!NETMAP_CAPABLE(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
		error = EINVAL;
		goto err;
	}

	if (NA(ifp)->na_refcount > 1) {
		error = EBUSY;
		goto err;
	}
	NMG_UNLOCK();

	D("destroying a persistent vale interface %s", ifp->if_xname);
	/* Linux requires all the references are released
	 * before unregister
	 */
	if_rele(ifp);
	netmap_detach(ifp);
	nm_vi_detach(ifp);
	return 0;

err:
	NMG_UNLOCK();
	if_rele(ifp);
	return error;
}

/*
 * Create a virtual interface registered to the system.
 * The interface will be attached to a bridge later.
 */
static int
nm_vi_create(struct nmreq *nmr)
{
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	int error;

	/* don't include VALE prefix */
	if (!strncmp(nmr->nr_name, NM_NAME, strlen(NM_NAME)))
		return EINVAL;
	ifp = ifunit_ref(nmr->nr_name);
	if (ifp) { /* already exists, cannot create a new one */
		if_rele(ifp);
		return EEXIST;
	}
	error = nm_vi_persist(nmr->nr_name, &ifp);
	if (error)
		return error;

	NMG_LOCK();
	/* netmap_vp_create creates a struct netmap_vp_adapter */
	error = netmap_vp_create(nmr, ifp, &vpna);
	if (error) {
		D("error %d", error);
		NMG_UNLOCK();	/* don't leak the global lock on failure */
		nm_vi_detach(ifp);
		return error;
	}
	/* persist-specific routines */
	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
	netmap_adapter_get(&vpna->up);
	NMG_UNLOCK();
	D("created %s", ifp->if_xname);
	return 0;
}
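
/*
 * Userspace sketch (illustrative): creating and destroying a persistent
 * port goes through /dev/netmap with nr_cmd set as below; "myport" is a
 * made-up name. This is essentially what the vale-ctl tool does.
 *
 *	#include <net/netmap.h>
 *	#include <net/netmap_user.h>
 *
 *	struct nmreq nmr = { .nr_version = NETMAP_API };
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	strncpy(nmr.nr_name, "myport", sizeof(nmr.nr_name));
 *	nmr.nr_cmd = NETMAP_BDG_NEWIF;		// nm_vi_create() path
 *	ioctl(fd, NIOCREGIF, &nmr);		// creates the interface
 *	// ...
 *	nmr.nr_cmd = NETMAP_BDG_DELIF;		// nm_vi_destroy() path
 *	ioctl(fd, NIOCREGIF, &nmr);		// removes it again
 */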

/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a VALE prefix
 * mismatch. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	char *nr_name = nmr->nr_name;
	const char *ifname;
	struct ifnet *ifp;
	int error = 0;
	struct netmap_vp_adapter *vpna, *hostna = NULL;
	struct nm_bridge *b;
	int i, j, cand = -1, cand2 = -1;
	int needed;

	*na = NULL;     /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(nr_name, NM_NAME, sizeof(NM_NAME) - 1)) {
		return 0;  /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(nr_name, create);
	if (b == NULL) {
		D("no bridges available for '%s'", nr_name);
		return (create ? ENOMEM : ENXIO);
	}
	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
		panic("x");

	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		// KASSERT(na != NULL);
		ND("checking %s", vpna->up.name);
		if (!strcmp(vpna->up.name, nr_name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", nr_name,
				vpna->up.na_refcount);
			*na = &vpna->up;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return ENOMEM;
	}
	/* record the next two ports available, but do not allocate yet */
	ifname = nr_name + b->bdg_namelen + 1;	/* the port name */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifp = ifunit_ref(ifname);
	if (!ifp) {
		/* Create an ephemeral virtual port.
		 * This block contains all the ephemeral-specific logic.
		 */
		if (nmr->nr_cmd) {
			/* nr_cmd must be 0 for a virtual port */
			return EINVAL;
		}

		/* netmap_vp_create creates a struct netmap_vp_adapter */
		error = netmap_vp_create(nmr, NULL, &vpna);
		if (error) {
			D("error %d", error);
			return error;
		}
		/* shortcut - we can skip get_hw_na(),
		 * ownership check and nm_bdg_attach()
		 */
	} else {
		struct netmap_adapter *hw;

		error = netmap_get_hw_na(ifp, &hw);
		if (error || hw == NULL)
			goto out;

		/* host adapter might not be created */
		error = hw->nm_bdg_attach(nr_name, hw);
		if (error)
			goto out;
		vpna = hw->na_vp;
		hostna = hw->na_hostvp;
		if_rele(ifp);
		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
			hostna = NULL;
	}

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC  %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (hostna != NULL) {
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", ifname, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = &vpna->up;
	netmap_adapter_get(*na);
	return 0;

out:
	if_rele(ifp);

	return error;
}


/* Process NETMAP_BDG_ATTACH */
static int
nm_bdg_ctl_attach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();

	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
	if (error) /* no device */
		goto unlock_exit;

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (NETMAP_OWNED_BY_ANY(na)) {
		error = EBUSY;
		goto unref_exit;
	}

	if (na->nm_bdg_ctl) {
		/* nop for VALE ports. The bwrap needs to put the hwna
		 * in netmap mode (see netmap_bwrap_bdg_ctl)
		 */
		error = na->nm_bdg_ctl(na, nmr, 1);
		if (error)
			goto unref_exit;
		ND("registered %s to netmap-mode", na->name);
	}
	NMG_UNLOCK();
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}

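/*
 * Userspace sketch (illustrative) of the attach/detach requests handled
 * below; "vale0:em1" is a made-up switch:interface pair. Attaching a NIC
 * creates the bwrap described later in this file.
 *
 *	struct nmreq nmr = { .nr_version = NETMAP_API };
 *	strncpy(nmr.nr_name, "vale0:em1", sizeof(nmr.nr_name));
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *	nmr.nr_arg1 = NETMAP_BDG_HOST;	// optional: also attach host rings
 *	ioctl(fd, NIOCREGIF, &nmr);	// fd is an open /dev/netmap
 *	// ...
 *	nmr.nr_cmd = NETMAP_BDG_DETACH;
 *	ioctl(fd, NIOCREGIF, &nmr);
 */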

/* process NETMAP_BDG_DETACH */
static int
nm_bdg_ctl_detach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (na->nm_bdg_ctl) {
		/* remove the port from bridge. The bwrap
		 * also needs to put the hwna in normal mode
		 */
		error = na->nm_bdg_ctl(na, nmr, 0);
	}

	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;

}


/* Called either from a user context (netmap_ioctl())
 * or by external kernel modules (e.g., Openvswitch).
 * Operation is indicated in nmr->nr_cmd.
 * NETMAP_BDG_REGOPS, which sets the configure/lookup/dtor functions of
 * the bridge, requires the bdg_ops argument; the other commands ignore it.
 *
 * Called without NMG_LOCK.
 */
int
netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
{
	struct nm_bridge *b, *bridges;
	struct netmap_adapter *na;
	struct netmap_vp_adapter *vpna;
	char *name = nmr->nr_name;
	int cmd = nmr->nr_cmd, namelen = strlen(name);
	int error = 0, i, j;
	u_int num_bridges;

	netmap_bns_getbridges(&bridges, &num_bridges);

	switch (cmd) {
	case NETMAP_BDG_NEWIF:
		error = nm_vi_create(nmr);
		break;

	case NETMAP_BDG_DELIF:
		error = nm_vi_destroy(nmr->nr_name);
		break;

	case NETMAP_BDG_ATTACH:
		error = nm_bdg_ctl_attach(nmr);
		break;

	case NETMAP_BDG_DETACH:
		error = nm_bdg_ctl_detach(nmr);
		break;

	case NETMAP_BDG_LIST:
		/* this is used to enumerate bridges and ports */
		if (namelen) { /* look up indexes of bridge and port */
			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
				error = EINVAL;
				break;
			}
			NMG_LOCK();
			b = nm_find_bridge(name, 0 /* don't create */);
			if (!b) {
				error = ENOENT;
				NMG_UNLOCK();
				break;
			}

			error = ENOENT;
			for (j = 0; j < b->bdg_active_ports; j++) {
				i = b->bdg_port_index[j];
				vpna = b->bdg_ports[i];
				if (vpna == NULL) {
					D("---AAAAAAAAARGH-------");
					continue;
				}
				/* the former and the latter identify a
				 * virtual port and a NIC, respectively
				 */
				if (!strcmp(vpna->up.name, name)) {
					/* bridge index */
					nmr->nr_arg1 = b - bridges;
					nmr->nr_arg2 = i; /* port index */
					error = 0;
					break;
				}
			}
			NMG_UNLOCK();
		} else {
			/* return the first non-empty entry starting from
			 * bridge nr_arg1 and port nr_arg2.
			 *
			 * Users can detect the end of the same bridge by
			 * seeing the new and old value of nr_arg1, and can
			 * detect the end of all the bridges by error != 0
			 */
			i = nmr->nr_arg1;
			j = nmr->nr_arg2;

			NMG_LOCK();
			for (error = ENOENT; i < NM_BRIDGES; i++) {
				b = bridges + i;
				if (j >= b->bdg_active_ports) {
					j = 0; /* following bridges scan from 0 */
					continue;
				}
				nmr->nr_arg1 = i;
				nmr->nr_arg2 = j;
				j = b->bdg_port_index[j];
				vpna = b->bdg_ports[j];
				strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
				error = 0;
				break;
			}
			NMG_UNLOCK();
		}
		break;

	case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
		/* register callbacks to the given bridge.
		 * nmr->nr_name may be just bridge's name (including ':'
		 * if it is not just NM_NAME).
		 */
		if (!bdg_ops) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		b = nm_find_bridge(name, 0 /* don't create */);
		if (!b) {
			error = EINVAL;
		} else {
			b->bdg_ops = *bdg_ops;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_VNET_HDR:
		/* Valid lengths for the virtio-net header are 0 (no header),
		   10 and 12. */
		if (nmr->nr_arg1 != 0 &&
			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
				nmr->nr_arg1 != 12) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, 0);
		if (na && !error) {
			vpna = (struct netmap_vp_adapter *)na;
			vpna->virt_hdr_len = nmr->nr_arg1;
			if (vpna->virt_hdr_len)
				vpna->mfs = NETMAP_BUF_SIZE(na);
			D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
			netmap_adapter_put(na);
		}
		NMG_UNLOCK();
		break;

	default:
		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
		error = EINVAL;
		break;
	}
	return error;
}
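
/*
 * Illustrative userspace loop for NETMAP_BDG_LIST (no port name given):
 * iterate until the kernel returns an error, resuming from the
 * (nr_arg1, nr_arg2) cursor that each call writes back.
 *
 *	struct nmreq nmr;
 *	memset(&nmr, 0, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	nmr.nr_cmd = NETMAP_BDG_LIST;
 *	while (ioctl(fd, NIOCGINFO, &nmr) != -1) {
 *		printf("bridge %u port %u: %s\n",
 *		    nmr.nr_arg1, nmr.nr_arg2, nmr.nr_name);
 *		nmr.nr_name[0] = '\0';	// keep enumerating, not looking up
 *		nmr.nr_arg2++;		// next port on the same bridge
 *	}
 */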

int
netmap_bdg_config(struct nmreq *nmr)
{
	struct nm_bridge *b;
	int error = EINVAL;

	NMG_LOCK();
	b = nm_find_bridge(nmr->nr_name, 0);
	if (!b) {
		NMG_UNLOCK();
		return error;
	}
	NMG_UNLOCK();
	/* Don't call config() with NMG_LOCK() held */
	BDG_RLOCK(b);
	if (b->bdg_ops.config != NULL)
		error = b->bdg_ops.config((struct nm_ifreq *)nmr);
	BDG_RUNLOCK(b);
	return error;
}


/* nm_krings_create callback for VALE ports.
 * Calls the standard netmap_krings_create, then adds leases on rx
 * rings and bdgfwd on tx rings.
 */
static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int tailroom;
	int error, i;
	uint32_t *leases;
	u_int nrx = netmap_real_rings(na, NR_RX);

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i].nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


/* nm_krings_delete callback for VALE ports. */
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * main dispatch routine for the bridge.
 * Grab packets from a kring, move them into the ft structure
 * associated to the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_kring *kring, u_int end)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter*)kring->na;
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int ring_nr = kring->ring_id;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return 0;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;

		ND("flags is 0x%x", slot->flags);
		/* we do not use the buf changed flag, but we still need to reset it */
		slot->flags &= ~NS_BUF_CHANGED;

		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
		if (unlikely(buf == NULL)) {
			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
				kring->name, j, ft[ft_i].ft_len);
			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
			ft[ft_i].ft_len = 0;
			ft[ft_i].ft_flags = 0;
		}
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			RD(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
		// ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags - 1;
	}
	if (ft_i)
		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)                                                    \
do {                                                                    \
        a -= b; a -= c; a ^= (c >> 13);                                 \
        b -= c; b -= a; b ^= (a << 8);                                  \
        c -= a; c -= b; c ^= (b >> 13);                                 \
        a -= b; a -= c; a ^= (c >> 12);                                 \
        b -= c; b -= a; b ^= (a << 16);                                 \
        c -= a; c -= b; c ^= (b >> 5);                                  \
        a -= b; a -= c; a ^= (c >> 3);                                  \
        b -= c; b -= a; b ^= (a << 10);                                 \
        c -= a; c -= b; c ^= (b >> 15);                                 \
} while (/*CONSTCOND*/0)


static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
        uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

        b += addr[5] << 8;
        b += addr[4];
        a += addr[3] << 24;
        a += addr[2] << 16;
        a += addr[1] << 8;
        a += addr[0];

        mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
        return (c & BRIDGE_RTHASH_MASK);
}

#undef mix


/* nm_register callback for VALE ports */
static int
netmap_vp_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_vp_adapter *vpna =
		(struct netmap_vp_adapter*)na;

	/* persistent ports may be put in netmap mode
	 * before being attached to a bridge
	 */
	if (vpna->na_bdg)
		BDG_WLOCK(vpna->na_bdg);
	if (onoff) {
		na->na_flags |= NAF_NETMAP_ON;
		 /* XXX on FreeBSD, persistent VALE ports should also
		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
		 */
	} else {
		na->na_flags &= ~NAF_NETMAP_ON;
	}
	if (vpna->na_bdg)
		BDG_WUNLOCK(vpna->na_bdg);
	return 0;
}


/*
 * Lookup function for a learning bridge.
 * Updates the hash table with the source address,
 * and then returns the destination port index, and the
 * ring in *dst_ring (at the moment, always use ring 0)
 */
u_int
netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
		struct netmap_vp_adapter *na)
{
	uint8_t *buf = ft->ft_buf;
	u_int buf_len = ft->ft_len;
	struct nm_hash_ent *ht = na->na_bdg->ht;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;

	/* safety check, unfortunately we have many cases */
	if (buf_len >= 14 + na->virt_hdr_len) {
		/* virthdr + mac_hdr in the same slot */
		buf += na->virt_hdr_len;
		buf_len -= na->virt_hdr_len;
	} else if (buf_len == na->virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
		/* only header in first fragment */
		ft++;
		buf = ft->ft_buf;
		buf_len = ft->ft_len;
	} else {
		RD(5, "invalid buf format, length %d", buf_len);
		return NM_BDG_NOPORT;
	}
	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive, there might be some
	 * worthwhile optimizations here.
	 */
	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); // XXX hash of source
		/* update source port forwarding entry */
		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); // XXX hash of dst
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
		/* XXX otherwise return NM_BDG_UNKNOWN ? */
	}
	return dst;
}
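
/*
 * The lookup function can be replaced per-bridge via NETMAP_BDG_REGOPS
 * (see netmap_bdg_ctl() above). A minimal sketch of a custom policy,
 * with made-up names, that pins all traffic to port 1 regardless of the
 * MAC addresses:
 *
 *	static u_int
 *	my_static_lookup(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
 *		struct netmap_vp_adapter *na)
 *	{
 *		(void)ft;
 *		*dst_ring = 0;			// always deliver to ring 0
 *		return (na->bdg_port == 1) ?
 *			NM_BDG_NOPORT : 1;	// never reflect to the source
 *	}
 *
 *	struct netmap_bdg_ops ops = { .lookup = my_static_lookup };
 *	// from a kernel module: netmap_bdg_ctl(&nmr, &ops) with
 *	// nmr.nr_cmd == NETMAP_BDG_REGOPS and nmr.nr_name == "vale0:"
 */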


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_tail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}




/* make a lease on the kring for N positions. return the
 * lease index
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

	if (n > nm_kr_space(k, is_rx)) {
		D("invalid request for %d slots", n);
		panic("x");
	}
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->name,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
	return lease_idx;
}
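
/*
 * Worked example (numbers are illustrative): on a 1024-slot ring with
 * nr_hwcur == 100 and nkr_hwlease == 100, two concurrent senders lease
 * 10 and 5 slots:
 *
 *	i1 = nm_kr_lease(k, 10, 1);	// slots 100..109, hwlease -> 110
 *	i2 = nm_kr_lease(k, 5, 1);	// slots 110..114, hwlease -> 115
 *
 * Each sender copies without holding q_lock, then stores its final
 * position in nkr_leases[i1] / nkr_leases[i2]. nr_hwtail only advances
 * over a prefix of completed leases, so if the second sender finishes
 * first, its lease stays parked until the first one completes; see the
 * completion scan at the end of nm_bdg_flush() below.
 */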

/*
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
		u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, j, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		/* Drop the packet if the virtio-net header is not contained
		   in the first fragment nor at the very beginning of the
		   second. */
		if (unlikely(na->virt_hdr_len > ft[i].ft_len))
			continue;
		dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port == NM_BDG_NOPORT)
			continue; /* this packet is identified to be dropped */
		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
			continue;
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations */
	for (i = 0; i < num_dsts; i++) {
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!nm_netmap_on(&dst_na->up))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a chance
		 * that we may not use all of the slots we have claimed, and
		 * we will need to handle the leftover ones when we regain
		 * the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
			RD(3, "virt_hdr_mismatch, src %d dst %d", na->virt_hdr_len, dst_na->virt_hdr_len);
			/* There is a virtio-net header/offloadings mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * na->mfs
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				needed = (needed * na->mfs) /
						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}

		ND(5, "pass 2 dst %d is %x %s",
			i, d_i, is_vp ? "virtual" : "nic/host");
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = &dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		lim = kring->nkr_num_slots - 1;

retry:

		if (dst_na->retry && retry) {
			/* try to get some free slot from the previous run */
			kring->nm_notify(kring, 0);
			/* actually useful only for bwraps, since there
			 * the notify will trigger a txsync on the hwna. VALE ports
			 * have dst_na->retry == 0
			 */
		}
		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;

			/* find the queue from which we pick next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
			    break; /* no more space */
			if (netmap_verbose && cnt > 1)
				RD(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			if (unlikely(virt_hdr_mismatch)) {
				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
			} else {
				howmany -= cnt;
				do {
					char *dst, *src = ft_p->ft_buf;
					size_t copy_len = ft_p->ft_len, dst_len = copy_len;

					slot = &ring->slot[j];
					dst = NMB(&dst_na->up, slot);

					ND("send [%d] %d(%d) bytes at %s:%d",
							i, (int)copy_len, (int)dst_len,
							NM_IFPNAME(dst_ifp), j);
					/* round to a multiple of 64 */
					copy_len = (copy_len + 63) & ~63;

					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
						     copy_len > NETMAP_BUF_SIZE(&na->up))) {
						RD(5, "invalid len %d, down to 64", (int)copy_len);
						copy_len = dst_len = 64; // XXX
					}
					if (ft_p->ft_flags & NS_INDIRECT) {
						if (copyin(src, dst, copy_len)) {
							// invalid user pointer, pretend len is 0
							dst_len = 0;
						}
					} else {
						//memcpy(dst, src, copy_len);
						pkt_copy(src, dst, (int)copy_len);
					}
					slot->len = dst_len;
					slot->flags = (cnt << 8) | NS_MOREFRAG;
					j = nm_next(j, lim);
					needed--;
					ft_p++;
				} while (ft_p != ft_end);
				slot->flags = (cnt << 8); /* clear flag on last entry */
			}
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
		    /* current position */
		    uint32_t *p = kring->nkr_leases; /* shorthand */
		    uint32_t update_pos;
		    int still_locked = 1;

		    mtx_lock(&kring->q_lock);
		    if (unlikely(howmany > 0)) {
			/* we did not use all the buffers. If we are the
			 * last lease we can recover the slots, otherwise
			 * we must fill them with len 0 to mark empty packets.
			 */
			ND("leftover %d bufs", howmany);
			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
			    /* yes, we are the last one */
			    ND("roll back nkr_hwlease to %d", j);
			    kring->nkr_hwlease = j;
			} else {
			    while (howmany-- > 0) {
				ring->slot[j].len = 0;
				ring->slot[j].flags = 0;
				j = nm_next(j, lim);
			    }
			}
		    }
		    p[lease_idx] = j; /* report I am done */

		    update_pos = kring->nr_hwtail;

		    if (my_start == update_pos) {
			/* all slots before my_start have been reported,
			 * so scan subsequent leases to see if other ranges
			 * have been completed, and do a selwakeup or txsync.
			 */
			while (lease_idx != kring->nkr_lease_idx &&
				p[lease_idx] != NR_NOSLOT) {
			    j = p[lease_idx];
			    p[lease_idx] = NR_NOSLOT;
			    lease_idx = nm_next(lease_idx, lim);
			}
			/* j is the new 'write' position. j != my_start
			 * means there are new buffers to report
			 */
			if (likely(j != my_start)) {
				kring->nr_hwtail = j;
				still_locked = 0;
				mtx_unlock(&kring->q_lock);
				kring->nm_notify(kring, 0);
				/* this is netmap_notify for VALE ports and
				 * netmap_bwrap_notify for bwrap. The latter will
				 * trigger a txsync on the underlying hwna
				 */
				if (dst_na->retry && retry--) {
					/* XXX this is going to call nm_notify again.
					 * Only useful for bwrap in virtual machines
					 */
					goto retry;
				}
			}
		    }
		    if (still_locked)
			mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}
1647
1648/* nm_txsync callback for VALE ports */
1649static int
1650netmap_vp_txsync(struct netmap_kring *kring, int flags)
1651{
1652	struct netmap_vp_adapter *na =
1653		(struct netmap_vp_adapter *)kring->na;
1654	u_int done;
1655	u_int const lim = kring->nkr_num_slots - 1;
1656	u_int const head = kring->rhead;
1657
1658	if (bridge_batch <= 0) { /* testing only */
1659		done = head; // used all
1660		goto done;
1661	}
1662	if (!na->na_bdg) {
1663		done = head;
1664		goto done;
1665	}
1666	if (bridge_batch > NM_BDG_BATCH)
1667		bridge_batch = NM_BDG_BATCH;
1668
1669	done = nm_bdg_preflush(kring, head);
1670done:
1671	if (done != head)
1672		D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
1673	/*
1674	 * packets between 'done' and 'cur' are left unsent.
1675	 */
1676	kring->nr_hwcur = done;
1677	kring->nr_hwtail = nm_prev(done, lim);
1678	if (netmap_verbose)
1679		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
1680	return 0;
1681}
1682
1683
1684/* rxsync code used by VALE ports nm_rxsync callback and also
1685 * internally by the brwap
1686 */
1687static int
1688netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
1689{
1690	struct netmap_adapter *na = kring->na;
1691	struct netmap_ring *ring = kring->ring;
1692	u_int nm_i, lim = kring->nkr_num_slots - 1;
1693	u_int head = kring->rhead;
1694	int n;
1695
1696	if (head > lim) {
1697		D("ouch dangerous reset!!!");
1698		n = netmap_ring_reinit(kring);
1699		goto done;
1700	}
1701
1702	/* First part, import newly received packets. */
1703	/* actually nothing to do here, they are already in the kring */
1704
1705	/* Second part, skip past packets that userspace has released. */
1706	nm_i = kring->nr_hwcur;
1707	if (nm_i != head) {
1708		/* consistency check, but nothing really important here */
1709		for (n = 0; likely(nm_i != head); n++) {
1710			struct netmap_slot *slot = &ring->slot[nm_i];
1711			void *addr = NMB(na, slot);
1712
1713			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
1714				D("bad buffer index %d, ignore ?",
1715					slot->buf_idx);
1716			}
1717			slot->flags &= ~NS_BUF_CHANGED;
1718			nm_i = nm_next(nm_i, lim);
1719		}
1720		kring->nr_hwcur = head;
1721	}
1722
1723	n = 0;
1724done:
1725	return n;
1726}
1727
1728/*
1729 * nm_rxsync callback for VALE ports
1730 * user process reading from a VALE switch.
1731 * Already protected against concurrent calls from userspace,
1732 * but we must acquire the queue's lock to protect against
1733 * writers on the same queue.
1734 */
1735static int
1736netmap_vp_rxsync(struct netmap_kring *kring, int flags)
1737{
1738	int n;
1739
1740	mtx_lock(&kring->q_lock);
1741	n = netmap_vp_rxsync_locked(kring, flags);
1742	mtx_unlock(&kring->q_lock);
1743	return n;
1744}
1745
1746
1747/* nm_bdg_attach callback for VALE ports
1748 * The na_vp port is this same netmap_adapter. There is no host port.
1749 */
1750static int
1751netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
1752{
1753	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
1754
1755	if (vpna->na_bdg)
1756		return EBUSY;
1757	na->na_vp = vpna;
1758	strncpy(na->name, name, sizeof(na->name));
1759	na->na_hostvp = NULL;
1760	return 0;
1761}
1762
1763/* create a netmap_vp_adapter that describes a VALE port.
1764 * Only persistent VALE ports have a non-null ifp.
1765 */
1766static int
1767netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter **ret)
1768{
1769	struct netmap_vp_adapter *vpna;
1770	struct netmap_adapter *na;
1771	int error;
1772	u_int npipes = 0;
1773
1774	vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
1775	if (vpna == NULL)
1776		return ENOMEM;
1777
1778 	na = &vpna->up;
1779
1780	na->ifp = ifp;
1781	strncpy(na->name, nmr->nr_name, sizeof(na->name));
1782
1783	/* bound checking */
1784	na->num_tx_rings = nmr->nr_tx_rings;
1785	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1786	nmr->nr_tx_rings = na->num_tx_rings; // write back
1787	na->num_rx_rings = nmr->nr_rx_rings;
1788	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1789	nmr->nr_rx_rings = na->num_rx_rings; // write back
1790	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1791			1, NM_BDG_MAXSLOTS, NULL);
1792	na->num_tx_desc = nmr->nr_tx_slots;
1793	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1794			1, NM_BDG_MAXSLOTS, NULL);
1795	/* validate number of pipes. We want at least 1,
1796	 * but probably can do with some more.
1797	 * So let's use 2 as default (when 0 is supplied)
1798	 */
1799	npipes = nmr->nr_arg1;
1800	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
1801	nmr->nr_arg1 = npipes;	/* write back */
1802	/* validate extra bufs */
1803	nm_bound_var(&nmr->nr_arg3, 0, 0,
1804			128*NM_BDG_MAXSLOTS, NULL);
1805	na->num_rx_desc = nmr->nr_rx_slots;
1806	vpna->virt_hdr_len = 0;
1807	vpna->mfs = 1514;
1808	vpna->last_smac = ~0llu;
1809	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
1810		vpna->mfs = netmap_buf_size; */
1811        if (netmap_verbose)
1812		D("max frame size %u", vpna->mfs);
1813
1814	na->na_flags |= NAF_BDG_MAYSLEEP;
1815	/* persistent VALE ports look like hw devices
1816	 * with a native netmap adapter
1817	 */
1818	if (ifp)
1819		na->na_flags |= NAF_NATIVE;
1820	na->nm_txsync = netmap_vp_txsync;
1821	na->nm_rxsync = netmap_vp_rxsync;
1822	na->nm_register = netmap_vp_reg;
1823	na->nm_krings_create = netmap_vp_krings_create;
1824	na->nm_krings_delete = netmap_vp_krings_delete;
1825	na->nm_dtor = netmap_vp_dtor;
1826	na->nm_mem = netmap_mem_private_new(na->name,
1827			na->num_tx_rings, na->num_tx_desc,
1828			na->num_rx_rings, na->num_rx_desc,
1829			nmr->nr_arg3, npipes, &error);
1830	if (na->nm_mem == NULL)
1831		goto err;
1832	na->nm_bdg_attach = netmap_vp_bdg_attach;
1833	/* other nmd fields are set in the common routine */
1834	error = netmap_attach_common(na);
1835	if (error)
1836		goto err;
1837	*ret = vpna;
1838	return 0;
1839
1840err:
1841	if (na->nm_mem != NULL)
1842		netmap_mem_delete(na->nm_mem);
1843	free(vpna, M_DEVBUF);
1844	return error;
1845}
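
/* For reference, nm_bound_var() (defined in netmap.c) validates the
 * values above roughly as in this sketch: a value below the lower
 * bound is bumped up to the default, a value above the upper bound is
 * clamped, and the msg argument (NULL here) only controls logging.
 *
 *	static u_int
 *	bound_var_sketch(u_int v, u_int dflt, u_int lo, u_int hi)
 *	{
 *		if (v < lo)
 *			return dflt;	// e.g. 0 rings -> 1 by default
 *		if (v > hi)
 *			return hi;	// e.g. clamp to NM_BDG_MAXRINGS
 *		return v;
 *	}
 *
 * so nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL) turns 0 into 2 and
 * leaves any value already in [1, NM_MAXPIPES] untouched.
 */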
1846
1847/* Bridge wrapper code (bwrap).
1848 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
1849 * VALE switch.
1850 * The main task is to swap the meaning of tx and rx rings to match the
1851 * expectations of the VALE switch code (see nm_bdg_flush).
1852 *
1853 * The bwrap works by interposing a netmap_bwrap_adapter between the
1854 * rest of the system and the hwna. The netmap_bwrap_adapter looks like
1855 * a netmap_vp_adapter to the rest of the system, but, internally, it
1856 * translates all callbacks to what the hwna expects.
1857 *
1858 * Note that we have to intercept callbacks coming from two sides:
1859 *
1860 *  - callbacks coming from the netmap module are intercepted by
1861 *    passing around the netmap_bwrap_adapter instead of the hwna
1862 *
1863 *  - callbacks coming from outside of the netmap module only know
1864 *    about the hwna. This, however, only happens in interrupt
1865 *    handlers, where only the hwna->nm_notify callback is called.
1866 *    What the bwrap does is to overwrite the hwna->nm_notify callback
1867 *    with its own netmap_bwrap_intr_notify.
1868 *    XXX This assumes that the hwna->nm_notify callback was the
1869 *    standard netmap_notify(), as is the case for NIC adapters.
1870 *    Any additional action performed by hwna->nm_notify will not be
1871 *    performed by netmap_bwrap_intr_notify.
1872 *
1873 * Additionally, the bwrap can optionally attach the host rings pair
1874 * of the wrapped adapter to a different port of the switch.
1875 */
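
/* The tx/rx swap mentioned above is expressed with the nm_txrx_swap()
 * helper from netmap_kern.h, which amounts to the following sketch:
 *
 *	static inline enum txrx
 *	nm_txrx_swap(enum txrx t)	// NR_TX <-> NR_RX
 *	{
 *		return (t == NR_RX ? NR_TX : NR_RX);
 *	}
 *
 * A packet the hwna receives on rx ring i is thus handed to the VALE
 * switch as if it had been posted on bwrap tx ring i.
 */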
1876
1877
1878static void
1879netmap_bwrap_dtor(struct netmap_adapter *na)
1880{
1881	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1882	struct netmap_adapter *hwna = bna->hwna;
1883
1884	ND("na %p", na);
1885	/* drop reference to hwna->ifp.
1886	 * If we don't do this, netmap_detach_common(na)
1887	 * will think it has set NA(na->ifp) to NULL
1888	 */
1889	na->ifp = NULL;
1890	/* for safety, also drop the possible reference
1891	 * in the hostna
1892	 */
1893	bna->host.up.ifp = NULL;
1894
1895	hwna->nm_mem = bna->save_nmd;
1896	hwna->na_private = NULL;
1897	hwna->na_vp = hwna->na_hostvp = NULL;
1898	hwna->na_flags &= ~NAF_BUSY;
1899	netmap_adapter_put(hwna);
1900
1901}
1902
1903
1904/*
1905 * Intr callback for NICs connected to a bridge.
1906 * Simply ignore tx interrupts (maybe we could try to recover space?)
1907 * and pass received packets from nic to the bridge.
1908 *
1909 * XXX TODO check locking: this is called from the interrupt
1910 * handler so we should make sure that the interface is not
1911 * disconnected while passing down an interrupt.
1912 *
1913 * Note that no user process can access this NIC or the host stack.
1914 * The only significant part of the ring is the slots array;
1915 * head/cur/tail are set from the kring as needed
1916 * (partly as a receive ring, partly as a transmit ring).
1917 *
1918 * callback that overwrites the hwna notify callback.
1919 * Packets come from the outside or from the host stack and are put on an hwna rx ring.
1920 * The bridge wrapper then sends the packets through the bridge.
1921 */
1922static int
1923netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
1924{
1925	struct netmap_adapter *na = kring->na;
1926	struct netmap_bwrap_adapter *bna = na->na_private;
1927	struct netmap_kring *bkring;
1928	struct netmap_vp_adapter *vpna = &bna->up;
1929	u_int ring_nr = kring->ring_id;
1930	int error = 0;
1931
1932	if (netmap_verbose)
1933		D("%s %s 0x%x", na->name, kring->name, flags);
1934
1935	if (!nm_netmap_on(na))
1936		return 0;
1937
1938	bkring = &vpna->up.tx_rings[ring_nr];
1939
1940	/* make sure the ring is not disabled */
1941	if (nm_kr_tryget(kring))
1942		return 0;
1943
1944	if (netmap_verbose)
1945		D("%s head %d cur %d tail %d", na->name,
1946			kring->rhead, kring->rcur, kring->rtail);
1947
1948	/* simulate a user wakeup on the rx ring:
1949	 * fetch packets that have arrived.
1950	 */
1951	error = kring->nm_sync(kring, 0);
1952	if (error)
1953		goto put_out;
1954	if (kring->nr_hwcur == kring->nr_hwtail) {
1955		if (netmap_verbose)
1956			D("how strange, interrupt with no packets on %s", na->name);
1957		goto put_out;	/* bail out also when not verbose */
1958	}
1959
1960	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
1961	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
1962	 * to push all packets out.
1963	 */
1964	bkring->rhead = bkring->rcur = kring->nr_hwtail;
1965
1966	netmap_vp_txsync(bkring, flags);
1967
1968	/* mark all buffers as released on this ring */
1969	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
1970	/* another call to actually release the buffers */
1971	error = kring->nm_sync(kring, 0);
1972
1973put_out:
1974	nm_kr_put(kring);
1975	return error;
1976}
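
/* Worked example for the cursor handling above. Because the hwna rx
 * kring and the bwrap tx kring are cross-linked at register time, they
 * share the same netmap_ring and slot numbering. Assume (made-up
 * numbers) nr_hwcur = 10 and, after the first nm_sync(), nr_hwtail =
 * 14: slots 10..13 hold new packets. Setting bkring->rhead =
 * bkring->rcur = 14 makes exactly those slots visible to
 * netmap_vp_txsync(); setting kring->rhead = rcur = rtail = 14 then
 * marks them consumed, so the second nm_sync() can hand the buffers
 * back to the NIC.
 */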
1977
1978
1979/* nm_register callback for bwrap */
1980static int
1981netmap_bwrap_register(struct netmap_adapter *na, int onoff)
1982{
1983	struct netmap_bwrap_adapter *bna =
1984		(struct netmap_bwrap_adapter *)na;
1985	struct netmap_adapter *hwna = bna->hwna;
1986	struct netmap_vp_adapter *hostna = &bna->host;
1987	int error;
1988	enum txrx t;
1989
1990	ND("%s %s", na->name, onoff ? "on" : "off");
1991
1992	if (onoff) {
1993		int i;
1994
1995		/* netmap_do_regif has been called on the bwrap na.
1996		 * We need to pass the information about the
1997		 * memory allocator down to the hwna before
1998		 * putting it in netmap mode
1999		 */
2000		hwna->na_lut = na->na_lut;
2001
2002		if (hostna->na_bdg) {
2003			/* if the host rings have been attached to switch,
2004			 * we need to copy the memory allocator information
2005			 * in the hostna also
2006			 */
2007			hostna->up.na_lut = na->na_lut;
2008		}
2009
2010		/* cross-link the netmap rings
2011		 * The original number of rings comes from hwna,
2012		 * rx rings on one side equals tx rings on the other.
2013		 * We need to do this now, after the initialization
2014		 * of the kring->ring pointers
2015		 */
2016		for_rx_tx(t) {
2017			enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2018			for (i = 0; i < nma_get_nrings(na, r) + 1; i++) {
2019				NMR(hwna, t)[i].nkr_num_slots = NMR(na, r)[i].nkr_num_slots;
2020				NMR(hwna, t)[i].ring = NMR(na, r)[i].ring;
2021			}
2022		}
2023	}
2024
2025	/* forward the request to the hwna */
2026	error = hwna->nm_register(hwna, onoff);
2027	if (error)
2028		return error;
2029
2030	/* impersonate a netmap_vp_adapter */
2031	netmap_vp_reg(na, onoff);
2032	if (hostna->na_bdg)
2033		netmap_vp_reg(&hostna->up, onoff);
2034
2035	if (onoff) {
2036		u_int i;
2037		/* intercept the hwna nm_notify callback on the hw rings */
2038		for (i = 0; i < hwna->num_rx_rings; i++) {
2039			hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2040			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2041		}
2042		i = hwna->num_rx_rings; /* for safety */
2043		/* save the host ring notify unconditionally */
2044		hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2045		if (hostna->na_bdg) {
2046			/* also intercept the host ring notify */
2047			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2048		}
2049	} else {
2050		u_int i;
2051		/* reset all notify callbacks (including host ring) */
2052		for (i = 0; i <= hwna->num_rx_rings; i++) {
2053			hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify;
2054			hwna->rx_rings[i].save_notify = NULL;
2055		}
2056		hwna->na_lut.lut = NULL;
2057		hwna->na_lut.objtotal = 0;
2058		hwna->na_lut.objsize = 0;
2059	}
2060
2061	return 0;
2062}
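
/* The cross-linking loop above uses the NMR() accessor from
 * netmap_kern.h to pick the kring array for one direction,
 * approximately:
 *
 *	#define NMR(na, t) \
 *		((t) == NR_TX ? (na)->tx_rings : (na)->rx_rings)
 *
 * Together with nm_txrx_swap(), kring i of the bwrap in one direction
 * ends up sharing its netmap_ring with kring i of the hwna in the
 * opposite direction (the +1 in the loop also covers the host ring).
 */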
2063
2064/* nm_config callback for bwrap */
2065static int
2066netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
2067				    u_int *rxr, u_int *rxd)
2068{
2069	struct netmap_bwrap_adapter *bna =
2070		(struct netmap_bwrap_adapter *)na;
2071	struct netmap_adapter *hwna = bna->hwna;
2072
2073	/* forward the request */
2074	netmap_update_config(hwna);
2075	/* swap the results */
2076	*txr = hwna->num_rx_rings;
2077	*txd = hwna->num_rx_desc;
2078	*rxr = hwna->num_tx_rings;
2079	*rxd = hwna->num_tx_desc;
2080
2081	return 0;
2082}
2083
2084
2085/* nm_krings_create callback for bwrap */
2086static int
2087netmap_bwrap_krings_create(struct netmap_adapter *na)
2088{
2089	struct netmap_bwrap_adapter *bna =
2090		(struct netmap_bwrap_adapter *)na;
2091	struct netmap_adapter *hwna = bna->hwna;
2092	struct netmap_adapter *hostna = &bna->host.up;
2093	int error;
2094
2095	ND("%s", na->name);
2096
2097	/* impersonate a netmap_vp_adapter */
2098	error = netmap_vp_krings_create(na);
2099	if (error)
2100		return error;
2101
2102	/* also create the hwna krings */
2103	error = hwna->nm_krings_create(hwna);
2104	if (error) {
2105		netmap_vp_krings_delete(na);
2106		return error;
2107	}
2108	/* the connection between the bwrap krings and the hwna krings
2109	 * will be performed later, in the nm_register callback, since
2110	 * at this point the kring->ring pointers have not been initialized yet
2111	 */
2112
2113	if (na->na_flags & NAF_HOST_RINGS) {
2114		/* the hostna rings are the host rings of the bwrap.
2115		 * The corresponding krings must point back to the
2116		 * hostna
2117		 */
2118		hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
2119		hostna->tx_rings[0].na = hostna;
2120		hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
2121		hostna->rx_rings[0].na = hostna;
2122	}
2123
2124	return 0;
2125}
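
/* Kring array layout assumed above (standard netmap convention): the
 * host kring sits right past the hardware krings, e.g. on the tx side:
 *
 *	tx_rings[0] .. tx_rings[num_tx_rings-1]		hardware rings
 *	tx_rings[num_tx_rings]				host ring
 *
 * which is why the hostna krings can simply alias into the bwrap kring
 * arrays instead of being allocated on their own.
 */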
2126
2127
2128static void
2129netmap_bwrap_krings_delete(struct netmap_adapter *na)
2130{
2131	struct netmap_bwrap_adapter *bna =
2132		(struct netmap_bwrap_adapter *)na;
2133	struct netmap_adapter *hwna = bna->hwna;
2134
2135	ND("%s", na->name);
2136
2137	hwna->nm_krings_delete(hwna);
2138	netmap_vp_krings_delete(na);
2139}
2140
2141
2142/* notify method for the bridge-->hwna direction */
2143static int
2144netmap_bwrap_notify(struct netmap_kring *kring, int flags)
2145{
2146	struct netmap_adapter *na = kring->na;
2147	struct netmap_bwrap_adapter *bna = na->na_private;
2148	struct netmap_adapter *hwna = bna->hwna;
2149	u_int ring_n = kring->ring_id;
2150	u_int lim = kring->nkr_num_slots - 1;
2151	struct netmap_kring *hw_kring;
2152	int error = 0;
2153
2154	ND("%s: na %s hwna %s",
2155			(kring ? kring->name : "NULL!"),
2156			(na ? na->name : "NULL!"),
2157			(hwna ? hwna->name : "NULL!"));
2158	hw_kring = &hwna->tx_rings[ring_n];
2159
2160	if (nm_kr_tryget(hw_kring))
2161		return 0;
2162
2163	if (!nm_netmap_on(hwna))
2164		goto out;	/* release the kring claimed above */
2165	/* first step: simulate a user wakeup on the rx ring */
2166	netmap_vp_rxsync(kring, flags);
2167	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2168		na->name, ring_n,
2169		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2170		kring->ring->head, kring->ring->cur, kring->ring->tail,
2171		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2172	/* second step: the new packets are sent on the tx ring
2173	 * (which is actually the same ring)
2174	 */
2175	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
2176	error = hw_kring->nm_sync(hw_kring, flags);
2177	if (error)
2178		goto out;
2179
2180	/* third step: now we are back on the rx ring */
2181	/* claim ownership on all hw owned bufs */
2182	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
2183
2184	/* fourth step: the user goes to sleep again, causing another rxsync */
2185	netmap_vp_rxsync(kring, flags);
2186	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2187		na->name, ring_n,
2188		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2189		kring->ring->head, kring->ring->cur, kring->ring->tail,
2190		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2191out:
2192	nm_kr_put(hw_kring);
2193	return error;
2194}
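
/* nm_next(), used above to skip past the reserved slot, is the usual
 * ring-index increment from netmap_kern.h, equivalent to:
 *
 *	static inline uint32_t
 *	nm_next(uint32_t i, uint32_t lim)	// lim == num_slots - 1
 *	{
 *		return (i == lim) ? 0 : i + 1;
 *	}
 */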
2195
2196
2197/* nm_bdg_ctl callback for the bwrap.
2198 * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
2199 * On attach, it needs to provide a fake netmap_priv_d structure and
2200 * perform a netmap_do_regif() on the bwrap. This will put both the
2201 * bwrap and the hwna in netmap mode, with the netmap rings shared
2202 * and cross-linked. Moreover, it will start intercepting interrupts
2203 * directed to hwna.
2204 */
2205static int
2206netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
2207{
2208	struct netmap_priv_d *npriv;
2209	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2210	int error = 0;
2211
2212	if (attach) {
2213		if (NETMAP_OWNED_BY_ANY(na)) {
2214			return EBUSY;
2215		}
2216		if (bna->na_kpriv) {
2217			/* nothing to do */
2218			return 0;
2219		}
2220		npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
2221		if (npriv == NULL)
2222			return ENOMEM;
2223		error = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags);
2224		if (error) {
2225			bzero(npriv, sizeof(*npriv));
2226			free(npriv, M_DEVBUF);
2227			return error;
2228		}
2229		bna->na_kpriv = npriv;
2230		na->na_flags |= NAF_BUSY;
2231	} else {
2232		int last_instance;
2233
2234		if (na->active_fds == 0) /* not registered */
2235			return EINVAL;
2236		last_instance = netmap_dtor_locked(bna->na_kpriv);
2237		if (!last_instance) {
2238			D("--- error, trying to detach an entry with active mmaps");
2239			error = EINVAL;
2240		} else {
2241			struct nm_bridge *b = bna->up.na_bdg,
2242				*bh = bna->host.na_bdg;
2243			npriv = bna->na_kpriv;
2244			bna->na_kpriv = NULL;
2245			D("deleting priv");
2246
2247			bzero(npriv, sizeof(*npriv));
2248			free(npriv, M_DEVBUF);
2249			if (b) {
2250				/* XXX the bwrap dtor should take care
2251				 * of this (2014-06-16)
2252				 */
2253				netmap_bdg_detach_common(b, bna->up.bdg_port,
2254				    (bh ? bna->host.bdg_port : -1));
2255			}
2256			na->na_flags &= ~NAF_BUSY;
2257		}
2258	}
2259	return error;
2260
2261}
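
/* Example invocations (flag names as in tools/tools/netmap/vale-ctl.c;
 * check that source for the authoritative syntax):
 *
 *	vale-ctl -a vale0:em0	# attach em0 -> this callback, attach != 0
 *	vale-ctl -h vale0:em0	# attach em0 including its host rings
 *	vale-ctl -d vale0:em0	# detach em0 -> this callback, attach == 0
 */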
2262
2263/* attach a bridge wrapper to the 'real' device */
2264int
2265netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
2266{
2267	struct netmap_bwrap_adapter *bna;
2268	struct netmap_adapter *na = NULL;
2269	struct netmap_adapter *hostna = NULL;
2270	int error = 0;
2271	enum txrx t;
2272
2273	/* make sure the NIC is not already in use */
2274	if (NETMAP_OWNED_BY_ANY(hwna)) {
2275		D("NIC %s busy, cannot attach to bridge", hwna->name);
2276		return EBUSY;
2277	}
2278
2279	bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
2280	if (bna == NULL) {
2281		return ENOMEM;
2282	}
2283
2284	na = &bna->up.up;
2285	na->na_private = bna;
2286	strncpy(na->name, nr_name, sizeof(na->name));
2287	/* fill the ring data for the bwrap adapter with rx/tx meanings
2288	 * swapped. The real cross-linking will be done during register,
2289	 * when all the krings will have been created.
2290	 */
2291	for_rx_tx(t) {
2292		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2293		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
2294		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
2295	}
2296	na->nm_dtor = netmap_bwrap_dtor;
2297	na->nm_register = netmap_bwrap_register;
2298	// na->nm_txsync = netmap_bwrap_txsync;
2299	// na->nm_rxsync = netmap_bwrap_rxsync;
2300	na->nm_config = netmap_bwrap_config;
2301	na->nm_krings_create = netmap_bwrap_krings_create;
2302	na->nm_krings_delete = netmap_bwrap_krings_delete;
2303	na->nm_notify = netmap_bwrap_notify;
2304	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
2305	na->pdev = hwna->pdev;
2306	na->nm_mem = netmap_mem_private_new(na->name,
2307			na->num_tx_rings, na->num_tx_desc,
2308			na->num_rx_rings, na->num_rx_desc,
2309			0, 0, &error);
2310	na->na_flags |= NAF_MEM_OWNER;
2311	if (na->nm_mem == NULL)
2312		goto err_put;
2313	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
2314
2315	bna->hwna = hwna;
2316	netmap_adapter_get(hwna);
2317	hwna->na_private = bna; /* weak reference */
2318	hwna->na_vp = &bna->up;
2319
2320	if (hwna->na_flags & NAF_HOST_RINGS) {
2321		if (hwna->na_flags & NAF_SW_ONLY)
2322			na->na_flags |= NAF_SW_ONLY;
2323		na->na_flags |= NAF_HOST_RINGS;
2324		hostna = &bna->host.up;
2325		snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
2326		hostna->ifp = hwna->ifp;
2327		for_rx_tx(t) {
2328			enum txrx r = nm_txrx_swap(t);
2329			nma_set_nrings(hostna, t, 1);
2330			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
2331		}
2332		// hostna->nm_txsync = netmap_bwrap_host_txsync;
2333		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2334		hostna->nm_notify = netmap_bwrap_notify;
2335		hostna->nm_mem = na->nm_mem;
2336		hostna->na_private = bna;
2337		hostna->na_vp = &bna->up;
2338		na->na_hostvp = hwna->na_hostvp =
2339			hostna->na_hostvp = &bna->host;
2340		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
2341	}
2342
2343	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2344		na->name, hwna->name,
2345		na->num_tx_rings, na->num_tx_desc,
2346		na->num_rx_rings, na->num_rx_desc);
2347
2348	error = netmap_attach_common(na);
2349	if (error) {
2350		goto err_free;
2351	}
2352	/* make bwrap ifp point to the real ifp
2353	 * NOTE: netmap_attach_common() interprets a non-NULL na->ifp
2354	 * as a request to make the ifp point to the na. Since we
2355	 * do not want to change the na already pointed to by hwna->ifp,
2356	 * the following assignment has to be delayed until now
2357	 */
2358	na->ifp = hwna->ifp;
2359	hwna->na_flags |= NAF_BUSY;
2360	/* make hwna point to the allocator we are actually using,
2361	 * so that monitors will be able to find it
2362	 */
2363	bna->save_nmd = hwna->nm_mem;
2364	hwna->nm_mem = na->nm_mem;
2365	return 0;
2366
2367err_free:
2368	netmap_mem_delete(na->nm_mem);
2369err_put:
2370	hwna->na_vp = hwna->na_hostvp = NULL;
2371	netmap_adapter_put(hwna);
2372	free(bna, M_DEVBUF);
2373	return error;
2374
2375}
2376
2377struct nm_bridge *
2378netmap_init_bridges2(u_int n)
2379{
2380	int i;
2381	struct nm_bridge *b;
2382
2383	b = malloc(sizeof(struct nm_bridge) * n, M_DEVBUF,
2384		M_NOWAIT | M_ZERO);
2385	if (b == NULL)
2386		return NULL;
2387	for (i = 0; i < n; i++)
2388		BDG_RWINIT(&b[i]);
2389	return b;
2390}
2391
2392void
2393netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
2394{
2395	int i;
2396
2397	if (b == NULL)
2398		return;
2399
2400	for (i = 0; i < n; i++)
2401		BDG_RWDESTROY(&b[i]);
2402	free(b, M_DEVBUF);
2403}
2404
2405int
2406netmap_init_bridges(void)
2407{
2408#ifdef CONFIG_NET_NS
2409	return netmap_bns_register();
2410#else
2411	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
2412	if (nm_bridges == NULL)
2413		return ENOMEM;
2414	return 0;
2415#endif
2416}
2417
2418void
2419netmap_uninit_bridges(void)
2420{
2421#ifdef CONFIG_NET_NS
2422	netmap_bns_unregister();
2423#else
2424	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
2425#endif
2426}
2427#endif /* WITH_VALE */
2428