/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
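/*
 * A condensed sketch of the reserve/copy/update protocol described
 * above, as implemented by nm_bdg_flush() further down in this file.
 * Illustrative only; variables and error handling are simplified.
 */
#if 0
	mtx_lock(&kring->q_lock);
	my_start = kring->nkr_hwlease;		/* first reserved slot */
	howmany = nm_kr_space(kring, 1);	/* how many slots are free */
	lease_idx = nm_kr_lease(kring, howmany, 1); /* reserve them */
	mtx_unlock(&kring->q_lock);

	/* copy packets into the reserved slots, possibly sleeping on
	 * a page fault (hence no spinlock is held here)
	 */

	mtx_lock(&kring->q_lock);
	kring->nkr_leases[lease_idx] = j; /* j = next unused slot, report done */
	/* if all earlier leases are complete, advance nr_hwtail */
	mtx_unlock(&kring->q_lock);
#endif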

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct rwlock

#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_NAME	prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	max number of ports per switch
 * NM_BRIDGES	max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use a per-queue lock instead of the core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
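/*
 * Naming examples (illustrative): "vale0:1" and "vale0:2" are two
 * ephemeral ports on switch "vale0", while "vale0:em1" would instead
 * try to attach the physical interface em1 to the same switch,
 * because the part after ':' matches an existing interface name.
 */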
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
#define	NM_BRIDGES		8	/* number of bridges */


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 */
int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");
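/*
 * On FreeBSD the knob above appears under the dev.netmap tree, so a
 * plausible way to shrink the batch at runtime is:
 *
 *	sysctl dev.netmap.bridge_batch=256
 *
 * (values above NM_BDG_BATCH are clamped back in netmap_vp_txsync()).
 */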


static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **);
static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_register(struct netmap_adapter *, int onoff);

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};
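/*
 * The list is threaded through the ft_next field of the forwarding
 * table entries, so a sketch of a traversal (as done in the second
 * pass of nm_bdg_flush()) looks like:
 *
 *	for (i = d->bq_head; i != NM_FT_NULL; i = ft[i].ft_next)
 *		... process ft[i] ...
 */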

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size, an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT
	 * to drop it.  ring_nr is the source ring index, and the
	 * function may overwrite this value to forward this packet to a
	 * different ring index.
	 * This function must be set by netmap_bdg_ctl().
	 */
	struct netmap_bdg_ops bdg_ops;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];
};

const char*
netmap_bdg_name(struct netmap_vp_adapter *vp)
{
	struct nm_bridge *b = vp->na_bdg;
	if (b == NULL)
		return NULL;
	return b->bdg_basename;
}


/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge nm_bridges[NM_BRIDGES];


/*
 * this is a slightly optimized copy routine which rounds
 * to a multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		memcpy(dst, src, l);
		return;
	}
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}
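/*
 * Example: pkt_copy(src, dst, 60) executes a single iteration of the
 * loop above and moves a full 64 bytes (8 * sizeof(uint64_t)), i.e. it
 * may read and write up to 63 bytes past the nominal length. This is
 * why both buffers must be sized to a multiple of 64 bytes, as netmap
 * buffers normally are.
 */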


/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name; otherwise only the
 * NM_NAME prefix is used as the bridge name.
 * We assume that this is called with a name of at least NM_NAME chars.
 */
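/*
 * Example: with NM_NAME == "vale", the name "vale1:em1" yields the
 * bridge name "vale1" (the ':' at index 5 terminates it), while a
 * plain "vale1" with no ':' falls back to the bare "vale" prefix.
 */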
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, l, namelen;
	struct nm_bridge *b = NULL;

	NMG_LOCK_ASSERT();

	namelen = strlen(NM_NAME);	/* base length */
	l = name ? strlen(name) : 0;		/* actual length */
	if (l < namelen) {
		D("invalid bridge name %s", name ? name : "(null)");
		return NULL;
	}
	for (i = namelen + 1; i < l; i++) {
		if (name[i] == ':') {
			namelen = i;
			break;
		}
	}
	if (namelen >= IFNAMSIZ)
		namelen = IFNAMSIZ;
	ND("--- prefix is '%.*s' ---", namelen, name);

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < NM_BRIDGES; i++) {
		struct nm_bridge *x = nm_bridges + i;

		if (x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
		/* initialize the bridge */
		strncpy(b->bdg_basename, name, namelen);
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->bdg_ops.lookup = netmap_bdg_learning;
		/* reset the MAC address table */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
	}
	return b;
}


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i].nkr_ft) {
			free(kring[i].nkr_ft, M_DEVBUF);
			kring[i].nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = netmap_real_tx_rings(na);
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i].nkr_ft = ft;
	}
	return 0;
}


/* remove from bridge b the ports in slots hw and sw
 * (sw can be -1 if not needed)
 */
static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	 * New algorithm:
	 * make a copy of bdg_port_index;
	 * lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	 * in the array of bdg_port_index, replacing them with
	 * entries from the bottom of the array;
	 * decrement bdg_active_ports;
	 * acquire BDG_WLOCK() and copy back the array.
	 */
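
	/*
	 * Worked example (illustrative): with bdg_port_index = {3,1,4,2}
	 * and bdg_active_ports = 4, detaching port 4 swaps it with the
	 * last active entry, giving {3,1,2,4} and bdg_active_ports = 3;
	 * entry 4 now sits in the inactive tail of the array.
	 */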

	if (netmap_verbose)
		D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--; /* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	if (b->bdg_ops.dtor)
		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		bzero(&b->bdg_ops, sizeof(b->bdg_ops));
	}
}

/* nm_bdg_ctl callback for VALE ports */
static int
netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
	struct nm_bridge *b = vpna->na_bdg;

	if (attach)
		return 0; /* nothing to do */
	if (b) {
		netmap_set_all_rings(na, 0 /* disable */);
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
		vpna->na_bdg = NULL;
		netmap_set_all_rings(na, 1 /* enable */);
	}
	/* we took a reference just for the attach */
	netmap_adapter_put(na);
	return 0;
}

/* nm_dtor callback for ephemeral VALE ports */
static void
netmap_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;

	ND("%s has %d references", na->name, na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}
}

/* nm_dtor callback for persistent VALE ports */
static void
netmap_persist_vp_dtor(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;

	netmap_vp_dtor(na);
	na->ifp = NULL;
	nm_vi_detach(ifp);
}

/* remove a persistent VALE port from the system */
static int
nm_vi_destroy(const char *name)
{
	struct ifnet *ifp;
	int error;

	ifp = ifunit_ref(name);
	if (!ifp)
		return ENXIO;
	NMG_LOCK();
	/* make sure this is actually a VALE port */
	if (!NETMAP_CAPABLE(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
		error = EINVAL;
		goto err;
	}

	if (NA(ifp)->na_refcount > 1) {
		error = EBUSY;
		goto err;
	}
	NMG_UNLOCK();

	D("destroying a persistent vale interface %s", ifp->if_xname);
	/* Linux requires that all references are released
	 * before unregistering
	 */
	if_rele(ifp);
	netmap_detach(ifp);
	return 0;

err:
	NMG_UNLOCK();
	if_rele(ifp);
	return error;
}

/*
 * Create a virtual interface registered to the system.
 * The interface will be attached to a bridge later.
 */
static int
nm_vi_create(struct nmreq *nmr)
{
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	int error;

	/* the name must not include the VALE prefix */
	if (!strncmp(nmr->nr_name, NM_NAME, strlen(NM_NAME)))
		return EINVAL;
	ifp = ifunit_ref(nmr->nr_name);
	if (ifp) { /* already exists, cannot create a new one */
		if_rele(ifp);
		return EEXIST;
	}
	error = nm_vi_persist(nmr->nr_name, &ifp);
	if (error)
		return error;

	NMG_LOCK();
	/* netmap_vp_create creates a struct netmap_vp_adapter */
	error = netmap_vp_create(nmr, ifp, &vpna);
	if (error) {
		D("error %d", error);
		NMG_UNLOCK();
		nm_vi_detach(ifp);
		return error;
	}
	/* persist-specific routines */
	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
	vpna->up.nm_dtor = netmap_persist_vp_dtor;
	netmap_adapter_get(&vpna->up);
	NMG_UNLOCK();
	D("created %s", ifp->if_xname);
	return 0;
}

/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a VALE prefix
 * mismatch. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
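/*
 * A sketch of the intended calling convention (illustrative only;
 * try_hw_na() is a made-up placeholder for the non-VALE path):
 */
#if 0
	error = netmap_get_bdg_na(nmr, &na, 1 /* create */);
	if (error)
		return error;		/* VALE name, but lookup/creation failed */
	if (na == NULL)
		return try_hw_na(nmr);	/* not a VALE name, fall back */
	/* ... use na ... */
	netmap_adapter_put(na);		/* drop the reference when done */
#endif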
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	char *nr_name = nmr->nr_name;
	const char *ifname;
	struct ifnet *ifp;
	int error = 0;
	struct netmap_vp_adapter *vpna, *hostna = NULL;
	struct nm_bridge *b;
	int i, j, cand = -1, cand2 = -1;
	int needed;

	*na = NULL;     /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(nr_name, NM_NAME, sizeof(NM_NAME) - 1)) {
		return 0;  /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(nr_name, create);
	if (b == NULL) {
		D("no bridges available for '%s'", nr_name);
		return (create ? ENOMEM : ENXIO);
	}
	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
		panic("x");

	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		// KASSERT(na != NULL);
		D("checking %s", vpna->up.name);
		if (!strcmp(vpna->up.name, nr_name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", nr_name,
				vpna->up.na_refcount);
			*na = &vpna->up;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return ENOMEM;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ifname = nr_name + b->bdg_namelen + 1; /* the name after the bridge's */
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifp = ifunit_ref(ifname);
	if (!ifp) {
		/* Create an ephemeral virtual port.
		 * This block contains all the ephemeral-specific logic.
		 */
		if (nmr->nr_cmd) {
			/* nr_cmd must be 0 for a virtual port */
			return EINVAL;
		}

		/* netmap_vp_create creates a struct netmap_vp_adapter */
		error = netmap_vp_create(nmr, NULL, &vpna);
		if (error) {
			D("error %d", error);
			return error;
		}
		/* shortcut - we can skip get_hw_na(),
		 * ownership check and nm_bdg_attach()
		 */
	} else {
		struct netmap_adapter *hw;

		error = netmap_get_hw_na(ifp, &hw);
		if (error || hw == NULL)
			goto out;

		/* host adapter might not be created */
		error = hw->nm_bdg_attach(nr_name, hw);
		if (error)
			goto out;
		vpna = hw->na_vp;
		hostna = hw->na_hostvp;
		if_rele(ifp);
		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
			hostna = NULL;
	}

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC  %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (hostna != NULL) {
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", ifname, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = &vpna->up;
	netmap_adapter_get(*na);
	return 0;

out:
	if_rele(ifp);

	return error;
}


/* Process NETMAP_BDG_ATTACH */
static int
nm_bdg_ctl_attach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();

	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
	if (error) /* no device */
		goto unlock_exit;

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (NETMAP_OWNED_BY_ANY(na)) {
		error = EBUSY;
		goto unref_exit;
	}

	if (na->nm_bdg_ctl) {
		/* nop for VALE ports. The bwrap needs to put the hwna
		 * in netmap mode (see netmap_bwrap_bdg_ctl)
		 */
		error = na->nm_bdg_ctl(na, nmr, 1);
		if (error)
			goto unref_exit;
		ND("registered %s to netmap-mode", na->name);
	}
	NMG_UNLOCK();
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}


/* process NETMAP_BDG_DETACH */
static int
nm_bdg_ctl_detach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (na->nm_bdg_ctl) {
		/* remove the port from bridge. The bwrap
		 * also needs to put the hwna in normal mode
		 */
		error = na->nm_bdg_ctl(na, nmr, 0);
	}

	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;

}


/* Called by either user's context (netmap_ioctl())
 * or external kernel modules (e.g., Openvswitch).
 * Operation is indicated in nmr->nr_cmd.
 * NETMAP_BDG_REGOPS, which sets the configure/lookup/dtor functions
 * of a bridge, requires the bdg_ops argument; the other commands
 * ignore this argument.
 *
 * Called without NMG_LOCK.
 */
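/*
 * From userspace these commands travel inside a struct nmreq; a
 * plausible invocation to attach an existing interface to a switch
 * (illustrative only, error handling omitted) is:
 *
 *	struct nmreq nmr = { .nr_version = NETMAP_API };
 *	strncpy(nmr.nr_name, "vale0:em1", sizeof(nmr.nr_name) - 1);
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *	ioctl(fd, NIOCREGIF, &nmr);	// fd is an open /dev/netmap
 */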
int
netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
{
	struct nm_bridge *b;
	struct netmap_adapter *na;
	struct netmap_vp_adapter *vpna;
	char *name = nmr->nr_name;
	int cmd = nmr->nr_cmd, namelen = strlen(name);
	int error = 0, i, j;

	switch (cmd) {
	case NETMAP_BDG_NEWIF:
		error = nm_vi_create(nmr);
		break;

	case NETMAP_BDG_DELIF:
		error = nm_vi_destroy(nmr->nr_name);
		break;

	case NETMAP_BDG_ATTACH:
		error = nm_bdg_ctl_attach(nmr);
		break;

	case NETMAP_BDG_DETACH:
		error = nm_bdg_ctl_detach(nmr);
		break;

	case NETMAP_BDG_LIST:
		/* this is used to enumerate bridges and ports */
		if (namelen) { /* look up indexes of bridge and port */
			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
				error = EINVAL;
				break;
			}
			NMG_LOCK();
			b = nm_find_bridge(name, 0 /* don't create */);
			if (!b) {
				error = ENOENT;
				NMG_UNLOCK();
				break;
			}

			name = name + b->bdg_namelen + 1;
			error = ENOENT;
			for (j = 0; j < b->bdg_active_ports; j++) {
				i = b->bdg_port_index[j];
				vpna = b->bdg_ports[i];
				if (vpna == NULL) {
					D("---AAAAAAAAARGH-------");
					continue;
				}
				/* the former and the latter identify a
				 * virtual port and a NIC, respectively
				 */
				if (!strcmp(vpna->up.name, name)) {
					/* bridge index */
					nmr->nr_arg1 = b - nm_bridges;
					nmr->nr_arg2 = i; /* port index */
					error = 0;
					break;
				}
			}
			NMG_UNLOCK();
		} else {
			/* return the first non-empty entry starting from
			 * bridge nr_arg1 and port nr_arg2.
			 *
			 * Users can detect the end of the same bridge by
			 * seeing the new and old value of nr_arg1, and can
			 * detect the end of all the bridges by error != 0
			 */
			i = nmr->nr_arg1;
			j = nmr->nr_arg2;

			NMG_LOCK();
			for (error = ENOENT; i < NM_BRIDGES; i++) {
				b = nm_bridges + i;
				if (j >= b->bdg_active_ports) {
					j = 0; /* following bridges scan from 0 */
					continue;
				}
				nmr->nr_arg1 = i;
				nmr->nr_arg2 = j;
				j = b->bdg_port_index[j];
				vpna = b->bdg_ports[j];
				strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
				error = 0;
				break;
			}
			NMG_UNLOCK();
		}
		break;

	case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
		/* register callbacks to the given bridge.
		 * nmr->nr_name may be just the bridge's name (including the
		 * trailing ':' if it is not just NM_NAME).
		 */
		if (!bdg_ops) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		b = nm_find_bridge(name, 0 /* don't create */);
		if (!b) {
			error = EINVAL;
		} else {
			b->bdg_ops = *bdg_ops;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_VNET_HDR:
		/* Valid lengths for the virtio-net header are 0 (no header),
		   10 and 12. */
		if (nmr->nr_arg1 != 0 &&
			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
				nmr->nr_arg1 != 12) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, 0);
		if (na && !error) {
			vpna = (struct netmap_vp_adapter *)na;
			vpna->virt_hdr_len = nmr->nr_arg1;
			if (vpna->virt_hdr_len)
				vpna->mfs = NETMAP_BUF_SIZE(na);
			D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
			netmap_adapter_put(na);
		}
		NMG_UNLOCK();
		break;

	default:
		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
		error = EINVAL;
		break;
	}
	return error;
}

int
netmap_bdg_config(struct nmreq *nmr)
{
	struct nm_bridge *b;
	int error = EINVAL;

	NMG_LOCK();
	b = nm_find_bridge(nmr->nr_name, 0);
	if (!b) {
		NMG_UNLOCK();
		return error;
	}
	NMG_UNLOCK();
	/* Don't call config() with NMG_LOCK() held */
	BDG_RLOCK(b);
	if (b->bdg_ops.config != NULL)
		error = b->bdg_ops.config((struct nm_ifreq *)nmr);
	BDG_RUNLOCK(b);
	return error;
}


/* nm_krings_create callback for VALE ports.
 * Calls the standard netmap_krings_create, then adds leases on rx
 * rings and bdgfwd on tx rings.
 */
static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int tailroom;
	int error, i;
	uint32_t *leases;
	u_int nrx = netmap_real_rx_rings(na);

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i].nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


/* nm_krings_delete callback for VALE ports. */
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * main dispatch routine for the bridge.
 * Grab packets from a kring, move them into the ft structure
 * associated to the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_kring *kring, u_int end)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter*)kring->na;
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int ring_nr = kring->ring_id;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return 0;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;

		ND("flags is 0x%x", slot->flags);
		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
		if (unlikely(buf == NULL)) {
			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
				kring->name, j, ft[ft_i].ft_len);
			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
			ft[ft_i].ft_len = 0;
			ft[ft_i].ft_flags = 0;
		}
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			RD(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
		// ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags - 1;
	}
	if (ft_i)
		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)                                                    \
do {                                                                    \
        a -= b; a -= c; a ^= (c >> 13);                                 \
        b -= c; b -= a; b ^= (a << 8);                                  \
        c -= a; c -= b; c ^= (b >> 13);                                 \
        a -= b; a -= c; a ^= (c >> 12);                                 \
        b -= c; b -= a; b ^= (a << 16);                                 \
        c -= a; c -= b; c ^= (b >> 5);                                  \
        a -= b; a -= c; a ^= (c >> 3);                                  \
        b -= c; b -= a; b ^= (a << 10);                                 \
        c -= a; c -= b; c ^= (b >> 15);                                 \
} while (/*CONSTCOND*/0)


static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
        uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

        b += addr[5] << 8;
        b += addr[4];
        a += addr[3] << 24;
        a += addr[2] << 16;
        a += addr[1] << 8;
        a += addr[0];

        mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
        return (c & BRIDGE_RTHASH_MASK);
}

#undef mix


/* nm_register callback for VALE ports */
static int
netmap_vp_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_vp_adapter *vpna =
		(struct netmap_vp_adapter*)na;

	/* persistent ports may be put in netmap mode
	 * before being attached to a bridge
	 */
	if (vpna->na_bdg)
		BDG_WLOCK(vpna->na_bdg);
	if (onoff) {
		na->na_flags |= NAF_NETMAP_ON;
		/* XXX on FreeBSD, persistent VALE ports should also
		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
		 */
	} else {
		na->na_flags &= ~NAF_NETMAP_ON;
	}
	if (vpna->na_bdg)
		BDG_WUNLOCK(vpna->na_bdg);
	return 0;
}


/*
 * Lookup function for a learning bridge.
 * Update the hash table with the source address, and then
 * return the destination port index, and the ring in *dst_ring
 * (at the moment, always use ring 0)
 */
u_int
netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
		const struct netmap_vp_adapter *na)
{
	uint8_t *buf = ft->ft_buf;
	u_int buf_len = ft->ft_len;
	struct nm_hash_ent *ht = na->na_bdg->ht;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;

	/* safety check, unfortunately we have many cases */
	if (buf_len >= 14 + na->virt_hdr_len) {
		/* virthdr + mac_hdr in the same slot */
		buf += na->virt_hdr_len;
		buf_len -= na->virt_hdr_len;
	} else if (buf_len == na->virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
		/* only header in first fragment */
		ft++;
		buf = ft->ft_buf;
		buf_len = ft->ft_len;
	} else {
		RD(5, "invalid buf format, length %d", buf_len);
		return NM_BDG_NOPORT;
	}
	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive, there might be some
	 * worthwhile optimizations here.
	 */
	if ((buf[6] & 1) == 0) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); // XXX hash of source
		/* update source port forwarding entry */
		ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); // XXX hash of dst
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
		/* XXX otherwise return NM_BDG_UNKNOWN ? */
	}
	*dst_ring = 0;
	return dst;
}


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}




/* make a lease on the kring for N positions. return the
 * lease index
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

	if (n > nm_kr_space(k, is_rx)) {
		D("invalid request for %d slots", n);
		panic("x");
	}
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->name,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
	return lease_idx;
}
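
/*
 * Wraparound example: with nkr_num_slots = 1024 (lim = 1023),
 * nkr_hwlease = 1020 and n = 10, the code above yields
 * 1030 - 1024 = 6, i.e. the lease wraps to the start of the ring.
 */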

/*
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
		u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, j, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		/* Drop the packet if the virtio-net header is not into the first
		   fragment nor at the very beginning of the second. */
		if (unlikely(na->virt_hdr_len > ft[i].ft_len))
			continue;
		dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port == NM_BDG_NOPORT)
			continue; /* this packet is to be dropped */
		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
			continue;
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations */
	for (i = 0; i < num_dsts; i++) {
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!nm_netmap_on(&dst_na->up))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a chance
		 * that we may not use all of the slots we have claimed, and
		 * we will need to handle the leftover ones when we regain
		 * the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
			RD(3, "virt_hdr_mismatch, src %d dst %d", na->virt_hdr_len, dst_na->virt_hdr_len);
			/* There is a virtio-net header/offloadings mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * dst_na->mfs
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				needed = (needed * na->mfs) /
						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}
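
		/* Solving the inequality above for x gives
		 *	x >= needed * na->mfs / (dst_na->mfs - H)
		 * which is what the integer division plus one computes
		 * (overestimating by one when the division is exact).
		 * For instance, with needed = 2 slots of na->mfs = 65535
		 * bytes, dst_na->mfs = 1514 and a hypothetical H of 114
		 * bytes, x becomes 2 * 65535 / 1400 + 1 = 94 slots.
		 */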

		ND(5, "pass 2 dst %d is %x", i, d_i);
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = &dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		lim = kring->nkr_num_slots - 1;

retry:

		if (dst_na->retry && retry) {
			/* try to get some free slot from the previous run */
			dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
			/* actually useful only for bwraps, since there
			 * the notify will trigger a txsync on the hwna. VALE ports
			 * have dst_na->retry == 0
			 */
		}
		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;

			/* find the queue from which we pick next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
				break; /* no more space */
			if (netmap_verbose && cnt > 1)
				RD(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			if (unlikely(virt_hdr_mismatch)) {
				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
			} else {
				howmany -= cnt;
				do {
					char *dst, *src = ft_p->ft_buf;
					size_t copy_len = ft_p->ft_len, dst_len = copy_len;

					slot = &ring->slot[j];
					dst = NMB(&dst_na->up, slot);

					ND("send [%d] %d(%d) bytes at %s:%d",
							i, (int)copy_len, (int)dst_len,
							NM_IFPNAME(dst_ifp), j);
					/* round to a multiple of 64 */
					copy_len = (copy_len + 63) & ~63;

					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
						     copy_len > NETMAP_BUF_SIZE(&na->up))) {
						RD(5, "invalid len %d, down to 64", (int)copy_len);
						copy_len = dst_len = 64; // XXX
					}
					if (ft_p->ft_flags & NS_INDIRECT) {
						if (copyin(src, dst, copy_len)) {
							// invalid user pointer, pretend len is 0
							dst_len = 0;
						}
					} else {
						//memcpy(dst, src, copy_len);
						pkt_copy(src, dst, (int)copy_len);
					}
					slot->len = dst_len;
					slot->flags = (cnt << 8) | NS_MOREFRAG;
					j = nm_next(j, lim);
					needed--;
					ft_p++;
				} while (ft_p != ft_end);
				slot->flags = (cnt << 8); /* clear flag on last entry */
			}
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
		    /* current position */
		    uint32_t *p = kring->nkr_leases; /* shorthand */
		    uint32_t update_pos;
		    int still_locked = 1;

		    mtx_lock(&kring->q_lock);
		    if (unlikely(howmany > 0)) {
			/* we have not used all the buffers. If we are the
			 * last lease holder we can recover the slots,
			 * otherwise we must fill them with len 0 to mark
			 * empty packets.
			 */
			ND("leftover %d bufs", howmany);
			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
			    /* yes, we are the last one */
			    ND("roll back nkr_hwlease to %d", j);
			    kring->nkr_hwlease = j;
			} else {
			    while (howmany-- > 0) {
				ring->slot[j].len = 0;
				ring->slot[j].flags = 0;
				j = nm_next(j, lim);
			    }
			}
		    }
		    p[lease_idx] = j; /* report I am done */

		    update_pos = kring->nr_hwtail;

		    if (my_start == update_pos) {
			/* all slots before my_start have been reported,
			 * so scan subsequent leases to see if other ranges
			 * have been completed, and do a selwakeup or txsync.
			 */
			while (lease_idx != kring->nkr_lease_idx &&
				p[lease_idx] != NR_NOSLOT) {
			    j = p[lease_idx];
			    p[lease_idx] = NR_NOSLOT;
			    lease_idx = nm_next(lease_idx, lim);
			}
			/* j is the new 'write' position. j != my_start
			 * means there are new buffers to report
			 */
			if (likely(j != my_start)) {
				kring->nr_hwtail = j;
				still_locked = 0;
				mtx_unlock(&kring->q_lock);
				dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
				/* this is netmap_notify for VALE ports and
				 * netmap_bwrap_notify for bwrap. The latter will
				 * trigger a txsync on the underlying hwna
				 */
				if (dst_na->retry && retry--) {
					/* XXX this is going to call nm_notify again.
					 * Only useful for bwrap in virtual machines
					 */
					goto retry;
				}
			}
		    }
		    if (still_locked)
			mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}

/* nm_txsync callback for VALE ports */
static int
netmap_vp_txsync(struct netmap_kring *kring, int flags)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter *)kring->na;
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const cur = kring->rcur;

	if (bridge_batch <= 0) { /* testing only */
		done = cur; // used all
		goto done;
	}
	if (!na->na_bdg) {
		done = cur;
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_bdg_preflush(kring, cur);
done:
	if (done != cur)
		D("early break at %d/%d, tail %d", done, cur, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'cur' are left unsent.
	 */
	kring->nr_hwcur = done;
	kring->nr_hwtail = nm_prev(done, lim);
	nm_txsync_finalize(kring);
	if (netmap_verbose)
		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
	return 0;
}


/* rxsync code used by VALE ports nm_rxsync callback and also
 * internally by the bwrap
 */
static int
netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, lim = kring->nkr_num_slots - 1;
	u_int head = nm_rxsync_prologue(kring);
	int n;

	if (head > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* First part, import newly received packets. */
	/* actually nothing to do here, they are already in the kring */

	/* Second part, skip past packets that userspace has released. */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		/* consistency check, but nothing really important here */
		for (n = 0; likely(nm_i != head); n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			void *addr = NMB(na, slot);

			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwcur = head;
	}

	/* tell userspace that there are new packets */
	nm_rxsync_finalize(kring);
	n = 0;
done:
	return n;
}

/*
 * nm_rxsync callback for VALE ports
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
netmap_vp_rxsync(struct netmap_kring *kring, int flags)
{
	int n;

	mtx_lock(&kring->q_lock);
	n = netmap_vp_rxsync_locked(kring, flags);
	mtx_unlock(&kring->q_lock);
	return n;
}


/* nm_bdg_attach callback for VALE ports
 * The na_vp port is this same netmap_adapter. There is no host port.
 */
static int
netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;

	if (vpna->na_bdg)
		return EBUSY;
	na->na_vp = vpna;
	strncpy(na->name, name, sizeof(na->name));
	na->na_hostvp = NULL;
	return 0;
}

/* create a netmap_vp_adapter that describes a VALE port.
 * Only persistent VALE ports have a non-null ifp.
 */
static int
netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter **ret)
{
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	int error;
	u_int npipes = 0;

	vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (vpna == NULL)
		return ENOMEM;

	na = &vpna->up;

	na->ifp = ifp;
	strncpy(na->name, nmr->nr_name, sizeof(na->name));

	/* bounds checking */
	na->num_tx_rings = nmr->nr_tx_rings;
	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_tx_rings = na->num_tx_rings; // write back
	na->num_rx_rings = nmr->nr_rx_rings;
	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_rx_rings = na->num_rx_rings; // write back
	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	na->num_tx_desc = nmr->nr_tx_slots;
	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	/* validate the number of pipes. We need at least 1,
	 * and probably can do with some more, so default to 2
	 * when 0 is supplied.
	 */
	npipes = nmr->nr_arg1;
	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
	nmr->nr_arg1 = npipes;	/* write back */
	/* validate the number of extra buffers */
	nm_bound_var(&nmr->nr_arg3, 0, 0,
			128*NM_BDG_MAXSLOTS, NULL);
	na->num_rx_desc = nmr->nr_rx_slots;
	vpna->virt_hdr_len = 0;
	vpna->mfs = 1514;
	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
		vpna->mfs = netmap_buf_size; */
	if (netmap_verbose)
		D("max frame size %u", vpna->mfs);

	na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
	na->nm_txsync = netmap_vp_txsync;
	na->nm_rxsync = netmap_vp_rxsync;
	na->nm_register = netmap_vp_reg;
	na->nm_krings_create = netmap_vp_krings_create;
	na->nm_krings_delete = netmap_vp_krings_delete;
	na->nm_dtor = netmap_vp_dtor;
	na->nm_mem = netmap_mem_private_new(na->name,
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc,
			nmr->nr_arg3, npipes, &error);
	if (na->nm_mem == NULL)
		goto err;
	na->nm_bdg_attach = netmap_vp_bdg_attach;
	/* other nmd fields are set in the common routine */
	error = netmap_attach_common(na);
	if (error)
		goto err;
	*ret = vpna;
	return 0;

err:
	if (na->nm_mem != NULL)
		netmap_mem_private_delete(na->nm_mem);
	free(vpna, M_DEVBUF);
	return error;
}
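
/*
 * A note on nm_bound_var() (a sketch of its behaviour; the exact
 * policy is in the common netmap code): it forces a variable into a
 * [min, max] interval, bumping too-small values to the given default
 * and clamping too-large ones. With the calls above, a request for
 * 0 slots therefore becomes the default ring size:
 *
 *	u_int slots = 0;	// no value supplied by the user
 *	nm_bound_var(&slots, NM_BRIDGE_RINGSIZE, 1, NM_BDG_MAXSLOTS, NULL);
 *	// slots == NM_BRIDGE_RINGSIZE
 */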

/* Bridge wrapper code (bwrap).
 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
 * VALE switch.
 * The main task is to swap the meaning of tx and rx rings to match the
 * expectations of the VALE switch code (see nm_bdg_flush).
 *
 * The bwrap works by interposing a netmap_bwrap_adapter between the
 * rest of the system and the hwna. The netmap_bwrap_adapter looks like
 * a netmap_vp_adapter to the rest of the system, but, internally, it
 * translates all callbacks to what the hwna expects.
 *
 * Note that we have to intercept callbacks coming from two sides:
 *
 *  - callbacks coming from the netmap module are intercepted by
 *    passing around the netmap_bwrap_adapter instead of the hwna
 *
 *  - callbacks coming from outside of the netmap module only know
 *    about the hwna. This, however, only happens in interrupt
 *    handlers, where only the hwna->nm_notify callback is called.
 *    What the bwrap does is to overwrite the hwna->nm_notify callback
 *    with its own netmap_bwrap_intr_notify.
 *    XXX This assumes that the hwna->nm_notify callback was the
 *    standard netmap_notify(), as is the case for NIC adapters.
 *    Any additional action performed by hwna->nm_notify will not be
 *    performed by netmap_bwrap_intr_notify.
 *
 * Additionally, the bwrap can optionally attach the host rings pair
 * of the wrapped adapter to a different port of the switch.
 */
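
/*
 * From userspace, all of this is typically driven by the vale-ctl
 * tool (flags recalled from its usage text; see also the vale-ctl
 * -[ahd] reference in the nm_bdg_ctl comment below):
 *
 *	vale-ctl -a vale0:em0	# attach NIC em0 to switch vale0
 *	vale-ctl -h vale0:em0	# attach em0 together with its host rings
 *	vale-ctl -d vale0:em0	# detach em0 from vale0
 */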


static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	struct netmap_adapter *hwna = bna->hwna;

	ND("na %p", na);
	/* drop reference to hwna->ifp.
	 * If we don't do this, netmap_detach_common(na)
	 * will think it has set NA(na->ifp) to NULL
	 */
	na->ifp = NULL;
	/* for safety, also drop the possible reference
	 * in the hostna
	 */
	bna->host.up.ifp = NULL;

	hwna->nm_mem = bna->save_nmd;
	hwna->na_private = NULL;
	hwna->na_vp = hwna->na_hostvp = NULL;
	hwna->na_flags &= ~NAF_BUSY;
	netmap_adapter_put(hwna);
}


/*
 * Intr callback for NICs connected to a bridge.
 * Simply ignore tx interrupts (maybe we could try to recover space?)
 * and pass received packets from the NIC to the bridge.
 *
 * XXX TODO check locking: this is called from the interrupt
 * handler so we should make sure that the interface is not
 * disconnected while passing down an interrupt.
 *
 * Note, no user process can access this NIC or the host stack.
 * The only significant part of the ring is the slots;
 * head/cur/tail are set from the kring as needed
 * (part as a receive ring, part as a transmit ring).
 *
 * This callback overwrites the hwna notify callback.
 * Packets come from the outside or from the host stack and are
 * put on an hwna rx ring; the bridge wrapper then sends the
 * packets through the bridge.
 */
static int
netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
{
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_vp_adapter *hostna = &bna->host;
	struct netmap_kring *kring, *bkring;
	struct netmap_ring *ring;
	int is_host_ring = ring_nr == na->num_rx_rings;
	struct netmap_vp_adapter *vpna = &bna->up;
	int error = 0;

	if (netmap_verbose)
	    D("%s %s%d 0x%x", na->name,
		(tx == NR_TX ? "TX" : "RX"), ring_nr, flags);

	if (flags & NAF_DISABLE_NOTIFY) {
		/* the enabled/disabled state of the ring has changed,
		 * propagate the info to the wrapper (with tx/rx swapped)
		 */
		if (tx == NR_TX) {
			netmap_set_rxring(&vpna->up, ring_nr,
					na->tx_rings[ring_nr].nkr_stopped);
		} else {
			netmap_set_txring(&vpna->up, ring_nr,
					na->rx_rings[ring_nr].nkr_stopped);
		}
		return 0;
	}

	if (!nm_netmap_on(na))
		return 0;

	/* we only care about receive interrupts */
	if (tx == NR_TX)
		return 0;

	kring = &na->rx_rings[ring_nr];
	ring = kring->ring;

	/* make sure the ring is not disabled */
	if (nm_kr_tryget(kring))
		return 0;

	if (is_host_ring && hostna->na_bdg == NULL) {
		error = bna->save_notify(na, ring_nr, tx, flags);
		goto put_out;
	}

	/* Here we expect ring->head = ring->cur = ring->tail
	 * because everything has been released from the previous round.
	 * However the ring is shared and we might have info from
	 * the wrong side (the tx ring). Hence we overwrite with
	 * the info from the rx kring.
	 */
	if (netmap_verbose)
	    D("%s head %d cur %d tail %d (kring %d %d %d)", na->name,
		ring->head, ring->cur, ring->tail,
		kring->rhead, kring->rcur, kring->rtail);

	ring->head = kring->rhead;
	ring->cur = kring->rcur;
	ring->tail = kring->rtail;

	if (is_host_ring) {
		vpna = hostna;
		ring_nr = 0;
	}
	/* simulate a user wakeup on the rx ring */
	/* fetch packets that have arrived.
	 * XXX maybe do this in a loop ?
	 */
	error = kring->nm_sync(kring, 0);
	if (error)
		goto put_out;
	if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
		D("how strange, interrupt with no packets on %s",
			na->name);
		goto put_out;
	}

	/* new packets are ring->cur to ring->tail, and the bkring
	 * had hwcur == ring->cur. So advance ring->cur to ring->tail
	 * to push all packets out.
	 */
	ring->head = ring->cur = ring->tail;

	/* also set tail to what the bwrap expects */
	bkring = &vpna->up.tx_rings[ring_nr];
	ring->tail = bkring->nr_hwtail; // rtail too ?

	/* pass packets to the switch */
	nm_txsync_prologue(bkring); // XXX error checking ?
	netmap_vp_txsync(bkring, flags);

	/* mark all buffers as released on this ring */
	ring->head = ring->cur = kring->nr_hwtail;
	ring->tail = kring->rtail;
	/* another call to actually release the buffers */
	if (!is_host_ring) {
		error = kring->nm_sync(kring, 0);
	} else {
		/* mark all packets as released, as in the
		 * second part of netmap_rxsync_from_host()
		 */
		kring->nr_hwcur = kring->nr_hwtail;
		nm_rxsync_finalize(kring);
	}

put_out:
	nm_kr_put(kring);
	return error;
}
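
/*
 * To summarize the dance above for a NIC ring (host ring aside):
 *
 *	1. a first kring->nm_sync() picks up the newly received slots;
 *	2. head and cur are pushed to tail, offering the whole batch
 *	   to the switch;
 *	3. netmap_vp_txsync() on the matching bwrap tx kring forwards
 *	   the batch through the bridge;
 *	4. a second kring->nm_sync() returns the forwarded buffers
 *	   to the NIC.
 */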


/* nm_register callback for bwrap */
static int
netmap_bwrap_register(struct netmap_adapter *na, int onoff)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_vp_adapter *hostna = &bna->host;
	int error;

	ND("%s %s", na->name, onoff ? "on" : "off");

	if (onoff) {
		int i;

		/* netmap_do_regif has been called on the bwrap na.
		 * We need to pass the information about the
		 * memory allocator down to the hwna before
		 * putting it in netmap mode
		 */
		hwna->na_lut = na->na_lut;
		hwna->na_lut_objtotal = na->na_lut_objtotal;
		hwna->na_lut_objsize = na->na_lut_objsize;

		if (hostna->na_bdg) {
			/* if the host rings have been attached to the switch,
			 * we need to copy the memory allocator information
			 * in the hostna as well
			 */
			hostna->up.na_lut = na->na_lut;
			hostna->up.na_lut_objtotal = na->na_lut_objtotal;
			hostna->up.na_lut_objsize = na->na_lut_objsize;
		}

		/* cross-link the netmap rings.
		 * The original number of rings comes from hwna;
		 * rx rings on one side equal tx rings on the other.
		 * We need to do this now, after the initialization
		 * of the kring->ring pointers
		 */
		for (i = 0; i < na->num_rx_rings + 1; i++) {
			hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
			hwna->tx_rings[i].ring = na->rx_rings[i].ring;
		}
		for (i = 0; i < na->num_tx_rings + 1; i++) {
			hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
			hwna->rx_rings[i].ring = na->tx_rings[i].ring;
		}
	}

	/* forward the request to the hwna */
	error = hwna->nm_register(hwna, onoff);
	if (error)
		return error;

	/* impersonate a netmap_vp_adapter */
	netmap_vp_reg(na, onoff);
	if (hostna->na_bdg)
		netmap_vp_reg(&hostna->up, onoff);

	if (onoff) {
		/* intercept the hwna nm_notify callback */
		bna->save_notify = hwna->nm_notify;
		hwna->nm_notify = netmap_bwrap_intr_notify;
	} else {
		hwna->nm_notify = bna->save_notify;
		hwna->na_lut = NULL;
		hwna->na_lut_objtotal = 0;
		hwna->na_lut_objsize = 0;
	}

	return 0;
}

/* nm_config callback for bwrap */
static int
netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
				    u_int *rxr, u_int *rxd)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	/* forward the request */
	netmap_update_config(hwna);
	/* swap the results */
	*txr = hwna->num_rx_rings;
	*txd = hwna->num_rx_desc;
	*rxr = hwna->num_tx_rings;
	*rxd = hwna->num_tx_desc;

	return 0;
}
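
/*
 * For instance (numbers made up for illustration), a hwna with
 * 4 tx rings of 1024 slots and 2 rx rings of 512 slots is reported
 * to the rest of the system as having 2 tx rings of 512 slots and
 * 4 rx rings of 1024 slots.
 */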


/* nm_krings_create callback for bwrap */
static int
netmap_bwrap_krings_create(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_adapter *hostna = &bna->host.up;
	int error;

	ND("%s", na->name);

	/* impersonate a netmap_vp_adapter */
	error = netmap_vp_krings_create(na);
	if (error)
		return error;

	/* also create the hwna krings */
	error = hwna->nm_krings_create(hwna);
	if (error) {
		netmap_vp_krings_delete(na);
		return error;
	}
	/* the connection between the bwrap krings and the hwna krings
	 * will be performed later, in the nm_register callback, since
	 * now the kring->ring pointers have not been initialized yet
	 */

	if (na->na_flags & NAF_HOST_RINGS) {
		/* the hostna rings are the host rings of the bwrap.
		 * The corresponding krings must point back to the
		 * hostna
		 */
		hostna->tx_rings = na->tx_rings + na->num_tx_rings;
		hostna->tx_rings[0].na = hostna;
		hostna->rx_rings = na->rx_rings + na->num_rx_rings;
		hostna->rx_rings[0].na = hostna;
	}

	return 0;
}
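
/*
 * Resulting kring layout (assuming the standard netmap convention
 * that the host kring sits right after the hardware krings):
 *
 *	na->tx_rings[0 .. num_tx_rings-1]	hardware tx krings
 *	na->tx_rings[num_tx_rings]		host tx kring == hostna->tx_rings[0]
 *
 * and symmetrically for the rx side.
 */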


static void
netmap_bwrap_krings_delete(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	ND("%s", na->name);

	hwna->nm_krings_delete(hwna);
	netmap_vp_krings_delete(na);
}


/* notify method for the bridge-->hwna direction */
static int
netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_kring *kring, *hw_kring;
	struct netmap_ring *ring;
	u_int lim;
	int error = 0;

	if (tx == NR_TX)
		return EINVAL;

	kring = &na->rx_rings[ring_n];
	hw_kring = &hwna->tx_rings[ring_n];
	ring = kring->ring;
	lim = kring->nkr_num_slots - 1;

	if (!nm_netmap_on(hwna))
		return 0;
	mtx_lock(&kring->q_lock);
	/* first step: simulate a user wakeup on the rx ring */
	netmap_vp_rxsync_locked(kring, flags);
	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		na->name, ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		ring->head, ring->cur, ring->tail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
	/* second step: the simulated user consumes all new packets */
	ring->head = ring->cur = ring->tail;

	/* third step: the new packets are sent on the tx ring
	 * (which is actually the same ring)
	 */
	/* set tail to what the hw expects */
	ring->tail = hw_kring->rtail;
	nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
	error = hw_kring->nm_sync(hw_kring, flags);

	/* fourth step: now we are back on the rx ring */
	/* claim ownership on all hw owned bufs */
	ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
	ring->tail = kring->rtail; /* restore saved value of tail, for safety */

	/* fifth step: the user goes to sleep again, causing another rxsync */
	netmap_vp_rxsync_locked(kring, flags);
	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		na->name, ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		ring->head, ring->cur, ring->tail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
	mtx_unlock(&kring->q_lock);
	return error;
}


/* notify method for the bridge-->host-rings path */
static int
netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
{
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_adapter *port_na = &bna->up.up;

	if (tx == NR_TX || ring_n != 0)
		return EINVAL;
	return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
}


/* nm_bdg_ctl callback for the bwrap.
 * Called on bridge attach and detach, as an effect of vale-ctl -[ahd].
 * On attach, it needs to provide a fake netmap_priv_d structure and
 * perform a netmap_do_regif() on the bwrap. This will put both the
 * bwrap and the hwna in netmap mode, with the netmap rings shared
 * and cross-linked. Moreover, it will start intercepting interrupts
 * directed to hwna.
 */
static int
netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_priv_d *npriv;
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	struct netmap_if *nifp;
	int error = 0;

	if (attach) {
		if (NETMAP_OWNED_BY_ANY(na)) {
			return EBUSY;
		}
		if (bna->na_kpriv) {
			/* nothing to do */
			return 0;
		}
		npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
		if (npriv == NULL)
			return ENOMEM;
		nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error);
		if (!nifp) {
			bzero(npriv, sizeof(*npriv));
			free(npriv, M_DEVBUF);
			return error;
		}
		bna->na_kpriv = npriv;
		na->na_flags |= NAF_BUSY;
	} else {
		int last_instance;

		if (na->active_fds == 0) /* not registered */
			return EINVAL;
		last_instance = netmap_dtor_locked(bna->na_kpriv);
		if (!last_instance) {
			D("--- error, trying to detach an entry with active mmaps");
			error = EINVAL;
		} else {
			struct nm_bridge *b = bna->up.na_bdg,
				*bh = bna->host.na_bdg;
			npriv = bna->na_kpriv;
			bna->na_kpriv = NULL;
			D("deleting priv");

			bzero(npriv, sizeof(*npriv));
			free(npriv, M_DEVBUF);
			if (b) {
				/* XXX the bwrap dtor should take care
				 * of this (2014-06-16)
				 */
				netmap_bdg_detach_common(b, bna->up.bdg_port,
				    (bh ? bna->host.bdg_port : -1));
			}
			na->na_flags &= ~NAF_BUSY;
		}
	}
	return error;
}
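
/*
 * For reference, a minimal userspace sketch of the attach request that
 * lands in netmap_bwrap_bdg_ctl() above. It mirrors what vale-ctl does;
 * error handling is trimmed and the nmreq layout is the one in
 * net/netmap.h:
 *
 *	struct nmreq nmr;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name));
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *	nmr.nr_arg1 = NETMAP_BDG_HOST;	// also attach the host rings
 *	if (ioctl(fd, NIOCREGIF, &nmr) < 0)
 *		perror("attach");
 */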

/* attach a bridge wrapper to the 'real' device */
int
netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
{
	struct netmap_bwrap_adapter *bna;
	struct netmap_adapter *na = NULL;
	struct netmap_adapter *hostna = NULL;
	int error = 0;

	/* make sure the NIC is not already in use */
	if (NETMAP_OWNED_BY_ANY(hwna)) {
		D("NIC %s busy, cannot attach to bridge", hwna->name);
		return EBUSY;
	}

	bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (bna == NULL) {
		return ENOMEM;
	}

	na = &bna->up.up;
	strncpy(na->name, nr_name, sizeof(na->name));
	/* fill the ring data for the bwrap adapter with rx/tx meanings
	 * swapped. The real cross-linking will be done during register,
	 * when all the krings will have been created.
	 */
	na->num_rx_rings = hwna->num_tx_rings;
	na->num_tx_rings = hwna->num_rx_rings;
	na->num_tx_desc = hwna->num_rx_desc;
	na->num_rx_desc = hwna->num_tx_desc;
	na->nm_dtor = netmap_bwrap_dtor;
	na->nm_register = netmap_bwrap_register;
	// na->nm_txsync = netmap_bwrap_txsync;
	// na->nm_rxsync = netmap_bwrap_rxsync;
	na->nm_config = netmap_bwrap_config;
	na->nm_krings_create = netmap_bwrap_krings_create;
	na->nm_krings_delete = netmap_bwrap_krings_delete;
	na->nm_notify = netmap_bwrap_notify;
	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
	na->pdev = hwna->pdev;
	na->nm_mem = netmap_mem_private_new(na->name,
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc,
			0, 0, &error);
	na->na_flags |= NAF_MEM_OWNER;
	if (na->nm_mem == NULL)
		goto err_put;
	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */

	bna->hwna = hwna;
	netmap_adapter_get(hwna);
	hwna->na_private = bna; /* weak reference */
	hwna->na_vp = &bna->up;

	if (hwna->na_flags & NAF_HOST_RINGS) {
		if (hwna->na_flags & NAF_SW_ONLY)
			na->na_flags |= NAF_SW_ONLY;
		na->na_flags |= NAF_HOST_RINGS;
		hostna = &bna->host.up;
		snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
		hostna->ifp = hwna->ifp;
		hostna->num_tx_rings = 1;
		hostna->num_tx_desc = hwna->num_rx_desc;
		hostna->num_rx_rings = 1;
		hostna->num_rx_desc = hwna->num_tx_desc;
		// hostna->nm_txsync = netmap_bwrap_host_txsync;
		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
		hostna->nm_notify = netmap_bwrap_host_notify;
		hostna->nm_mem = na->nm_mem;
		hostna->na_private = bna;
		hostna->na_vp = &bna->up;
		na->na_hostvp = hwna->na_hostvp =
			hostna->na_hostvp = &bna->host;
		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
	}

	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
		na->name, hwna->name,
		na->num_tx_rings, na->num_tx_desc,
		na->num_rx_rings, na->num_rx_desc);

	error = netmap_attach_common(na);
	if (error) {
		goto err_free;
	}
	/* make the bwrap ifp point to the real ifp.
	 * NOTE: netmap_attach_common() interprets a non-NULL na->ifp
	 * as a request to make the ifp point to the na. Since we
	 * do not want to change the na already pointed to by hwna->ifp,
	 * the following assignment has to be delayed until now
	 */
	na->ifp = hwna->ifp;
	hwna->na_flags |= NAF_BUSY;
	/* make hwna point to the allocator we are actually using,
	 * so that monitors will be able to find it
	 */
	bna->save_nmd = hwna->nm_mem;
	hwna->nm_mem = na->nm_mem;
	return 0;

err_free:
	netmap_mem_private_delete(na->nm_mem);
err_put:
	hwna->na_vp = hwna->na_hostvp = NULL;
	netmap_adapter_put(hwna);
	free(bna, M_DEVBUF);
	return error;
}


void
netmap_init_bridges(void)
{
	int i;

	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
	for (i = 0; i < NM_BRIDGES; i++)
		BDG_RWINIT(&nm_bridges[i]);
}
#endif /* WITH_VALE */