1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (C) 2013-2016 Universita` di Pisa
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *   1. Redistributions of source code must retain the above copyright
11 *      notice, this list of conditions and the following disclaimer.
12 *   2. Redistributions in binary form must reproduce the above copyright
13 *      notice, this list of conditions and the following disclaimer in the
14 *      documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29
30#if defined(__FreeBSD__)
31#include <sys/cdefs.h> /* prerequisite */
32#include <sys/types.h>
33#include <sys/errno.h>
34#include <sys/param.h>	/* defines used in kernel.h */
35#include <sys/kernel.h>	/* types used in module initialization */
36#include <sys/conf.h>	/* cdevsw struct, UID, GID */
37#include <sys/sockio.h>
38#include <sys/socketvar.h>	/* struct socket */
39#include <sys/malloc.h>
40#include <sys/poll.h>
41#include <sys/rwlock.h>
42#include <sys/socket.h> /* sockaddrs */
43#include <sys/selinfo.h>
44#include <sys/sysctl.h>
45#include <net/if.h>
46#include <net/if_var.h>
47#include <net/bpf.h>		/* BIOCIMMEDIATE */
48#include <machine/bus.h>	/* bus_dmamap_* */
49#include <sys/endian.h>
50#include <sys/refcount.h>
51#include <sys/smp.h>
52
53
54#elif defined(linux)
55
56#include "bsd_glue.h"
57
58#elif defined(__APPLE__)
59
60#warning OSX support is only partial
61#include "osx_glue.h"
62
63#elif defined(_WIN32)
64#include "win_glue.h"
65
66#else
67
68#error	Unsupported platform
69
70#endif /* unsupported */
71
72/*
73 * common headers
74 */
75
76#include <net/netmap.h>
77#include <dev/netmap/netmap_kern.h>
78#include <dev/netmap/netmap_mem2.h>
79#include <dev/netmap/netmap_bdg.h>
80
81#ifdef WITH_VALE
82
83/*
84 * system parameters (most of them in netmap_kern.h)
85 * NM_BDG_NAME		prefix for switch port names, default "vale"
86 * NM_BDG_MAXPORTS	number of ports
87 * NM_BRIDGES		max number of switches in the system.
88 *
89 * Switch ports are named valeX:Y where X is the switch name and Y
90 * is the port. If Y matches a physical interface name, the port is
91 * connected to a physical device.
92 *
93 * Unlike physical interfaces, switch ports use their own memory region
94 * for rings and buffers.
95 * The virtual interfaces use per-queue lock instead of core lock.
96 * In the tx loop, we aggregate traffic in batches to make all operations
97 * faster. The batch size is bridge_batch.
98 */
/*
 * Sizing constants for the VALE forwarding code in this file.
 *
 * NM_BDG_MAXRINGS is the number of per-port queue positions reserved in the
 * scratch area; nm_vale_flush() uses (NM_BDG_MAXRINGS - 1) as a bit mask,
 * hence the power-of-2 requirement.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many (must be a pow of 2). */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
/* default value of the bridge_batch sysctl */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
/* actual size of the tables: nm_alloc_bdgfwd() allocates this many
 * nm_bdg_fwd entries, i.e. a full batch plus NETMAP_MAX_FRAGS extra entries
 */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NETMAP_MAX_FRAGS)
/* NM_FT_NULL terminates a list of slots in the ft; it is one past the
 * last valid ft index, so it compares higher than any valid index
 */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
107
108
/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 * nm_vale_preflush() compares the number of queued slots against it,
 * at packet boundaries only, to decide when to flush.
 */
static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */

/* Max number of vale bridges (loader tunable).
 * Used by netmap_vale_list() as the upper bound when scanning bridges.
 */
unsigned int vale_max_bridges = NM_BRIDGES;

/* Export both knobs under dev.netmap: bridge_batch is registered with
 * CTLFLAG_RW, max_bridges with CTLFLAG_RDTUN.
 */
SYSBEGIN(vars_vale);
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
		"Max batch size to be used in the bridge");
SYSCTL_UINT(_dev_netmap, OID_AUTO, max_bridges, CTLFLAG_RDTUN, &vale_max_bridges, 0,
		"Max number of vale bridges");
SYSEND;
126
/*
 * Forward declarations of file-local functions, needed because they are
 * referenced before their definitions: netmap_vale_vp_create() and
 * netmap_vale_bwrap_attach() are installed in vale_bdg_ops below.
 */
static int netmap_vale_vp_create(struct nmreq_header *hdr, if_t,
		struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
static int netmap_vale_vp_bdg_attach(const char *, struct netmap_adapter *,
		struct nm_bridge *);
static int netmap_vale_bwrap_attach(const char *, struct netmap_adapter *);
132
/*
 * For each output interface, nm_vale_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 *
 * The list is threaded through the forwarding table (ft): bq_head and
 * bq_tail hold ft indexes and the entries are linked via ft_next.
 * An empty queue has both set to NM_FT_NULL (see nm_alloc_bdgfwd()).
 */
struct nm_vale_q {
	uint16_t bq_head;	/* ft index of the first packet, or NM_FT_NULL */
	uint16_t bq_tail;	/* ft index of the last packet, or NM_FT_NULL */
	uint32_t bq_len;	/* number of buffers */
};
143
/* Holds the default callbacks.
 * This is the ops table that netmap_vale_create() passes to
 * nm_find_bridge() when it creates a new bridge.
 */
struct netmap_bdg_ops vale_bdg_ops = {
	.lookup = netmap_vale_learning,	/* MAC-learning forwarding decision */
	.config = NULL,
	.dtor = NULL,
	.vp_create = netmap_vale_vp_create,
	.bwrap_attach = netmap_vale_bwrap_attach,
	.name = NM_BDG_NAME,
};
153
154/*
155 * this is a slightly optimized copy routine which rounds
156 * to multiple of 64 bytes and is often faster than dealing
157 * with other odd sizes. We assume there is enough room
158 * in the source and destination buffers.
159 *
160 * XXX only for multiples of NM_BUF_ALIGN bytes, non overlapped.
161 */
162
163static inline void
164pkt_copy(void *_src, void *_dst, int l)
165{
166	uint64_t *src = _src;
167	uint64_t *dst = _dst;
168	if (unlikely(l >= 1024)) {
169		memcpy(dst, src, l);
170		return;
171	}
172	for (; likely(l > 0); l -= NM_BUF_ALIGN) {
173		/* XXX NM_BUF_ALIGN/sizeof(uint64_t) statements */
174		*dst++ = *src++;
175		*dst++ = *src++;
176		*dst++ = *src++;
177		*dst++ = *src++;
178		*dst++ = *src++;
179		*dst++ = *src++;
180		*dst++ = *src++;
181		*dst++ = *src++;
182	}
183}
184
185
186/*
187 * Free the forwarding tables for rings attached to switch ports.
188 */
189static void
190nm_free_bdgfwd(struct netmap_adapter *na)
191{
192	int nrings, i;
193	struct netmap_kring **kring;
194
195	NMG_LOCK_ASSERT();
196	nrings = na->num_tx_rings;
197	kring = na->tx_rings;
198	for (i = 0; i < nrings; i++) {
199		if (kring[i]->nkr_ft) {
200			nm_os_free(kring[i]->nkr_ft);
201			kring[i]->nkr_ft = NULL; /* protect from freeing twice */
202		}
203	}
204}
205
206
207/*
208 * Allocate the forwarding tables for the rings attached to the bridge ports.
209 */
210static int
211nm_alloc_bdgfwd(struct netmap_adapter *na)
212{
213	int nrings, l, i, num_dstq;
214	struct netmap_kring **kring;
215
216	NMG_LOCK_ASSERT();
217	/* all port:rings + broadcast */
218	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
219	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
220	l += sizeof(struct nm_vale_q) * num_dstq;
221	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
222
223	nrings = netmap_real_rings(na, NR_TX);
224	kring = na->tx_rings;
225	for (i = 0; i < nrings; i++) {
226		struct nm_bdg_fwd *ft;
227		struct nm_vale_q *dstq;
228		int j;
229
230		ft = nm_os_malloc(l);
231		if (!ft) {
232			nm_free_bdgfwd(na);
233			return ENOMEM;
234		}
235		dstq = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
236		for (j = 0; j < num_dstq; j++) {
237			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
238			dstq[j].bq_len = 0;
239		}
240		kring[i]->nkr_ft = ft;
241	}
242	return 0;
243}
244
245/* Allows external modules to create bridges in exclusive mode,
246 * returns an authentication token that the external module will need
247 * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(),
248 * and nm_bdg_update_private_data() operations.
249 * Successfully executed if ret != NULL and *return_status == 0.
250 */
251void *
252netmap_vale_create(const char *bdg_name, int *return_status)
253{
254	struct nm_bridge *b = NULL;
255	void *ret = NULL;
256
257	NMG_LOCK();
258	b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
259	if (b) {
260		*return_status = EEXIST;
261		goto unlock_bdg_create;
262	}
263
264	b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops);
265	if (!b) {
266		*return_status = ENOMEM;
267		goto unlock_bdg_create;
268	}
269
270	b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE;
271	ret = nm_bdg_get_auth_token(b);
272	*return_status = 0;
273
274unlock_bdg_create:
275	NMG_UNLOCK();
276	return ret;
277}
278
279/* Allows external modules to destroy a bridge created through
280 * netmap_bdg_create(), the bridge must be empty.
281 */
282int
283netmap_vale_destroy(const char *bdg_name, void *auth_token)
284{
285	struct nm_bridge *b = NULL;
286	int ret = 0;
287
288	NMG_LOCK();
289	b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
290	if (!b) {
291		ret = ENXIO;
292		goto unlock_bdg_free;
293	}
294
295	if (!nm_bdg_valid_auth_token(b, auth_token)) {
296		ret = EACCES;
297		goto unlock_bdg_free;
298	}
299	if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) {
300		ret = EINVAL;
301		goto unlock_bdg_free;
302	}
303
304	b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE);
305	ret = netmap_bdg_free(b);
306	if (ret) {
307		b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE;
308	}
309
310unlock_bdg_free:
311	NMG_UNLOCK();
312	return ret;
313}
314
315/* Process NETMAP_REQ_VALE_LIST. */
316int
317netmap_vale_list(struct nmreq_header *hdr)
318{
319	struct nmreq_vale_list *req =
320		(struct nmreq_vale_list *)(uintptr_t)hdr->nr_body;
321	int namelen = strlen(hdr->nr_name);
322	struct nm_bridge *b, *bridges;
323	struct netmap_vp_adapter *vpna;
324	int error = 0, i, j;
325	u_int num_bridges;
326
327	netmap_bns_getbridges(&bridges, &num_bridges);
328
329	/* this is used to enumerate bridges and ports */
330	if (namelen) { /* look up indexes of bridge and port */
331		if (strncmp(hdr->nr_name, NM_BDG_NAME,
332					strlen(NM_BDG_NAME))) {
333			return EINVAL;
334		}
335		NMG_LOCK();
336		b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
337		if (!b) {
338			NMG_UNLOCK();
339			return ENOENT;
340		}
341
342		req->nr_bridge_idx = b - bridges; /* bridge index */
343		req->nr_port_idx = NM_BDG_NOPORT;
344		for (j = 0; j < b->bdg_active_ports; j++) {
345			i = b->bdg_port_index[j];
346			vpna = b->bdg_ports[i];
347			if (vpna == NULL) {
348				nm_prerr("This should not happen");
349				continue;
350			}
351			/* the former and the latter identify a
352			 * virtual port and a NIC, respectively
353			 */
354			if (!strcmp(vpna->up.name, hdr->nr_name)) {
355				req->nr_port_idx = i; /* port index */
356				break;
357			}
358		}
359		NMG_UNLOCK();
360	} else {
361		/* return the first non-empty entry starting from
362		 * bridge nr_arg1 and port nr_arg2.
363		 *
364		 * Users can detect the end of the same bridge by
365		 * seeing the new and old value of nr_arg1, and can
366		 * detect the end of all the bridge by error != 0
367		 */
368		i = req->nr_bridge_idx;
369		j = req->nr_port_idx;
370
371		NMG_LOCK();
372		for (error = ENOENT; i < vale_max_bridges; i++) {
373			b = bridges + i;
374			for ( ; j < NM_BDG_MAXPORTS; j++) {
375				if (b->bdg_ports[j] == NULL)
376					continue;
377				vpna = b->bdg_ports[j];
378				/* write back the VALE switch name */
379				strlcpy(hdr->nr_name, vpna->up.name,
380					sizeof(hdr->nr_name));
381				error = 0;
382				goto out;
383			}
384			j = 0; /* following bridges scan from 0 */
385		}
386	out:
387		req->nr_bridge_idx = i;
388		req->nr_port_idx = j;
389		NMG_UNLOCK();
390	}
391
392	return error;
393}
394
395
396/* nm_dtor callback for ephemeral VALE ports */
397static void
398netmap_vale_vp_dtor(struct netmap_adapter *na)
399{
400	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
401	struct nm_bridge *b = vpna->na_bdg;
402
403	nm_prdis("%s has %d references", na->name, na->na_refcount);
404
405	if (b) {
406		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
407	}
408
409	if (na->ifp != NULL && !nm_iszombie(na)) {
410		NM_DETACH_NA(na->ifp);
411		if (vpna->autodelete) {
412			nm_prdis("releasing %s", if_name(na->ifp));
413			NMG_UNLOCK();
414			nm_os_vi_detach(na->ifp);
415			NMG_LOCK();
416		}
417	}
418}
419
420
421
422/* nm_krings_create callback for VALE ports.
423 * Calls the standard netmap_krings_create, then adds leases on rx
424 * rings and bdgfwd on tx rings.
425 */
426static int
427netmap_vale_vp_krings_create(struct netmap_adapter *na)
428{
429	u_int tailroom;
430	int error, i;
431	uint32_t *leases;
432	u_int nrx = netmap_real_rings(na, NR_RX);
433
434	/*
435	 * Leases are attached to RX rings on vale ports
436	 */
437	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
438
439	error = netmap_krings_create(na, tailroom);
440	if (error)
441		return error;
442
443	leases = na->tailroom;
444
445	for (i = 0; i < nrx; i++) { /* Receive rings */
446		na->rx_rings[i]->nkr_leases = leases;
447		leases += na->num_rx_desc;
448	}
449
450	error = nm_alloc_bdgfwd(na);
451	if (error) {
452		netmap_krings_delete(na);
453		return error;
454	}
455
456	return 0;
457}
458
459
/* nm_krings_delete callback for VALE ports.
 * Undoes netmap_vale_vp_krings_create(): the forwarding tables are
 * released first, because they hang off the TX krings that
 * netmap_krings_delete() is about to tear down.
 */
static void
netmap_vale_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}
467
468
/* Forward declaration: nm_vale_preflush() below calls nm_vale_flush()
 * before its definition. It is handed the forwarding table, the number
 * of entries queued in it, the source port and the source ring index.
 */
static int
nm_vale_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);
472
473
/*
 * main dispatch routine for the bridge.
 * Grab packets from a kring, move them into the ft structure
 * associated to the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 *
 * Slots are scanned from kring->nr_hwcur up to, but not including, 'end'.
 * Each slot becomes one ft entry; consecutive slots flagged NS_MOREFRAG
 * are grouped into one packet whose first entry records the fragment
 * count in ft_frags. The table is handed to nm_vale_flush() whenever a
 * packet boundary is reached with at least bridge_batch entries queued,
 * and once more at the end for whatever is left.
 *
 * Returns the next position in the ring: 'end' after a full scan, or the
 * unchanged nr_hwcur if the bridge lock could not be taken without
 * sleeping.
 *
 * NOTE(review): ft_i is not checked against NM_BDG_BATCH_MAX in this
 * function; presumably callers clamp bridge_batch and fragment chains
 * are bounded by NETMAP_MAX_FRAGS -- confirm against the callers.
 */
static int
nm_vale_preflush(struct netmap_kring *kring, u_int end)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter*)kring->na;
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int ring_nr = kring->ring_id;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	nm_prdis("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return j; /* nothing consumed: j is still nr_hwcur */
	nm_prdis(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		/* snapshot length and flags of the slot into the ft entry */
		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;
		ft[ft_i].ft_offset = 0;

		nm_prdis("flags is 0x%x", slot->flags);
		/* we do not use the buf changed flag, but we still need to reset it */
		slot->flags &= ~NS_BUF_CHANGED;

		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		/* indirect slots carry the buffer address in slot->ptr,
		 * all the others are resolved through NMB_O()
		 */
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : NMB_O(kring, slot);
		/* a NULL buffer, or a length that does not fit in the buffer
		 * past the slot offset, turns the entry into an empty one
		 * pointing at the base buffer
		 */
		if (unlikely(buf == NULL ||
		     slot->len > NETMAP_BUF_SIZE(&na->up) - nm_get_offset(kring, slot))) {
			nm_prlim(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
				kring->name, j, ft[ft_i].ft_len);
			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
			ft[ft_i].ft_len = 0;
			ft[ft_i].ft_flags = 0;
		}
		__builtin_prefetch(buf);
		++ft_i;
		/* more fragments follow: keep accumulating, no flush yet */
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			nm_prlim(5, "%d frags at %d", frags, ft_i - frags);
		/* packet complete: store the fragment count in its first entry */
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		/* flush only at packet boundaries; nm_vale_flush() returns
		 * the index from which the table is filled next
		 */
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		/* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
		 * have to fix frags count. */
		frags--;
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags;
		nm_prlim(5, "Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
	}
	/* forward whatever is still queued */
	if (ft_i)
		ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}
557
558
559/* ----- FreeBSD if_bridge hash function ------- */
560
561/*
562 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
563 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
564 *
565 * http://www.burtleburtle.net/bob/hash/spooky.html
566 */
567#define mix(a, b, c)                                                    \
568do {                                                                    \
569	a -= b; a -= c; a ^= (c >> 13);                                 \
570	b -= c; b -= a; b ^= (a << 8);                                  \
571	c -= a; c -= b; c ^= (b >> 13);                                 \
572	a -= b; a -= c; a ^= (c >> 12);                                 \
573	b -= c; b -= a; b ^= (a << 16);                                 \
574	c -= a; c -= b; c ^= (b >> 5);                                  \
575	a -= b; a -= c; a ^= (c >> 3);                                  \
576	b -= c; b -= a; b ^= (a << 10);                                 \
577	c -= a; c -= b; c ^= (b >> 15);                                 \
578} while (/*CONSTCOND*/0)
579
580
581static __inline uint32_t
582nm_vale_rthash(const uint8_t *addr)
583{
584	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
585
586	b += addr[5] << 8;
587	b += addr[4];
588	a += addr[3] << 24;
589	a += addr[2] << 16;
590	a += addr[1] << 8;
591	a += addr[0];
592
593	mix(a, b, c);
594#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
595	return (c & BRIDGE_RTHASH_MASK);
596}
597
598#undef mix
599
600
601/*
602 * Lookup function for a learning bridge.
603 * Update the hash table with the source address,
604 * and then returns the destination port index, and the
605 * ring in *dst_ring (at the moment, always use ring 0)
606 */
607uint32_t
608netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
609		struct netmap_vp_adapter *na, void *private_data)
610{
611	uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset;
612	u_int buf_len = ft->ft_len - ft->ft_offset;
613	struct nm_hash_ent *ht = private_data;
614	uint32_t sh, dh;
615	u_int dst, mysrc = na->bdg_port;
616	uint64_t smac, dmac;
617	uint8_t indbuf[12];
618
619	if (buf_len < 14) {
620		return NM_BDG_NOPORT;
621	}
622
623	if (ft->ft_flags & NS_INDIRECT) {
624		if (copyin(buf, indbuf, sizeof(indbuf))) {
625			return NM_BDG_NOPORT;
626		}
627		buf = indbuf;
628	}
629
630	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
631	smac = le64toh(*(uint64_t *)(buf + 4));
632	smac >>= 16;
633
634	/*
635	 * The hash is somewhat expensive, there might be some
636	 * worthwhile optimizations here.
637	 */
638	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
639		uint8_t *s = buf+6;
640		sh = nm_vale_rthash(s); /* hash of source */
641		/* update source port forwarding entry */
642		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
643		ht[sh].ports = mysrc;
644		if (netmap_debug & NM_DEBUG_VALE)
645		    nm_prinf("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
646			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
647	}
648	dst = NM_BDG_BROADCAST;
649	if ((buf[0] & 1) == 0) { /* unicast */
650		dh = nm_vale_rthash(buf); /* hash of dst */
651		if (ht[dh].mac == dmac) {	/* found dst */
652			dst = ht[dh].ports;
653		}
654	}
655	return dst;
656}
657
658
659/*
660 * Available space in the ring. Only used in VALE code
661 * and only with is_rx = 1
662 */
663static inline uint32_t
664nm_kr_space(struct netmap_kring *k, int is_rx)
665{
666	int space;
667
668	if (is_rx) {
669		int busy = k->nkr_hwlease - k->nr_hwcur;
670		if (busy < 0)
671			busy += k->nkr_num_slots;
672		space = k->nkr_num_slots - 1 - busy;
673	} else {
674		/* XXX never used in this branch */
675		space = k->nr_hwtail - k->nkr_hwlease;
676		if (space < 0)
677			space += k->nkr_num_slots;
678	}
679#if 0
680	// sanity check
681	if (k->nkr_hwlease >= k->nkr_num_slots ||
682		k->nr_hwcur >= k->nkr_num_slots ||
683		k->nr_tail >= k->nkr_num_slots ||
684		busy < 0 ||
685		busy >= k->nkr_num_slots) {
686		nm_prerr("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
687		    k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
688		    k->nkr_lease_idx, k->nkr_num_slots);
689	}
690#endif
691	return space;
692}
693
694
695
696
697/* make a lease on the kring for N positions. return the
698 * lease index
699 * XXX only used in VALE code and with is_rx = 1
700 */
701static inline uint32_t
702nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
703{
704	uint32_t lim = k->nkr_num_slots - 1;
705	uint32_t lease_idx = k->nkr_lease_idx;
706
707	k->nkr_leases[lease_idx] = NR_NOSLOT;
708	k->nkr_lease_idx = nm_next(lease_idx, lim);
709
710#ifdef CONFIG_NETMAP_DEBUG
711	if (n > nm_kr_space(k, is_rx)) {
712		nm_prerr("invalid request for %d slots", n);
713		panic("x");
714	}
715#endif /* CONFIG NETMAP_DEBUG */
716	/* XXX verify that there are n slots */
717	k->nkr_hwlease += n;
718	if (k->nkr_hwlease > lim)
719		k->nkr_hwlease -= lim + 1;
720
721#ifdef CONFIG_NETMAP_DEBUG
722	if (k->nkr_hwlease >= k->nkr_num_slots ||
723		k->nr_hwcur >= k->nkr_num_slots ||
724		k->nr_hwtail >= k->nkr_num_slots ||
725		k->nkr_lease_idx >= k->nkr_num_slots) {
726		nm_prerr("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
727			k->na->name,
728			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
729			k->nkr_lease_idx, k->nkr_num_slots);
730	}
731#endif /* CONFIG_NETMAP_DEBUG */
732	return lease_idx;
733}
734
735/*
736 *
737 * This flush routine supports only unicast and broadcast but a large
738 * number of ports, and lets us replace the learn and dispatch functions.
739 */
740int
741nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
742		u_int ring_nr)
743{
744	struct nm_vale_q *dst_ents, *brddst;
745	uint16_t num_dsts = 0, *dsts;
746	struct nm_bridge *b = na->na_bdg;
747	u_int i, me = na->bdg_port;
748
	/*
	 * The work area (pointed to by ft) is followed by an array of
	 * queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
755	dst_ents = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
756	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
757
758	/* first pass: find a destination for each packet in the batch */
759	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
760		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
761		uint16_t dst_port, d_i;
762		struct nm_vale_q *d;
763		struct nm_bdg_fwd *start_ft = NULL;
764
765		nm_prdis("slot %d frags %d", i, ft[i].ft_frags);
766
767		if (na->up.virt_hdr_len < ft[i].ft_len) {
768			ft[i].ft_offset = na->up.virt_hdr_len;
769			start_ft = &ft[i];
770		} else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) {
771			ft[i].ft_offset = ft[i].ft_len;
772			start_ft = &ft[i+1];
773		} else {
774			/* Drop the packet if the virtio-net header is not into the first
775			 * fragment nor at the very beginning of the second.
776			 */
777			continue;
778		}
779		dst_port = b->bdg_ops.lookup(start_ft, &dst_ring, na, b->private_data);
780		if (netmap_verbose > 255)
781			nm_prlim(5, "slot %d port %d -> %d", i, me, dst_port);
782		if (dst_port >= NM_BDG_NOPORT)
783			continue; /* this packet is identified to be dropped */
784		else if (dst_port == NM_BDG_BROADCAST)
785			dst_ring = 0; /* broadcasts always go to ring 0 */
786		else if (unlikely(dst_port == me ||
787		    !b->bdg_ports[dst_port]))
788			continue;
789
790		/* get a position in the scratch pad */
791		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
792		d = dst_ents + d_i;
793
794		/* append the first fragment to the list */
795		if (d->bq_head == NM_FT_NULL) { /* new destination */
796			d->bq_head = d->bq_tail = i;
797			/* remember this position to be scanned later */
798			if (dst_port != NM_BDG_BROADCAST)
799				dsts[num_dsts++] = d_i;
800		} else {
801			ft[d->bq_tail].ft_next = i;
802			d->bq_tail = i;
803		}
804		d->bq_len += ft[i].ft_frags;
805	}
806
807	/*
808	 * Broadcast traffic goes to ring 0 on all destinations.
809	 * So we need to add these rings to the list of ports to scan.
810	 */
811	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
812	if (brddst->bq_head != NM_FT_NULL) {
813		u_int j;
814		for (j = 0; likely(j < b->bdg_active_ports); j++) {
815			uint16_t d_i;
816			i = b->bdg_port_index[j];
817			if (unlikely(i == me))
818				continue;
819			d_i = i * NM_BDG_MAXRINGS;
820			if (dst_ents[d_i].bq_head == NM_FT_NULL)
821				dsts[num_dsts++] = d_i;
822		}
823	}
824
825	nm_prdis(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
826	/* second pass: scan destinations */
827	for (i = 0; i < num_dsts; i++) {
828		struct netmap_vp_adapter *dst_na;
829		struct netmap_kring *kring;
830		struct netmap_ring *ring;
831		u_int dst_nr, lim, j, d_i, next, brd_next;
832		u_int needed, howmany;
833		int retry = netmap_txsync_retry;
834		struct nm_vale_q *d;
835		uint32_t my_start = 0, lease_idx = 0;
836		int nrings;
837		int virt_hdr_mismatch = 0;
838
839		d_i = dsts[i];
840		nm_prdis("second pass %d port %d", i, d_i);
841		d = dst_ents + d_i;
842		// XXX fix the division
843		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
844		/* protect from the lookup function returning an inactive
845		 * destination port
846		 */
847		if (unlikely(dst_na == NULL))
848			goto cleanup;
849		if (dst_na->up.na_flags & NAF_SW_ONLY)
850			goto cleanup;
851		/*
852		 * The interface may be in !netmap mode in two cases:
853		 * - when na is attached but not activated yet;
854		 * - when na is being deactivated but is still attached.
855		 */
856		if (unlikely(!nm_netmap_on(&dst_na->up))) {
857			nm_prdis("not in netmap mode!");
858			goto cleanup;
859		}
860
861		/* there is at least one either unicast or broadcast packet */
862		brd_next = brddst->bq_head;
863		next = d->bq_head;
864		/* we need to reserve this many slots. If fewer are
865		 * available, some packets will be dropped.
866		 * Packets may have multiple fragments, so
867		 * there is a chance that we may not use all of the slots
868		 * we have claimed, so we will need to handle the leftover
869		 * ones when we regain the lock.
870		 */
871		needed = d->bq_len + brddst->bq_len;
872
873		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
874			if (netmap_verbose) {
875				nm_prlim(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
876						dst_na->up.virt_hdr_len);
877			}
878			/* There is a virtio-net header/offloadings mismatch between
879			 * source and destination. The slower mismatch datapath will
880			 * be used to cope with all the mismatches.
881			 */
882			virt_hdr_mismatch = 1;
883			if (dst_na->mfs < na->mfs) {
884				/* We may need to do segmentation offloadings, and so
885				 * we may need a number of destination slots greater
886				 * than the number of input slots ('needed').
887				 * We look for the smallest integer 'x' which satisfies:
888				 *	needed * na->mfs + x * H <= x * na->mfs
889				 * where 'H' is the length of the longest header that may
890				 * be replicated in the segmentation process (e.g. for
891				 * TCPv4 we must account for ethernet header, IP header
892				 * and TCPv4 header).
893				 */
894				KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
895				needed = (needed * na->mfs) /
896						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
897				nm_prdis(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
898			}
899		}
900
901		nm_prdis(5, "pass 2 dst %d is %x %s",
902			i, d_i, nm_is_bwrap(&dst_na->up) ? "nic/host" : "virtual");
903		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
904		nrings = dst_na->up.num_rx_rings;
905		if (dst_nr >= nrings)
906			dst_nr = dst_nr % nrings;
907		kring = dst_na->up.rx_rings[dst_nr];
908		ring = kring->ring;
909		/* the destination ring may have not been opened for RX */
910		if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
911			goto cleanup;
912		lim = kring->nkr_num_slots - 1;
913
914retry:
915
916		if (dst_na->retry && retry) {
917			/* try to get some free slot from the previous run */
918			kring->nm_notify(kring, NAF_FORCE_RECLAIM);
919			/* actually useful only for bwraps, since there
920			 * the notify will trigger a txsync on the hwna. VALE ports
921			 * have dst_na->retry == 0
922			 */
923		}
924		/* reserve the buffers in the queue and an entry
925		 * to report completion, and drop lock.
926		 * XXX this might become a helper function.
927		 */
928		mtx_lock(&kring->q_lock);
929		if (kring->nkr_stopped) {
930			mtx_unlock(&kring->q_lock);
931			goto cleanup;
932		}
933		my_start = j = kring->nkr_hwlease;
934		howmany = nm_kr_space(kring, 1);
935		if (needed < howmany)
936			howmany = needed;
937		lease_idx = nm_kr_lease(kring, howmany, 1);
938		mtx_unlock(&kring->q_lock);
939
940		/* only retry if we need more than available slots */
941		if (retry && needed <= howmany)
942			retry = 0;
943
944		/* copy to the destination queue */
945		while (howmany > 0) {
946			struct netmap_slot *slot;
947			struct nm_bdg_fwd *ft_p, *ft_end;
948			u_int cnt;
949
950			/* find the queue from which we pick next packet.
951			 * NM_FT_NULL is always higher than valid indexes
952			 * so we never dereference it if the other list
953			 * has packets (and if both are empty we never
954			 * get here).
955			 */
956			if (next < brd_next) {
957				ft_p = ft + next;
958				next = ft_p->ft_next;
959			} else { /* insert broadcast */
960				ft_p = ft + brd_next;
961				brd_next = ft_p->ft_next;
962			}
963			cnt = ft_p->ft_frags; // cnt > 0
964			if (unlikely(cnt > howmany))
965			    break; /* no more space */
966			if (netmap_verbose && cnt > 1)
967				nm_prlim(5, "rx %d frags to %d", cnt, j);
968			ft_end = ft_p + cnt;
969			if (unlikely(virt_hdr_mismatch)) {
970				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
971			} else {
972				howmany -= cnt;
973				do {
974					char *dst, *src = ft_p->ft_buf;
975					size_t copy_len = ft_p->ft_len, dst_len = copy_len;
976					uintptr_t src_cb;
977					uint64_t dstoff, dstoff_cb;
978					int src_co, dst_co;
979					const uintptr_t mask = NM_BUF_ALIGN - 1;
980
981					slot = &ring->slot[j];
982					dst = NMB(&dst_na->up, slot);
983					dstoff = nm_get_offset(kring, slot);
984					dstoff_cb = dstoff & ~mask;
985					src_cb = ((uintptr_t)src) & ~mask;
986					src_co = ((uintptr_t)src) & mask;
987					dst_co = ((uintptr_t)(dst + dstoff)) & mask;
988					if (dst_co < src_co) {
989						dstoff_cb += NM_BUF_ALIGN;
990					}
991					dstoff = dstoff_cb + src_co;
992					copy_len += src_co;
993
994					nm_prdis("send [%d] %d(%d) bytes at %s:%d",
995							i, (int)copy_len, (int)dst_len,
996							NM_IFPNAME(dst_ifp), j);
997
998					if (unlikely(dstoff > NETMAP_BUF_SIZE(&dst_na->up) ||
999				                     dst_len > NETMAP_BUF_SIZE(&dst_na->up) - dstoff)) {
1000						nm_prlim(5, "dropping packet/fragment of len %zu, dest offset %llu",
1001								dst_len, (unsigned long long)dstoff);
1002						copy_len = dst_len = 0;
1003						dstoff = nm_get_offset(kring, slot);
1004					}
1005
1006					if (ft_p->ft_flags & NS_INDIRECT) {
1007						if (copyin(src, dst, copy_len)) {
1008							// invalid user pointer, pretend len is 0
1009							dst_len = 0;
1010						}
1011					} else {
1012						//memcpy(dst, src, copy_len);
1013						pkt_copy((char *)src_cb, dst + dstoff_cb, (int)copy_len);
1014					}
1015					slot->len = dst_len;
1016					slot->flags = (cnt << 8)| NS_MOREFRAG;
1017					nm_write_offset(kring, slot, dstoff);
1018					j = nm_next(j, lim);
1019					needed--;
1020					ft_p++;
1021				} while (ft_p != ft_end);
1022				slot->flags = (cnt << 8); /* clear flag on last entry */
1023			}
1024			/* are we done ? */
1025			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
1026				break;
1027		}
1028		{
1029		    /* current position */
1030		    uint32_t *p = kring->nkr_leases; /* shorthand */
1031		    uint32_t update_pos;
1032		    int still_locked = 1;
1033
1034		    mtx_lock(&kring->q_lock);
1035		    if (unlikely(howmany > 0)) {
1036			/* not used all bufs. If i am the last one
1037			 * i can recover the slots, otherwise must
1038			 * fill them with 0 to mark empty packets.
1039			 */
1040			nm_prdis("leftover %d bufs", howmany);
1041			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
1042			    /* yes i am the last one */
1043			    nm_prdis("roll back nkr_hwlease to %d", j);
1044			    kring->nkr_hwlease = j;
1045			} else {
1046			    while (howmany-- > 0) {
1047				ring->slot[j].len = 0;
1048				ring->slot[j].flags = 0;
1049				j = nm_next(j, lim);
1050			    }
1051			}
1052		    }
1053		    p[lease_idx] = j; /* report I am done */
1054
1055		    update_pos = kring->nr_hwtail;
1056
1057		    if (my_start == update_pos) {
1058			/* all slots before my_start have been reported,
1059			 * so scan subsequent leases to see if other ranges
			 * have been completed, and do a selwakeup or txsync.
1061		         */
1062			while (lease_idx != kring->nkr_lease_idx &&
1063				p[lease_idx] != NR_NOSLOT) {
1064			    j = p[lease_idx];
1065			    p[lease_idx] = NR_NOSLOT;
1066			    lease_idx = nm_next(lease_idx, lim);
1067			}
1068			/* j is the new 'write' position. j != my_start
1069			 * means there are new buffers to report
1070			 */
1071			if (likely(j != my_start)) {
1072				kring->nr_hwtail = j;
1073				still_locked = 0;
1074				mtx_unlock(&kring->q_lock);
1075				kring->nm_notify(kring, 0);
1076				/* this is netmap_notify for VALE ports and
1077				 * netmap_bwrap_notify for bwrap. The latter will
1078				 * trigger a txsync on the underlying hwna
1079				 */
1080				if (dst_na->retry && retry--) {
1081					/* XXX this is going to call nm_notify again.
1082					 * Only useful for bwrap in virtual machines
1083					 */
1084					goto retry;
1085				}
1086			}
1087		    }
1088		    if (still_locked)
1089			mtx_unlock(&kring->q_lock);
1090		}
1091cleanup:
1092		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
1093		d->bq_len = 0;
1094	}
1095	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
1096	brddst->bq_len = 0;
1097	return 0;
1098}
1099
1100/* nm_txsync callback for VALE ports */
1101static int
1102netmap_vale_vp_txsync(struct netmap_kring *kring, int flags)
1103{
1104	struct netmap_vp_adapter *na =
1105		(struct netmap_vp_adapter *)kring->na;
1106	u_int done;
1107	u_int const lim = kring->nkr_num_slots - 1;
1108	u_int const head = kring->rhead;
1109
1110	if (bridge_batch <= 0) { /* testing only */
1111		done = head; // used all
1112		goto done;
1113	}
1114	if (!na->na_bdg) {
1115		done = head;
1116		goto done;
1117	}
1118	if (bridge_batch > NM_BDG_BATCH)
1119		bridge_batch = NM_BDG_BATCH;
1120
1121	done = nm_vale_preflush(kring, head);
1122done:
1123	if (done != head)
1124		nm_prerr("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
1125	/*
1126	 * packets between 'done' and 'cur' are left unsent.
1127	 */
1128	kring->nr_hwcur = done;
1129	kring->nr_hwtail = nm_prev(done, lim);
1130	if (netmap_debug & NM_DEBUG_TXSYNC)
1131		nm_prinf("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
1132	return 0;
1133}
1134
1135
1136/* create a netmap_vp_adapter that describes a VALE port.
1137 * Only persistent VALE ports have a non-null ifp.
1138 */
1139static int
1140netmap_vale_vp_create(struct nmreq_header *hdr, if_t ifp,
1141		struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret)
1142{
1143	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1144	struct netmap_vp_adapter *vpna;
1145	struct netmap_adapter *na;
1146	int error = 0;
1147	u_int npipes = 0;
1148	u_int extrabufs = 0;
1149
1150	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1151		return EINVAL;
1152	}
1153
1154	vpna = nm_os_malloc(sizeof(*vpna));
1155	if (vpna == NULL)
1156		return ENOMEM;
1157
1158 	na = &vpna->up;
1159
1160	na->ifp = ifp;
1161	strlcpy(na->name, hdr->nr_name, sizeof(na->name));
1162
1163	/* bound checking */
1164	na->num_tx_rings = req->nr_tx_rings;
1165	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1166	req->nr_tx_rings = na->num_tx_rings; /* write back */
1167	na->num_rx_rings = req->nr_rx_rings;
1168	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1169	req->nr_rx_rings = na->num_rx_rings; /* write back */
1170	nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1171			1, NM_BDG_MAXSLOTS, NULL);
1172	na->num_tx_desc = req->nr_tx_slots;
1173	nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1174			1, NM_BDG_MAXSLOTS, NULL);
1175	/* validate number of pipes. We want at least 1,
1176	 * but probably can do with some more.
1177	 * So let's use 2 as default (when 0 is supplied)
1178	 */
1179	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
1180	/* validate extra bufs */
1181	extrabufs = req->nr_extra_bufs;
1182	nm_bound_var(&extrabufs, 0, 0,
1183			128*NM_BDG_MAXSLOTS, NULL);
1184	req->nr_extra_bufs = extrabufs; /* write back */
1185	na->num_rx_desc = req->nr_rx_slots;
1186	/* Set the mfs to a default value, as it is needed on the VALE
1187	 * mismatch datapath. XXX We should set it according to the MTU
1188	 * known to the kernel. */
1189	vpna->mfs = NM_BDG_MFS_DEFAULT;
1190	vpna->last_smac = ~0llu;
1191	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
1192		vpna->mfs = netmap_buf_size; */
1193	if (netmap_verbose)
1194		nm_prinf("max frame size %u", vpna->mfs);
1195
1196	na->na_flags |= (NAF_BDG_MAYSLEEP | NAF_OFFSETS);
1197	/* persistent VALE ports look like hw devices
1198	 * with a native netmap adapter
1199	 */
1200	if (ifp)
1201		na->na_flags |= NAF_NATIVE;
1202	na->nm_txsync = netmap_vale_vp_txsync;
1203	na->nm_rxsync = netmap_vp_rxsync; /* use the one provided by bdg */
1204	na->nm_register = netmap_vp_reg;  /* use the one provided by bdg */
1205	na->nm_krings_create = netmap_vale_vp_krings_create;
1206	na->nm_krings_delete = netmap_vale_vp_krings_delete;
1207	na->nm_dtor = netmap_vale_vp_dtor;
1208	nm_prdis("nr_mem_id %d", req->nr_mem_id);
1209	na->nm_mem = nmd ?
1210		netmap_mem_get(nmd):
1211		netmap_mem_private_new(
1212			na->num_tx_rings, na->num_tx_desc,
1213			na->num_rx_rings, na->num_rx_desc,
1214			req->nr_extra_bufs, npipes, &error);
1215	if (na->nm_mem == NULL)
1216		goto err;
1217	na->nm_bdg_attach = netmap_vale_vp_bdg_attach;
1218	/* other nmd fields are set in the common routine */
1219	error = netmap_attach_common(na);
1220	if (error)
1221		goto err;
1222	*ret = vpna;
1223	return 0;
1224
1225err:
1226	if (na->nm_mem != NULL)
1227		netmap_mem_put(na->nm_mem);
1228	nm_os_free(vpna);
1229	return error;
1230}
1231
1232/* nm_bdg_attach callback for VALE ports
1233 * The na_vp port is this same netmap_adapter. There is no host port.
1234 */
1235static int
1236netmap_vale_vp_bdg_attach(const char *name, struct netmap_adapter *na,
1237		struct nm_bridge *b)
1238{
1239	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
1240
1241	if ((b->bdg_flags & NM_BDG_NEED_BWRAP) || vpna->na_bdg) {
1242		return NM_NEED_BWRAP;
1243	}
1244	na->na_vp = vpna;
1245	strlcpy(na->name, name, sizeof(na->name));
1246	na->na_hostvp = NULL;
1247	return 0;
1248}
1249
/*
 * nm_krings_create callback for VALE bwrap adapters.
 *
 * Creates the krings as a netmap_vp_adapter would, then runs the common
 * bwrap step; if the latter fails, the former is undone. Returns 0 on
 * success or the error reported by the failing step.
 */
static int
netmap_vale_bwrap_krings_create(struct netmap_adapter *na)
{
	/* impersonate a netmap_vp_adapter */
	int error = netmap_vale_vp_krings_create(na);

	if (error == 0) {
		error = netmap_bwrap_krings_create_common(na);
		if (error != 0) {
			netmap_vale_vp_krings_delete(na);
		}
	}
	return error;
}
1265
/*
 * nm_krings_delete callback for VALE bwrap adapters: runs the common
 * bwrap krings teardown and then the VALE port krings teardown.
 */
static void
netmap_vale_bwrap_krings_delete(struct netmap_adapter *na)
{
	/* common bwrap part first, then the netmap_vp_adapter krings */
	netmap_bwrap_krings_delete_common(na);
	netmap_vale_vp_krings_delete(na);
}
1272
1273static int
1274netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
1275{
1276	struct netmap_bwrap_adapter *bna;
1277	struct netmap_adapter *na = NULL;
1278	struct netmap_adapter *hostna = NULL;
1279	int error;
1280
1281	bna = nm_os_malloc(sizeof(*bna));
1282	if (bna == NULL) {
1283		return ENOMEM;
1284	}
1285	na = &bna->up.up;
1286	strlcpy(na->name, nr_name, sizeof(na->name));
1287	na->nm_register = netmap_bwrap_reg;
1288	na->nm_txsync = netmap_vale_vp_txsync;
1289	// na->nm_rxsync = netmap_bwrap_rxsync;
1290	na->nm_krings_create = netmap_vale_bwrap_krings_create;
1291	na->nm_krings_delete = netmap_vale_bwrap_krings_delete;
1292	na->nm_notify = netmap_bwrap_notify;
1293	bna->nm_intr_notify = netmap_bwrap_intr_notify;
1294	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
1295	/* Set the mfs, needed on the VALE mismatch datapath. */
1296	bna->up.mfs = NM_BDG_MFS_DEFAULT;
1297
1298	if (hwna->na_flags & NAF_HOST_RINGS) {
1299		hostna = &bna->host.up;
1300		hostna->nm_notify = netmap_bwrap_notify;
1301		bna->host.mfs = NM_BDG_MFS_DEFAULT;
1302	}
1303
1304	error = netmap_bwrap_attach_common(na, hwna);
1305	if (error) {
1306		nm_os_free(bna);
1307	}
1308	return error;
1309}
1310
1311int
1312netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na,
1313		struct netmap_mem_d *nmd, int create)
1314{
1315	return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops);
1316}
1317
1318
1319/* creates a persistent VALE port */
1320int
1321nm_vi_create(struct nmreq_header *hdr)
1322{
1323	struct nmreq_vale_newif *req =
1324		(struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body;
1325	int error = 0;
1326	/* Build a nmreq_register out of the nmreq_vale_newif,
1327	 * so that we can call netmap_get_bdg_na(). */
1328	struct nmreq_register regreq;
1329	bzero(&regreq, sizeof(regreq));
1330	regreq.nr_tx_slots = req->nr_tx_slots;
1331	regreq.nr_rx_slots = req->nr_rx_slots;
1332	regreq.nr_tx_rings = req->nr_tx_rings;
1333	regreq.nr_rx_rings = req->nr_rx_rings;
1334	regreq.nr_mem_id = req->nr_mem_id;
1335	hdr->nr_reqtype = NETMAP_REQ_REGISTER;
1336	hdr->nr_body = (uintptr_t)&regreq;
1337	error = netmap_vi_create(hdr, 0 /* no autodelete */);
1338	hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF;
1339	hdr->nr_body = (uintptr_t)req;
1340	/* Write back to the original struct. */
1341	req->nr_tx_slots = regreq.nr_tx_slots;
1342	req->nr_rx_slots = regreq.nr_rx_slots;
1343	req->nr_tx_rings = regreq.nr_tx_rings;
1344	req->nr_rx_rings = regreq.nr_rx_rings;
1345	req->nr_mem_id = regreq.nr_mem_id;
1346	return error;
1347}
1348
1349/* remove a persistent VALE port from the system */
1350int
1351nm_vi_destroy(const char *name)
1352{
1353	if_t ifp;
1354	struct netmap_vp_adapter *vpna;
1355	int error;
1356
1357	ifp = ifunit_ref(name);
1358	if (!ifp)
1359		return ENXIO;
1360	NMG_LOCK();
1361	/* make sure this is actually a VALE port */
1362	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
1363		error = EINVAL;
1364		goto err;
1365	}
1366
1367	vpna = (struct netmap_vp_adapter *)NA(ifp);
1368
1369	/* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
1370	if (vpna->autodelete) {
1371		error = EINVAL;
1372		goto err;
1373	}
1374
1375	/* also make sure that nobody is using the interface */
1376	if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
1377	    vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
1378		error = EBUSY;
1379		goto err;
1380	}
1381
1382	NMG_UNLOCK();
1383
1384	if (netmap_verbose)
1385		nm_prinf("destroying a persistent vale interface %s", if_name(ifp));
1386	/* Linux requires all the references are released
1387	 * before unregister
1388	 */
1389	netmap_detach(ifp);
1390	if_rele(ifp);
1391	nm_os_vi_detach(ifp);
1392	return 0;
1393
1394err:
1395	NMG_UNLOCK();
1396	if_rele(ifp);
1397	return error;
1398}
1399
1400static int
1401nm_update_info(struct nmreq_register *req, struct netmap_adapter *na)
1402{
1403	req->nr_rx_rings = na->num_rx_rings;
1404	req->nr_tx_rings = na->num_tx_rings;
1405	req->nr_rx_slots = na->num_rx_desc;
1406	req->nr_tx_slots = na->num_tx_desc;
1407	return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL,
1408					&req->nr_mem_id);
1409}
1410
1411
1412/*
1413 * Create a virtual interface registered to the system.
1414 * The interface will be attached to a bridge later.
1415 */
1416int
1417netmap_vi_create(struct nmreq_header *hdr, int autodelete)
1418{
1419	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1420	if_t ifp;
1421	struct netmap_vp_adapter *vpna;
1422	struct netmap_mem_d *nmd = NULL;
1423	int error;
1424
1425	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1426		return EINVAL;
1427	}
1428
1429	/* don't include VALE prefix */
1430	if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
1431		return EINVAL;
1432	if (strlen(hdr->nr_name) >= IFNAMSIZ) {
1433		return EINVAL;
1434	}
1435	ifp = ifunit_ref(hdr->nr_name);
1436	if (ifp) { /* already exist, cannot create new one */
1437		error = EEXIST;
1438		NMG_LOCK();
1439		if (NM_NA_VALID(ifp)) {
1440			int update_err = nm_update_info(req, NA(ifp));
1441			if (update_err)
1442				error = update_err;
1443		}
1444		NMG_UNLOCK();
1445		if_rele(ifp);
1446		return error;
1447	}
1448	error = nm_os_vi_persist(hdr->nr_name, &ifp);
1449	if (error)
1450		return error;
1451
1452	NMG_LOCK();
1453	if (req->nr_mem_id) {
1454		nmd = netmap_mem_find(req->nr_mem_id);
1455		if (nmd == NULL) {
1456			error = EINVAL;
1457			goto err_1;
1458		}
1459	}
1460	/* netmap_vp_create creates a struct netmap_vp_adapter */
1461	error = netmap_vale_vp_create(hdr, ifp, nmd, &vpna);
1462	if (error) {
1463		if (netmap_debug & NM_DEBUG_VALE)
1464			nm_prerr("error %d", error);
1465		goto err_1;
1466	}
1467	/* persist-specific routines */
1468	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
1469	if (!autodelete) {
1470		netmap_adapter_get(&vpna->up);
1471	} else {
1472		vpna->autodelete = 1;
1473	}
1474	NM_ATTACH_NA(ifp, &vpna->up);
1475	/* return the updated info */
1476	error = nm_update_info(req, &vpna->up);
1477	if (error) {
1478		goto err_2;
1479	}
1480	nm_prdis("returning nr_mem_id %d", req->nr_mem_id);
1481	if (nmd)
1482		netmap_mem_put(nmd);
1483	NMG_UNLOCK();
1484	nm_prdis("created %s", if_name(ifp));
1485	return 0;
1486
1487err_2:
1488	netmap_detach(ifp);
1489err_1:
1490	if (nmd)
1491		netmap_mem_put(nmd);
1492	NMG_UNLOCK();
1493	nm_os_vi_detach(ifp);
1494
1495	return error;
1496}
1497
1498#endif /* WITH_VALE */
1499