/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (C) 2013-2016 Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>
#include <sys/smp.h>


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#elif defined(_WIN32)
#include "win_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>
#include <dev/netmap/netmap_bdg.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_BDG_NAME	prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES	max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use a per-queue lock instead of the core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NETMAP_MAX_FRAGS)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 */
static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSBEGIN(vars_vale);
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
		"Max batch size to be used in the bridge");
SYSEND;

static int netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *,
		struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
static int netmap_vale_vp_bdg_attach(const char *, struct netmap_adapter *,
		struct nm_bridge *);
static int netmap_vale_bwrap_attach(const char *, struct netmap_adapter *);

/*
 * For each output interface, nm_vale_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_vale_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* Holds the default callbacks */
struct netmap_bdg_ops vale_bdg_ops = {
	.lookup = netmap_vale_learning,
	.config = NULL,
	.dtor = NULL,
	.vp_create = netmap_vale_vp_create,
	.bwrap_attach = netmap_vale_bwrap_attach,
	.name = NM_BDG_NAME,
};

/*
 * This is a slightly optimized copy routine which rounds
 * to a multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non-overlapping.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		memcpy(dst, src, l);
		return;
	}
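	/* manually unrolled loop: each iteration copies one 64-byte
	 * chunk as eight 8-byte words
	 */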
	for (; likely(l > 0); l-=64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring **kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i]->nkr_ft) {
			nm_os_free(kring[i]->nkr_ft);
			kring[i]->nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring **kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_vale_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
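	/*
	 * Layout of each per-ring scratch area (a single allocation of
	 * l bytes; nm_vale_flush() recovers the sub-arrays with the same
	 * pointer arithmetic):
	 *
	 *	ft[NM_BDG_BATCH_MAX]      forwarding slots (nm_bdg_fwd)
	 *	dstq[num_dstq]            per-destination queues (nm_vale_q)
	 *	dsts[NM_BDG_BATCH_MAX]    destination indexes (uint16_t)
	 */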

	nrings = netmap_real_rings(na, NR_TX);
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_vale_q *dstq;
		int j;

		ft = nm_os_malloc(l);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i]->nkr_ft = ft;
	}
	return 0;
}

/* Allows external modules to create bridges in exclusive mode;
 * returns an authentication token that the external module will need
 * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(),
 * and nm_bdg_update_private_data() operations.
 * The call has succeeded if the returned pointer is != NULL and
 * *return_status == 0.
 */
void *
netmap_vale_create(const char *bdg_name, int *return_status)
{
	struct nm_bridge *b = NULL;
	void *ret = NULL;

	NMG_LOCK();
	b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
	if (b) {
		*return_status = EEXIST;
		goto unlock_bdg_create;
	}

	b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops);
	if (!b) {
		*return_status = ENOMEM;
		goto unlock_bdg_create;
	}

	b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE;
	ret = nm_bdg_get_auth_token(b);
	*return_status = 0;

unlock_bdg_create:
	NMG_UNLOCK();
	return ret;
}

/* Allows external modules to destroy a bridge created through
 * netmap_vale_create(); the bridge must be empty.
 */
int
netmap_vale_destroy(const char *bdg_name, void *auth_token)
{
	struct nm_bridge *b = NULL;
	int ret = 0;

	NMG_LOCK();
	b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
	if (!b) {
		ret = ENXIO;
		goto unlock_bdg_free;
	}

	if (!nm_bdg_valid_auth_token(b, auth_token)) {
		ret = EACCES;
		goto unlock_bdg_free;
	}
	if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) {
		ret = EINVAL;
		goto unlock_bdg_free;
	}

	b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE);
	ret = netmap_bdg_free(b);
	if (ret) {
		b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE;
	}

unlock_bdg_free:
	NMG_UNLOCK();
	return ret;
}
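
/*
 * Sketch of the expected calling sequence for an external module
 * (hedged example: the bridge name "valeX:" and the error handling
 * are made up for illustration):
 *
 *	int status;
 *	void *token = netmap_vale_create("valeX:", &status);
 *	if (token == NULL || status != 0)
 *		return status;
 *	... use nm_bdg_ctl_attach()/detach() etc. with 'token' ...
 *	netmap_vale_destroy("valeX:", token);
 */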

/* Process NETMAP_REQ_VALE_LIST. */
int
netmap_vale_list(struct nmreq_header *hdr)
{
	struct nmreq_vale_list *req =
		(struct nmreq_vale_list *)(uintptr_t)hdr->nr_body;
	int namelen = strlen(hdr->nr_name);
	struct nm_bridge *b, *bridges;
	struct netmap_vp_adapter *vpna;
	int error = 0, i, j;
	u_int num_bridges;

	netmap_bns_getbridges(&bridges, &num_bridges);

	/* this is used to enumerate bridges and ports */
	if (namelen) { /* look up indexes of bridge and port */
		if (strncmp(hdr->nr_name, NM_BDG_NAME,
					strlen(NM_BDG_NAME))) {
			return EINVAL;
		}
		NMG_LOCK();
		b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
		if (!b) {
			NMG_UNLOCK();
			return ENOENT;
		}

		req->nr_bridge_idx = b - bridges; /* bridge index */
		req->nr_port_idx = NM_BDG_NOPORT;
		for (j = 0; j < b->bdg_active_ports; j++) {
			i = b->bdg_port_index[j];
			vpna = b->bdg_ports[i];
			if (vpna == NULL) {
				nm_prerr("This should not happen");
				continue;
			}
			/* the former and the latter identify a
			 * virtual port and a NIC, respectively
			 */
			if (!strcmp(vpna->up.name, hdr->nr_name)) {
				req->nr_port_idx = i; /* port index */
				break;
			}
		}
		NMG_UNLOCK();
	} else {
		/* return the first non-empty entry starting from
		 * bridge nr_bridge_idx and port nr_port_idx.
		 *
		 * Users can detect the end of the current bridge by
		 * comparing the old and new value of nr_bridge_idx, and
		 * the end of all the bridges by error != 0
		 */
		i = req->nr_bridge_idx;
		j = req->nr_port_idx;

		NMG_LOCK();
		for (error = ENOENT; i < NM_BRIDGES; i++) {
			b = bridges + i;
			for ( ; j < NM_BDG_MAXPORTS; j++) {
				if (b->bdg_ports[j] == NULL)
					continue;
				vpna = b->bdg_ports[j];
				/* write back the VALE switch name */
				strlcpy(hdr->nr_name, vpna->up.name,
					sizeof(hdr->nr_name));
				error = 0;
				goto out;
			}
			j = 0; /* following bridges scan from 0 */
		}
	out:
		req->nr_bridge_idx = i;
		req->nr_port_idx = j;
		NMG_UNLOCK();
	}

	return error;
}
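
/*
 * Userspace view of the enumeration above (informal sketch): issue
 * NETMAP_REQ_VALE_LIST with an empty name and nr_bridge_idx =
 * nr_port_idx = 0; after each reply, resume from the returned indexes
 * with nr_port_idx incremented by one; a non-zero error marks the end
 * of the scan.
 */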

/* Process NETMAP_REQ_VALE_ATTACH.
 */
int
netmap_vale_attach(struct nmreq_header *hdr, void *auth_token)
{
	struct nmreq_vale_attach *req =
		(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na = NULL;
	struct netmap_mem_d *nmd = NULL;
	struct nm_bridge *b = NULL;
	int error;

	NMG_LOCK();
	/* permission check for modified bridges */
	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
		error = EACCES;
		goto unlock_exit;
	}

	if (req->reg.nr_mem_id) {
		nmd = netmap_mem_find(req->reg.nr_mem_id);
		if (nmd == NULL) {
			error = EINVAL;
			goto unlock_exit;
		}
	}

	/* check for existing one */
	error = netmap_get_vale_na(hdr, &na, nmd, 0);
	if (na) {
		error = EBUSY;
		goto unref_exit;
	}
	error = netmap_get_vale_na(hdr, &na,
				nmd, 1 /* create if not exists */);
	if (error) { /* no device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (NETMAP_OWNED_BY_ANY(na)) {
		error = EBUSY;
		goto unref_exit;
	}

	if (na->nm_bdg_ctl) {
		/* nop for VALE ports. The bwrap needs to put the hwna
		 * in netmap mode (see netmap_bwrap_bdg_ctl)
		 */
		error = na->nm_bdg_ctl(hdr, na);
		if (error)
			goto unref_exit;
		nm_prdis("registered %s to netmap-mode", na->name);
	}
	vpna = (struct netmap_vp_adapter *)na;
	req->port_index = vpna->bdg_port;

	if (nmd)
		netmap_mem_put(nmd);

	NMG_UNLOCK();
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	if (nmd)
		netmap_mem_put(nmd);

	NMG_UNLOCK();
	return error;
}

/* Process NETMAP_REQ_VALE_DETACH.
 */
int
netmap_vale_detach(struct nmreq_header *hdr, void *auth_token)
{
	struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body;
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	struct nm_bridge *b = NULL;
	int error;

	NMG_LOCK();
	/* permission check for modified bridges */
	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
		error = EACCES;
		goto unlock_exit;
	}

	error = netmap_get_vale_na(hdr, &na, NULL, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	} else if (nm_is_bwrap(na) &&
		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
		/* Don't detach a NIC with polling */
		error = EBUSY;
		goto unref_exit;
	}

	vpna = (struct netmap_vp_adapter *)na;
	if (na->na_vp != vpna) {
		/* trying to detach the first attachment of a persistent
		 * VALE port that is attached to two bridges
		 */
		error = EBUSY;
		goto unref_exit;
	}
	nmreq_det->port_index = vpna->bdg_port;

	if (na->nm_bdg_ctl) {
		/* remove the port from bridge. The bwrap
		 * also needs to put the hwna in normal mode
		 */
		error = na->nm_bdg_ctl(hdr, na);
	}

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}


/* nm_dtor callback for ephemeral VALE ports */
static void
netmap_vale_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;

	nm_prdis("%s has %d references", na->name, na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}

	if (na->ifp != NULL && !nm_iszombie(na)) {
		NM_DETACH_NA(na->ifp);
		if (vpna->autodelete) {
			nm_prdis("releasing %s", na->ifp->if_xname);
			NMG_UNLOCK();
			nm_os_vi_detach(na->ifp);
			NMG_LOCK();
		}
	}
}



/* nm_krings_create callback for VALE ports.
 * Calls the standard netmap_krings_create, then adds leases on rx
 * rings and bdgfwd on tx rings.
 */
static int
netmap_vale_vp_krings_create(struct netmap_adapter *na)
{
	u_int tailroom;
	int error, i;
	uint32_t *leases;
	u_int nrx = netmap_real_rings(na, NR_RX);

	/*
	 * Leases are attached to RX rings on vale ports:
	 * one uint32_t entry per slot, on each receive ring.
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i]->nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


/* nm_krings_delete callback for VALE ports. */
static void
netmap_vale_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_vale_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * Main dispatch routine for the bridge.
 * Grab packets from a kring and move them into the ft structure
 * associated with the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_vale_preflush(struct netmap_kring *kring, u_int end)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter*)kring->na;
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int ring_nr = kring->ring_id;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	nm_prdis("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return j;
	nm_prdis(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;
		ft[ft_i].ft_offset = 0;

		nm_prdis("flags is 0x%x", slot->flags);
		/* we do not use the buf changed flag, but we still need to reset it */
		slot->flags &= ~NS_BUF_CHANGED;

		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
		if (unlikely(buf == NULL)) {
			nm_prlim(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
				kring->name, j, ft[ft_i].ft_len);
			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
			ft[ft_i].ft_len = 0;
			ft[ft_i].ft_flags = 0;
		}
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			nm_prlim(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		/* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
		 * have to fix the frags count. */
		frags--;
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags;
		nm_prlim(5, "Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
	}
	if (ft_i)
		ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)                                                    \
do {                                                                    \
	a -= b; a -= c; a ^= (c >> 13);                                 \
	b -= c; b -= a; b ^= (a << 8);                                  \
	c -= a; c -= b; c ^= (b >> 13);                                 \
	a -= b; a -= c; a ^= (c >> 12);                                 \
	b -= c; b -= a; b ^= (a << 16);                                 \
	c -= a; c -= b; c ^= (b >> 5);                                  \
	a -= b; a -= c; a ^= (c >> 3);                                  \
	b -= c; b -= a; b ^= (a << 10);                                 \
	c -= a; c -= b; c ^= (b >> 15);                                 \
} while (/*CONSTCOND*/0)


static __inline uint32_t
nm_vale_rthash(const uint8_t *addr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

	b += addr[5] << 8;
	b += addr[4];
	a += addr[3] << 24;
	a += addr[2] << 16;
	a += addr[1] << 8;
	a += addr[0];

	mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
	return (c & BRIDGE_RTHASH_MASK);
}

#undef mix


/*
 * Lookup function for a learning bridge.
 * Update the hash table with the source address, then return
 * the destination port index, with the ring in *dst_ring
 * (at the moment we always use ring 0).
 */
uint32_t
netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
		struct netmap_vp_adapter *na, void *private_data)
{
	uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset;
	u_int buf_len = ft->ft_len - ft->ft_offset;
	struct nm_hash_ent *ht = private_data;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;
	uint8_t indbuf[12];

	if (buf_len < 14) {
		return NM_BDG_NOPORT;
	}

	if (ft->ft_flags & NS_INDIRECT) {
		if (copyin(buf, indbuf, sizeof(indbuf))) {
			return NM_BDG_NOPORT;
		}
		buf = indbuf;
	}

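	/* Extract both MACs with two (possibly unaligned) little-endian
	 * 64-bit loads: bytes 0..5, masked down to 48 bits, are the
	 * destination MAC; the load at offset 4, shifted right by 16,
	 * keeps bytes 6..11, the source MAC.
	 */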
	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive; there might be some
	 * worthwhile optimizations here.
	 */
	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_vale_rthash(s); /* hash of source */
		/* update source port forwarding entry */
		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_debug & NM_DEBUG_VALE)
		    nm_prinf("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_vale_rthash(buf); /* hash of dst */
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
	}
	return dst;
}


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
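		/* slots between nr_hwcur and nkr_hwlease are busy (leased);
		 * one slot is always left empty, so that a full ring can be
		 * distinguished from an empty one
		 */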
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_tail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		nm_prerr("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
		    k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
		    k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}




/* Make a lease on the kring for n positions; return the
 * lease index.
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

#ifdef CONFIG_NETMAP_DEBUG
	if (n > nm_kr_space(k, is_rx)) {
		nm_prerr("invalid request for %d slots", n);
		panic("x");
	}
#endif /* CONFIG_NETMAP_DEBUG */
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

#ifdef CONFIG_NETMAP_DEBUG
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		nm_prerr("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->name,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif /* CONFIG_NETMAP_DEBUG */
	return lease_idx;
}

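/*
 * Note on the lease protocol: a writer completes its lease in
 * nm_vale_flush() by storing the final write position into
 * nkr_leases[lease_idx]; once all earlier leases have completed,
 * nr_hwtail is advanced and the receiver is notified.
 */
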
/*
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
		u_int ring_nr)
{
	struct nm_vale_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, me = na->bdg_port;

	/*
	 * The work area (pointed to by ft) is followed by an array of
	 * queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_vale_q *d;
		struct nm_bdg_fwd *start_ft = NULL;

		nm_prdis("slot %d frags %d", i, ft[i].ft_frags);

		if (na->up.virt_hdr_len < ft[i].ft_len) {
			ft[i].ft_offset = na->up.virt_hdr_len;
			start_ft = &ft[i];
		} else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) {
			ft[i].ft_offset = ft[i].ft_len;
			start_ft = &ft[i+1];
		} else {
			/* Drop the packet if the virtio-net header is not
			 * contained in the first fragment nor at the very
			 * beginning of the second.
			 */
			continue;
		}
		dst_port = b->bdg_ops.lookup(start_ft, &dst_ring, na, b->private_data);
		if (netmap_verbose > 255)
			nm_prlim(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port >= NM_BDG_NOPORT)
			continue; /* the lookup function asked us to drop this packet */
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		u_int j;
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	nm_prdis(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations */
	for (i = 0; i < num_dsts; i++) {
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_vale_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		nm_prdis("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!nm_netmap_on(&dst_na->up))) {
			nm_prdis("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a
		 * chance that we may not use all of the slots we have
		 * claimed, and we will need to handle the leftover ones
		 * when we regain the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
			if (netmap_verbose) {
				nm_prlim(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
						dst_na->up.virt_hdr_len);
			}
			/* There is a virtio-net header/offloadings mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * na->mfs
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
				needed = (needed * na->mfs) /
						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				nm_prdis(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}

		nm_prdis(5, "pass 2 dst %d is %x %s",
			i, d_i, nm_is_bwrap(&dst_na->up) ? "nic/host" : "virtual");
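		/* fold ring ids beyond the destination's ring count
		 * back into range
		 */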
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		/* the destination ring may not have been opened for RX */
		if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
			goto cleanup;
		lim = kring->nkr_num_slots - 1;

retry:

		if (dst_na->retry && retry) {
			/* try to get some free slot from the previous run */
			kring->nm_notify(kring, NAF_FORCE_RECLAIM);
			/* actually useful only for bwraps, since there
			 * the notify will trigger a txsync on the hwna. VALE ports
			 * have dst_na->retry == 0
			 */
		}
		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;

			/* find the queue from which we pick next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
			    break; /* no more space */
			if (netmap_verbose && cnt > 1)
				nm_prlim(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			if (unlikely(virt_hdr_mismatch)) {
				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
			} else {
				howmany -= cnt;
				do {
					char *dst, *src = ft_p->ft_buf;
					size_t copy_len = ft_p->ft_len, dst_len = copy_len;

					slot = &ring->slot[j];
					dst = NMB(&dst_na->up, slot);

					nm_prdis("send [%d] %d(%d) bytes at %s:%d",
							i, (int)copy_len, (int)dst_len,
							dst_na->up.name, j);
					/* round to a multiple of 64 */
					copy_len = (copy_len + 63) & ~63;

					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
						     copy_len > NETMAP_BUF_SIZE(&na->up))) {
						nm_prlim(5, "invalid len %d, down to 64", (int)copy_len);
						copy_len = dst_len = 64; // XXX
					}
					if (ft_p->ft_flags & NS_INDIRECT) {
						if (copyin(src, dst, copy_len)) {
							// invalid user pointer, pretend len is 0
							dst_len = 0;
						}
					} else {
						//memcpy(dst, src, copy_len);
						pkt_copy(src, dst, (int)copy_len);
					}
					slot->len = dst_len;
					slot->flags = (cnt << 8) | NS_MOREFRAG;
					j = nm_next(j, lim);
					needed--;
					ft_p++;
				} while (ft_p != ft_end);
				slot->flags = (cnt << 8); /* clear flag on last entry */
			}
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
		    /* current position */
		    uint32_t *p = kring->nkr_leases; /* shorthand */
		    uint32_t update_pos;
		    int still_locked = 1;

		    mtx_lock(&kring->q_lock);
		    if (unlikely(howmany > 0)) {
			/* we have not used all the buffers. If I am the
			 * last one I can recover the slots, otherwise I
			 * must fill them with 0 to mark empty packets.
			 */
			nm_prdis("leftover %d bufs", howmany);
			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
			    /* yes, I am the last one */
			    nm_prdis("roll back nkr_hwlease to %d", j);
			    kring->nkr_hwlease = j;
			} else {
			    while (howmany-- > 0) {
				ring->slot[j].len = 0;
				ring->slot[j].flags = 0;
				j = nm_next(j, lim);
			    }
			}
		    }
		    p[lease_idx] = j; /* report I am done */

		    update_pos = kring->nr_hwtail;

		    if (my_start == update_pos) {
			/* all slots before my_start have been reported,
			 * so scan subsequent leases to see if other ranges
			 * have been completed, and do a selwakeup or txsync.
			 */
			while (lease_idx != kring->nkr_lease_idx &&
				p[lease_idx] != NR_NOSLOT) {
			    j = p[lease_idx];
			    p[lease_idx] = NR_NOSLOT;
			    lease_idx = nm_next(lease_idx, lim);
			}
			/* j is the new 'write' position. j != my_start
			 * means there are new buffers to report
			 */
			if (likely(j != my_start)) {
				kring->nr_hwtail = j;
				still_locked = 0;
				mtx_unlock(&kring->q_lock);
				kring->nm_notify(kring, 0);
				/* this is netmap_notify for VALE ports and
				 * netmap_bwrap_notify for bwrap. The latter will
				 * trigger a txsync on the underlying hwna
				 */
				if (dst_na->retry && retry--) {
					/* XXX this is going to call nm_notify again.
					 * Only useful for bwrap in virtual machines
					 */
					goto retry;
				}
			}
		    }
		    if (still_locked)
			mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}

/* nm_txsync callback for VALE ports */
static int
netmap_vale_vp_txsync(struct netmap_kring *kring, int flags)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter *)kring->na;
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;

	if (bridge_batch <= 0) { /* testing only */
		done = head; // used all
		goto done;
	}
	if (!na->na_bdg) {
		done = head;
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_vale_preflush(kring, head);
done:
	if (done != head)
		nm_prerr("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'cur' are left unsent.
	 */
	kring->nr_hwcur = done;
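	/* on a VALE port the ring is drained synchronously by the flush
	 * above, so hwtail is kept one slot behind hwcur: the whole ring
	 * (minus the mandatory empty slot) is again available to the
	 * sender
	 */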
	kring->nr_hwtail = nm_prev(done, lim);
	if (netmap_debug & NM_DEBUG_TXSYNC)
		nm_prinf("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
	return 0;
}


/* Create a netmap_vp_adapter that describes a VALE port.
 * Only persistent VALE ports have a non-null ifp.
 */
static int
netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *ifp,
		struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret)
{
	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	int error = 0;
	u_int npipes = 0;
	u_int extrabufs = 0;

	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
		return EINVAL;
	}

	vpna = nm_os_malloc(sizeof(*vpna));
	if (vpna == NULL)
		return ENOMEM;

	na = &vpna->up;

	na->ifp = ifp;
	strlcpy(na->name, hdr->nr_name, sizeof(na->name));

	/* bound checking */
	na->num_tx_rings = req->nr_tx_rings;
	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	req->nr_tx_rings = na->num_tx_rings; /* write back */
	na->num_rx_rings = req->nr_rx_rings;
	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	req->nr_rx_rings = na->num_rx_rings; /* write back */
	nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	na->num_tx_desc = req->nr_tx_slots;
	nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	/* validate number of pipes. We want at least 1,
	 * but probably can do with some more.
	 * So let's use 2 as default (when 0 is supplied)
	 */
	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
	/* validate extra bufs */
	extrabufs = req->nr_extra_bufs;
	nm_bound_var(&extrabufs, 0, 0,
			128*NM_BDG_MAXSLOTS, NULL);
	req->nr_extra_bufs = extrabufs; /* write back */
	na->num_rx_desc = req->nr_rx_slots;
	/* Set the mfs to a default value, as it is needed on the VALE
	 * mismatch datapath. XXX We should set it according to the MTU
	 * known to the kernel. */
	vpna->mfs = NM_BDG_MFS_DEFAULT;
	vpna->last_smac = ~0llu;
	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
		vpna->mfs = netmap_buf_size; */
	if (netmap_verbose)
		nm_prinf("max frame size %u", vpna->mfs);

	na->na_flags |= NAF_BDG_MAYSLEEP;
	/* persistent VALE ports look like hw devices
	 * with a native netmap adapter
	 */
	if (ifp)
		na->na_flags |= NAF_NATIVE;
	na->nm_txsync = netmap_vale_vp_txsync;
	na->nm_rxsync = netmap_vp_rxsync; /* use the one provided by bdg */
	na->nm_register = netmap_vp_reg;  /* use the one provided by bdg */
	na->nm_krings_create = netmap_vale_vp_krings_create;
	na->nm_krings_delete = netmap_vale_vp_krings_delete;
	na->nm_dtor = netmap_vale_vp_dtor;
	nm_prdis("nr_mem_id %d", req->nr_mem_id);
	na->nm_mem = nmd ?
		netmap_mem_get(nmd):
		netmap_mem_private_new(
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc,
			req->nr_extra_bufs, npipes, &error);
	if (na->nm_mem == NULL)
		goto err;
	na->nm_bdg_attach = netmap_vale_vp_bdg_attach;
	/* other nmd fields are set in the common routine */
	error = netmap_attach_common(na);
	if (error)
		goto err;
	*ret = vpna;
	return 0;

err:
	if (na->nm_mem != NULL)
		netmap_mem_put(na->nm_mem);
	nm_os_free(vpna);
	return error;
}

/* nm_bdg_attach callback for VALE ports
 * The na_vp port is this same netmap_adapter. There is no host port.
 */
static int
netmap_vale_vp_bdg_attach(const char *name, struct netmap_adapter *na,
		struct nm_bridge *b)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;

	if ((b->bdg_flags & NM_BDG_NEED_BWRAP) || vpna->na_bdg) {
		return NM_NEED_BWRAP;
	}
	na->na_vp = vpna;
	strlcpy(na->name, name, sizeof(na->name));
	na->na_hostvp = NULL;
	return 0;
}

static int
netmap_vale_bwrap_krings_create(struct netmap_adapter *na)
{
	int error;

	/* impersonate a netmap_vp_adapter */
	error = netmap_vale_vp_krings_create(na);
	if (error)
		return error;
	error = netmap_bwrap_krings_create_common(na);
	if (error) {
		netmap_vale_vp_krings_delete(na);
	}
	return error;
}

static void
netmap_vale_bwrap_krings_delete(struct netmap_adapter *na)
{
	netmap_bwrap_krings_delete_common(na);
	netmap_vale_vp_krings_delete(na);
}

static int
netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
{
	struct netmap_bwrap_adapter *bna;
	struct netmap_adapter *na = NULL;
	struct netmap_adapter *hostna = NULL;
	int error;

	bna = nm_os_malloc(sizeof(*bna));
	if (bna == NULL) {
		return ENOMEM;
	}
	na = &bna->up.up;
	strlcpy(na->name, nr_name, sizeof(na->name));
	na->nm_register = netmap_bwrap_reg;
	na->nm_txsync = netmap_vale_vp_txsync;
	// na->nm_rxsync = netmap_bwrap_rxsync;
	na->nm_krings_create = netmap_vale_bwrap_krings_create;
	na->nm_krings_delete = netmap_vale_bwrap_krings_delete;
	na->nm_notify = netmap_bwrap_notify;
	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
	/* Set the mfs, needed on the VALE mismatch datapath. */
	bna->up.mfs = NM_BDG_MFS_DEFAULT;

	if (hwna->na_flags & NAF_HOST_RINGS) {
		hostna = &bna->host.up;
		hostna->nm_notify = netmap_bwrap_notify;
		bna->host.mfs = NM_BDG_MFS_DEFAULT;
	}

	error = netmap_bwrap_attach_common(na, hwna);
	if (error) {
		nm_os_free(bna);
	}
	return error;
}

int
netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na,
		struct netmap_mem_d *nmd, int create)
{
	return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops);
}


/* creates a persistent VALE port */
int
nm_vi_create(struct nmreq_header *hdr)
{
	struct nmreq_vale_newif *req =
		(struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body;
	int error = 0;
	/* Build a nmreq_register out of the nmreq_vale_newif,
	 * so that we can call netmap_get_bdg_na(). */
	struct nmreq_register regreq;
	bzero(&regreq, sizeof(regreq));
	regreq.nr_tx_slots = req->nr_tx_slots;
	regreq.nr_rx_slots = req->nr_rx_slots;
	regreq.nr_tx_rings = req->nr_tx_rings;
	regreq.nr_rx_rings = req->nr_rx_rings;
	regreq.nr_mem_id = req->nr_mem_id;
	hdr->nr_reqtype = NETMAP_REQ_REGISTER;
	hdr->nr_body = (uintptr_t)&regreq;
	error = netmap_vi_create(hdr, 0 /* no autodelete */);
	hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF;
	hdr->nr_body = (uintptr_t)req;
	/* Write back to the original struct. */
	req->nr_tx_slots = regreq.nr_tx_slots;
	req->nr_rx_slots = regreq.nr_rx_slots;
	req->nr_tx_rings = regreq.nr_tx_rings;
	req->nr_rx_rings = regreq.nr_rx_rings;
	req->nr_mem_id = regreq.nr_mem_id;
	return error;
}
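
/* Userspace reaches nm_vi_create() through a NETMAP_REQ_VALE_NEWIF
 * request; for instance, the vale-ctl tool distributed with netmap
 * issues it to create a persistent port (a hedged note: the exact
 * tool name and flags may vary across netmap versions).
 */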

/* remove a persistent VALE port from the system */
int
nm_vi_destroy(const char *name)
{
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	int error;

	ifp = ifunit_ref(name);
	if (!ifp)
		return ENXIO;
	NMG_LOCK();
	/* make sure this is actually a VALE port */
	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
		error = EINVAL;
		goto err;
	}

	vpna = (struct netmap_vp_adapter *)NA(ifp);

	/* we can only destroy ports that were created via NETMAP_REQ_VALE_NEWIF */
	if (vpna->autodelete) {
		error = EINVAL;
		goto err;
	}

	/* also make sure that nobody is using the interface */
	if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
	    vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
		error = EBUSY;
		goto err;
	}

	NMG_UNLOCK();

	if (netmap_verbose)
		nm_prinf("destroying a persistent vale interface %s", ifp->if_xname);
	/* Linux requires that all references be released
	 * before unregistering
	 */
	netmap_detach(ifp);
	if_rele(ifp);
	nm_os_vi_detach(ifp);
	return 0;

err:
	NMG_UNLOCK();
	if_rele(ifp);
	return error;
}

static int
nm_update_info(struct nmreq_register *req, struct netmap_adapter *na)
{
	req->nr_rx_rings = na->num_rx_rings;
	req->nr_tx_rings = na->num_tx_rings;
	req->nr_rx_slots = na->num_rx_desc;
	req->nr_tx_slots = na->num_tx_desc;
	return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL,
					&req->nr_mem_id);
}


/*
 * Create a virtual interface registered to the system.
 * The interface will be attached to a bridge later.
 */
int
netmap_vi_create(struct nmreq_header *hdr, int autodelete)
{
	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	struct netmap_mem_d *nmd = NULL;
	int error;

	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
		return EINVAL;
	}

	/* the name must not include the VALE prefix */
	if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
		return EINVAL;
	if (strlen(hdr->nr_name) >= IFNAMSIZ) {
		return EINVAL;
	}
	ifp = ifunit_ref(hdr->nr_name);
	if (ifp) { /* already exists, cannot create a new one */
		error = EEXIST;
		NMG_LOCK();
		if (NM_NA_VALID(ifp)) {
			int update_err = nm_update_info(req, NA(ifp));
			if (update_err)
				error = update_err;
		}
		NMG_UNLOCK();
		if_rele(ifp);
		return error;
	}
	error = nm_os_vi_persist(hdr->nr_name, &ifp);
	if (error)
		return error;

	NMG_LOCK();
	if (req->nr_mem_id) {
		nmd = netmap_mem_find(req->nr_mem_id);
		if (nmd == NULL) {
			error = EINVAL;
			goto err_1;
		}
	}
	/* netmap_vale_vp_create creates a struct netmap_vp_adapter */
	error = netmap_vale_vp_create(hdr, ifp, nmd, &vpna);
	if (error) {
		if (netmap_debug & NM_DEBUG_VALE)
			nm_prerr("error %d", error);
		goto err_1;
	}
	/* persist-specific routines */
	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
	if (!autodelete) {
		netmap_adapter_get(&vpna->up);
	} else {
		vpna->autodelete = 1;
	}
	NM_ATTACH_NA(ifp, &vpna->up);
	/* return the updated info */
	error = nm_update_info(req, &vpna->up);
	if (error) {
		goto err_2;
	}
	nm_prdis("returning nr_mem_id %d", req->nr_mem_id);
	if (nmd)
		netmap_mem_put(nmd);
	NMG_UNLOCK();
	nm_prdis("created %s", ifp->if_xname);
	return 0;

err_2:
	netmap_detach(ifp);
err_1:
	if (nmd)
		netmap_mem_put(nmd);
	NMG_UNLOCK();
	nm_os_vi_detach(ifp);

	return error;
}

#endif /* WITH_VALE */