/*
 * Copyright (C) 2013-2016 Università di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD: stable/11/sys/dev/netmap/netmap_vale.c 344047 2019-02-12 09:26:05Z vmaffione $");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>
#include <sys/smp.h>


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#elif defined(_WIN32)
#include "win_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>
#include <dev/netmap/netmap_bdg.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_BDG_NAME	prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES	max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue locks instead of the core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
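/*
 * For example, "vale0:1" is port "1" of switch "vale0", while
 * "vale0:em0" (assuming a NIC named "em0" exists) connects that
 * physical interface to the switch.
 */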
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NETMAP_MAX_FRAGS)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 */
static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSBEGIN(vars_vale);
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
		"Max batch size to be used in the bridge");
SYSEND;

static int netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *,
		struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
static int netmap_vale_vp_bdg_attach(const char *, struct netmap_adapter *,
		struct nm_bridge *);
static int netmap_vale_bwrap_attach(const char *, struct netmap_adapter *);

/*
 * For each output interface, nm_vale_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_vale_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};
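/*
 * Note that the queue is threaded through the forwarding table itself:
 * bq_head and bq_tail are indexes into the ft array, and each entry is
 * linked to the next one through its ft_next field, with NM_FT_NULL
 * acting as the list terminator.
 */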

/* Holds the default callbacks */
struct netmap_bdg_ops vale_bdg_ops = {
	.lookup = netmap_vale_learning,
	.config = NULL,
	.dtor = NULL,
	.vp_create = netmap_vale_vp_create,
	.bwrap_attach = netmap_vale_bwrap_attach,
	.name = NM_BDG_NAME,
};
/*
 * this is a slightly optimized copy routine which copies in 64-byte
 * chunks (effectively rounding the length up to a multiple of 64
 * bytes) and is often faster than dealing with other odd sizes.
 * We assume there is enough room in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non-overlapping.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		memcpy(dst, src, l);
		return;
	}
	for (; likely(l > 0); l-=64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}
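/*
 * Callers are expected to do the rounding themselves before invoking
 * pkt_copy(), e.g. as nm_vale_flush() does below:
 *
 *	copy_len = (copy_len + 63) & ~63;	// round up to 64 bytes
 *	pkt_copy(src, dst, (int)copy_len);
 */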


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring **kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i]->nkr_ft) {
			nm_os_free(kring[i]->nkr_ft);
			kring[i]->nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring **kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_vale_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = netmap_real_rings(na, NR_TX);
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_vale_q *dstq;
		int j;

		ft = nm_os_malloc(l);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i]->nkr_ft = ft;
	}
	return 0;
}

/* Allows external modules to create bridges in exclusive mode,
 * returns an authentication token that the external module will need
 * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(),
 * and nm_bdg_update_private_data() operations.
 * Successfully executed if ret != NULL and *return_status == 0.
 */
void *
netmap_vale_create(const char *bdg_name, int *return_status)
{
	struct nm_bridge *b = NULL;
	void *ret = NULL;

	NMG_LOCK();
	b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
	if (b) {
		*return_status = EEXIST;
		goto unlock_bdg_create;
	}

	b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops);
	if (!b) {
		*return_status = ENOMEM;
		goto unlock_bdg_create;
	}

	b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE;
	ret = nm_bdg_get_auth_token(b);
	*return_status = 0;

unlock_bdg_create:
	NMG_UNLOCK();
	return ret;
}

/* Allows external modules to destroy a bridge created through
 * netmap_vale_create(); the bridge must be empty.
 */
int
netmap_vale_destroy(const char *bdg_name, void *auth_token)
{
	struct nm_bridge *b = NULL;
	int ret = 0;

	NMG_LOCK();
	b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
	if (!b) {
		ret = ENXIO;
		goto unlock_bdg_free;
	}

	if (!nm_bdg_valid_auth_token(b, auth_token)) {
		ret = EACCES;
		goto unlock_bdg_free;
	}
	if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) {
		ret = EINVAL;
		goto unlock_bdg_free;
	}

	b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE);
	ret = netmap_bdg_free(b);
	if (ret) {
		b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE;
	}

unlock_bdg_free:
	NMG_UNLOCK();
	return ret;
}
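/*
 * A minimal usage sketch for an external module (the bridge name is
 * illustrative and error handling is omitted):
 *
 *	int status;
 *	void *token = netmap_vale_create("vale0", &status);
 *	if (token == NULL)
 *		return status;	// e.g. EEXIST or ENOMEM
 *	// ... attach/detach ports, providing 'token' ...
 *	status = netmap_vale_destroy("vale0", token);
 */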

/* Process NETMAP_REQ_VALE_LIST. */
int
netmap_vale_list(struct nmreq_header *hdr)
{
	struct nmreq_vale_list *req =
		(struct nmreq_vale_list *)(uintptr_t)hdr->nr_body;
	int namelen = strlen(hdr->nr_name);
	struct nm_bridge *b, *bridges;
	struct netmap_vp_adapter *vpna;
	int error = 0, i, j;
	u_int num_bridges;

	netmap_bns_getbridges(&bridges, &num_bridges);

	/* this is used to enumerate bridges and ports */
	if (namelen) { /* look up indexes of bridge and port */
		if (strncmp(hdr->nr_name, NM_BDG_NAME,
					strlen(NM_BDG_NAME))) {
			return EINVAL;
		}
		NMG_LOCK();
		b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
		if (!b) {
			NMG_UNLOCK();
			return ENOENT;
		}

		req->nr_bridge_idx = b - bridges; /* bridge index */
		req->nr_port_idx = NM_BDG_NOPORT;
		for (j = 0; j < b->bdg_active_ports; j++) {
			i = b->bdg_port_index[j];
			vpna = b->bdg_ports[i];
			if (vpna == NULL) {
				nm_prerr("This should not happen");
				continue;
			}
			/* the former and the latter identify a
			 * virtual port and a NIC, respectively
			 */
			if (!strcmp(vpna->up.name, hdr->nr_name)) {
				req->nr_port_idx = i; /* port index */
				break;
			}
		}
		NMG_UNLOCK();
	} else {
		/* return the first non-empty entry starting from
		 * bridge nr_bridge_idx and port nr_port_idx.
		 *
		 * Users can detect the end of the current bridge by
		 * comparing the old and new values of nr_bridge_idx, and
		 * the end of all the bridges by error != 0
		 */
		i = req->nr_bridge_idx;
		j = req->nr_port_idx;

		NMG_LOCK();
		for (error = ENOENT; i < NM_BRIDGES; i++) {
			b = bridges + i;
			for ( ; j < NM_BDG_MAXPORTS; j++) {
				if (b->bdg_ports[j] == NULL)
					continue;
				vpna = b->bdg_ports[j];
				/* write back the VALE switch name */
				strlcpy(hdr->nr_name, vpna->up.name,
					sizeof(hdr->nr_name));
				error = 0;
				goto out;
			}
			j = 0; /* following bridges scan from 0 */
		}
	out:
		req->nr_bridge_idx = i;
		req->nr_port_idx = j;
		NMG_UNLOCK();
	}

	return error;
}
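/*
 * A userspace sketch of how this request can be used to enumerate all
 * ports (error handling omitted, fd is an open "/dev/netmap"):
 *
 *	struct nmreq_vale_list req = { 0 };
 *	struct nmreq_header hdr = { 0 };
 *
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_VALE_LIST;
 *	hdr.nr_body = (uintptr_t)&req;
 *	while (ioctl(fd, NIOCCTRL, &hdr) == 0) {	// hdr.nr_name empty
 *		printf("%s: bridge %u port %u\n", hdr.nr_name,
 *		    req.nr_bridge_idx, req.nr_port_idx);
 *		req.nr_port_idx++;	// resume after the last hit
 *		hdr.nr_name[0] = '\0';	// keep enumerating
 *	}
 */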

/* Process NETMAP_REQ_VALE_ATTACH.
 */
int
netmap_vale_attach(struct nmreq_header *hdr, void *auth_token)
{
	struct nmreq_vale_attach *req =
		(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na = NULL;
	struct netmap_mem_d *nmd = NULL;
	struct nm_bridge *b = NULL;
	int error;

	NMG_LOCK();
	/* permission check for modified bridges */
	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
		error = EACCES;
		goto unlock_exit;
	}

	if (req->reg.nr_mem_id) {
		nmd = netmap_mem_find(req->reg.nr_mem_id);
		if (nmd == NULL) {
			error = EINVAL;
			goto unlock_exit;
		}
	}

	/* check for an existing one */
	error = netmap_get_vale_na(hdr, &na, nmd, 0);
	if (na) {
		error = EBUSY;
		goto unref_exit;
	}
	error = netmap_get_vale_na(hdr, &na,
				nmd, 1 /* create if not exists */);
	if (error) { /* no device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (NETMAP_OWNED_BY_ANY(na)) {
		error = EBUSY;
		goto unref_exit;
	}

	if (na->nm_bdg_ctl) {
		/* nop for VALE ports. The bwrap needs to put the hwna
		 * in netmap mode (see netmap_bwrap_bdg_ctl)
		 */
		error = na->nm_bdg_ctl(hdr, na);
		if (error)
			goto unref_exit;
		nm_prdis("registered %s to netmap-mode", na->name);
	}
	vpna = (struct netmap_vp_adapter *)na;
	req->port_index = vpna->bdg_port;

	if (nmd)
		netmap_mem_put(nmd);

	NMG_UNLOCK();
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	if (nmd)
		netmap_mem_put(nmd);

	NMG_UNLOCK();
	return error;
}
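/*
 * A userspace sketch of the matching request, attaching NIC "em0" to
 * switch "vale0" (names illustrative, error handling omitted):
 *
 *	struct nmreq_vale_attach req = { 0 };
 *	struct nmreq_header hdr = { 0 };
 *
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_VALE_ATTACH;
 *	hdr.nr_body = (uintptr_t)&req;
 *	strlcpy(hdr.nr_name, "vale0:em0", sizeof(hdr.nr_name));
 *	req.reg.nr_mode = NR_REG_ALL_NIC;
 *	ioctl(fd, NIOCCTRL, &hdr);	// on success req.port_index is set
 */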

/* Process NETMAP_REQ_VALE_DETACH.
 */
int
netmap_vale_detach(struct nmreq_header *hdr, void *auth_token)
{
	struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body;
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	struct nm_bridge *b = NULL;
	int error;

	NMG_LOCK();
	/* permission check for modified bridges */
	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
		error = EACCES;
		goto unlock_exit;
	}

	error = netmap_get_vale_na(hdr, &na, NULL, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	} else if (nm_is_bwrap(na) &&
		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
		/* don't detach a NIC with polling active */
		error = EBUSY;
		goto unref_exit;
	}

	vpna = (struct netmap_vp_adapter *)na;
	if (na->na_vp != vpna) {
		/* trying to detach the first attachment of a VALE
		 * persistent port attached to two bridges
		 */
		error = EBUSY;
		goto unref_exit;
	}
	nmreq_det->port_index = vpna->bdg_port;

	if (na->nm_bdg_ctl) {
		/* remove the port from the bridge. The bwrap
		 * also needs to put the hwna back in normal mode
		 */
		error = na->nm_bdg_ctl(hdr, na);
	}

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}


/* nm_dtor callback for ephemeral VALE ports */
static void
netmap_vale_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;

	nm_prdis("%s has %d references", na->name, na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}

	if (na->ifp != NULL && !nm_iszombie(na)) {
		NM_DETACH_NA(na->ifp);
		if (vpna->autodelete) {
			nm_prdis("releasing %s", na->ifp->if_xname);
			NMG_UNLOCK();
			nm_os_vi_detach(na->ifp);
			NMG_LOCK();
		}
	}
}



/* nm_krings_create callback for VALE ports.
 * Calls the standard netmap_krings_create, then adds leases on rx
 * rings and bdgfwd on tx rings.
 */
static int
netmap_vale_vp_krings_create(struct netmap_adapter *na)
{
	u_int tailroom;
	int error, i;
	uint32_t *leases;
	u_int nrx = netmap_real_rings(na, NR_RX);

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i]->nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


/* nm_krings_delete callback for VALE ports. */
static void
netmap_vale_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_vale_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * main dispatch routine for the bridge.
 * Grab packets from a kring, move them into the ft structure
 * associated with the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_vale_preflush(struct netmap_kring *kring, u_int end)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter*)kring->na;
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int ring_nr = kring->ring_id;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	nm_prdis("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return j;
	nm_prdis(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;
		ft[ft_i].ft_offset = 0;

		nm_prdis("flags is 0x%x", slot->flags);
		/* we do not use the buf changed flag, but we still need to reset it */
		slot->flags &= ~NS_BUF_CHANGED;

		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
		if (unlikely(buf == NULL)) {
			nm_prlim(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
				kring->name, j, ft[ft_i].ft_len);
			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
			ft[ft_i].ft_len = 0;
			ft[ft_i].ft_flags = 0;
		}
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			nm_prlim(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		/* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
		 * have to fix frags count. */
		frags--;
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags;
		nm_prlim(5, "Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
	}
	if (ft_i)
		ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobb's Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)                                                    \
do {                                                                    \
	a -= b; a -= c; a ^= (c >> 13);                                 \
	b -= c; b -= a; b ^= (a << 8);                                  \
	c -= a; c -= b; c ^= (b >> 13);                                 \
	a -= b; a -= c; a ^= (c >> 12);                                 \
	b -= c; b -= a; b ^= (a << 16);                                 \
	c -= a; c -= b; c ^= (b >> 5);                                  \
	a -= b; a -= c; a ^= (c >> 3);                                  \
	b -= c; b -= a; b ^= (a << 10);                                 \
	c -= a; c -= b; c ^= (b >> 15);                                 \
} while (/*CONSTCOND*/0)


static __inline uint32_t
nm_vale_rthash(const uint8_t *addr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

	b += addr[5] << 8;
	b += addr[4];
	a += addr[3] << 24;
	a += addr[2] << 16;
	a += addr[1] << 8;
	a += addr[0];

	mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
	return (c & BRIDGE_RTHASH_MASK);
}
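/* The mask above relies on NM_BDG_HASH being a power of two. */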

#undef mix


/*
 * Lookup function for a learning bridge.
 * Update the hash table with the source address, then return the
 * destination port index (and the ring in *dst_ring; at the moment
 * we always use ring 0).
 */
uint32_t
netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
		struct netmap_vp_adapter *na, void *private_data)
{
	uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset;
	u_int buf_len = ft->ft_len - ft->ft_offset;
	struct nm_hash_ent *ht = private_data;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;
	uint8_t indbuf[12];

	if (buf_len < 14) {
		return NM_BDG_NOPORT;
	}

	if (ft->ft_flags & NS_INDIRECT) {
		if (copyin(buf, indbuf, sizeof(indbuf))) {
			return NM_BDG_NOPORT;
		}
		buf = indbuf;
	}

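	/*
	 * Load the two 48-bit MAC addresses with two unaligned 64-bit
	 * little-endian reads: bytes 0..5 are the destination address
	 * (mask off the top two bytes), bytes 6..11 the source address
	 * (read from offset 4, then shift out the low 16 bits, i.e.
	 * bytes 4 and 5).
	 */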
	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive; there might be some
	 * worthwhile optimizations here.
	 */
	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_vale_rthash(s); /* hash of source */
		/* update source port forwarding entry */
		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_debug & NM_DEBUG_VALE)
		    nm_prinf("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_vale_rthash(buf); /* hash of dst */
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
	}
	return dst;
}


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_tail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		nm_prerr("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
		    k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
		    k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}




/* Make a lease on the kring for n positions and return the
 * lease index.
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

#ifdef CONFIG_NETMAP_DEBUG
	if (n > nm_kr_space(k, is_rx)) {
		nm_prerr("invalid request for %d slots", n);
		panic("x");
	}
#endif /* CONFIG_NETMAP_DEBUG */
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

#ifdef CONFIG_NETMAP_DEBUG
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		nm_prerr("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->name,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif /* CONFIG_NETMAP_DEBUG */
	return lease_idx;
}
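/*
 * Lease protocol: a sender reserves slots with nm_kr_lease() and then
 * releases the ring lock while copying into them. On completion it
 * stores its final write position in nkr_leases[lease_idx]; the sender
 * whose lease starts at nr_hwtail scans the subsequent leases, advances
 * nr_hwtail past all completed ranges and notifies the receiver (see
 * the tail update at the end of nm_vale_flush() below).
 */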

/*
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
		u_int ring_nr)
{
	struct nm_vale_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, me = na->bdg_port;

	/*
	 * The work area (pointed to by ft) is followed by an array of
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_vale_q *d;
		struct nm_bdg_fwd *start_ft = NULL;

		nm_prdis("slot %d frags %d", i, ft[i].ft_frags);

		if (na->up.virt_hdr_len < ft[i].ft_len) {
			ft[i].ft_offset = na->up.virt_hdr_len;
			start_ft = &ft[i];
		} else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) {
			ft[i].ft_offset = ft[i].ft_len;
			start_ft = &ft[i+1];
		} else {
			/* Drop the packet if the virtio-net header is not
			 * contained in the first fragment nor at the very
			 * beginning of the second.
			 */
			continue;
		}
		dst_port = b->bdg_ops.lookup(start_ft, &dst_ring, na, b->private_data);
		if (netmap_verbose > 255)
			nm_prlim(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port >= NM_BDG_NOPORT)
			continue; /* this packet was marked to be dropped */
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		u_int j;
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	nm_prdis(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations */
	for (i = 0; i < num_dsts; i++) {
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_vale_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		nm_prdis("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!nm_netmap_on(&dst_na->up))) {
			nm_prdis("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a chance
		 * that we may not use all of the slots we have claimed,
		 * and we will need to handle the leftover ones when we
		 * regain the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
			if (netmap_verbose) {
				nm_prlim(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
						dst_na->up.virt_hdr_len);
			}
			/* There is a virtio-net header/offloadings mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * dst_na->mfs
				 * i.e. x >= needed * na->mfs / (dst_na->mfs - H),
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
				needed = (needed * na->mfs) /
						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				nm_prdis(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}

		nm_prdis(5, "pass 2 dst %d is %x %s",
			i, d_i, is_vp ? "virtual" : "nic/host");
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		/* the destination ring may not have been opened for RX */
		if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
			goto cleanup;
		lim = kring->nkr_num_slots - 1;

retry:

		if (dst_na->retry && retry) {
			/* try to get some free slot from the previous run */
			kring->nm_notify(kring, NAF_FORCE_RECLAIM);
			/* actually useful only for bwraps, since there
			 * the notify will trigger a txsync on the hwna. VALE ports
			 * have dst_na->retry == 0
			 */
		}
		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;

			/* find the queue from which we pick next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
			    break; /* no more space */
			if (netmap_verbose && cnt > 1)
				nm_prlim(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			if (unlikely(virt_hdr_mismatch)) {
				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
			} else {
				howmany -= cnt;
				do {
					char *dst, *src = ft_p->ft_buf;
					size_t copy_len = ft_p->ft_len, dst_len = copy_len;

					slot = &ring->slot[j];
					dst = NMB(&dst_na->up, slot);

					nm_prdis("send [%d] %d(%d) bytes at %s:%d",
							i, (int)copy_len, (int)dst_len,
							NM_IFPNAME(dst_ifp), j);
					/* round to a multiple of 64 */
					copy_len = (copy_len + 63) & ~63;

					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
						     copy_len > NETMAP_BUF_SIZE(&na->up))) {
						nm_prlim(5, "invalid len %d, down to 64", (int)copy_len);
						copy_len = dst_len = 64; // XXX
					}
					if (ft_p->ft_flags & NS_INDIRECT) {
						if (copyin(src, dst, copy_len)) {
							// invalid user pointer, pretend len is 0
							dst_len = 0;
						}
					} else {
						//memcpy(dst, src, copy_len);
						pkt_copy(src, dst, (int)copy_len);
					}
					slot->len = dst_len;
					slot->flags = (cnt << 8) | NS_MOREFRAG;
					j = nm_next(j, lim);
					needed--;
					ft_p++;
				} while (ft_p != ft_end);
				slot->flags = (cnt << 8); /* clear flag on last entry */
			}
			/* are we done? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
		    /* current position */
		    uint32_t *p = kring->nkr_leases; /* shorthand */
		    uint32_t update_pos;
		    int still_locked = 1;

		    mtx_lock(&kring->q_lock);
		    if (unlikely(howmany > 0)) {
			/* we have not used all the buffers. If we are the
			 * last lease holder we can recover the slots,
			 * otherwise we must fill them with len 0 to mark
			 * empty packets.
			 */
			nm_prdis("leftover %d bufs", howmany);
			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
			    /* yes, we are the last one */
			    nm_prdis("roll back nkr_hwlease to %d", j);
			    kring->nkr_hwlease = j;
			} else {
			    while (howmany-- > 0) {
				ring->slot[j].len = 0;
				ring->slot[j].flags = 0;
				j = nm_next(j, lim);
			    }
			}
		    }
		    p[lease_idx] = j; /* report I am done */

		    update_pos = kring->nr_hwtail;

		    if (my_start == update_pos) {
			/* all slots before my_start have been reported,
			 * so scan subsequent leases to see if other ranges
			 * have been completed, and do a selwakeup or txsync.
			 */
			while (lease_idx != kring->nkr_lease_idx &&
				p[lease_idx] != NR_NOSLOT) {
			    j = p[lease_idx];
			    p[lease_idx] = NR_NOSLOT;
			    lease_idx = nm_next(lease_idx, lim);
			}
			/* j is the new 'write' position. j != my_start
			 * means there are new buffers to report
			 */
			if (likely(j != my_start)) {
				kring->nr_hwtail = j;
				still_locked = 0;
				mtx_unlock(&kring->q_lock);
				kring->nm_notify(kring, 0);
				/* this is netmap_notify for VALE ports and
				 * netmap_bwrap_notify for bwrap. The latter will
				 * trigger a txsync on the underlying hwna
				 */
				if (dst_na->retry && retry--) {
					/* XXX this is going to call nm_notify again.
					 * Only useful for bwrap in virtual machines
					 */
					goto retry;
				}
			}
		    }
		    if (still_locked)
			mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}

/* nm_txsync callback for VALE ports */
static int
netmap_vale_vp_txsync(struct netmap_kring *kring, int flags)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter *)kring->na;
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;

	if (bridge_batch <= 0) { /* testing only */
		done = head; // used all
		goto done;
	}
	if (!na->na_bdg) {
		done = head;
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_vale_preflush(kring, head);
done:
	if (done != head)
		nm_prerr("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'cur' are left unsent.
	 */
	kring->nr_hwcur = done;
	kring->nr_hwtail = nm_prev(done, lim);
	if (netmap_debug & NM_DEBUG_TXSYNC)
		nm_prinf("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
	return 0;
}


/* create a netmap_vp_adapter that describes a VALE port.
 * Only persistent VALE ports have a non-null ifp.
 */
static int
netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *ifp,
		struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret)
{
	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	int error = 0;
	u_int npipes = 0;
	u_int extrabufs = 0;

	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
		return EINVAL;
	}

	vpna = nm_os_malloc(sizeof(*vpna));
	if (vpna == NULL)
		return ENOMEM;

	na = &vpna->up;

	na->ifp = ifp;
	strlcpy(na->name, hdr->nr_name, sizeof(na->name));

	/* bounds checking */
	na->num_tx_rings = req->nr_tx_rings;
	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	req->nr_tx_rings = na->num_tx_rings; /* write back */
	na->num_rx_rings = req->nr_rx_rings;
	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	req->nr_rx_rings = na->num_rx_rings; /* write back */
	nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	na->num_tx_desc = req->nr_tx_slots;
	nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	/* validate number of pipes. We want at least 1,
	 * but probably can do with some more.
	 * So let's use 2 as default (when 0 is supplied)
	 */
	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
	/* validate extra bufs */
	extrabufs = req->nr_extra_bufs;
	nm_bound_var(&extrabufs, 0, 0,
			128*NM_BDG_MAXSLOTS, NULL);
	req->nr_extra_bufs = extrabufs; /* write back */
	na->num_rx_desc = req->nr_rx_slots;
	/* Set the mfs to a default value, as it is needed on the VALE
	 * mismatch datapath. XXX We should set it according to the MTU
	 * known to the kernel. */
	vpna->mfs = NM_BDG_MFS_DEFAULT;
	vpna->last_smac = ~0llu;
	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
		vpna->mfs = netmap_buf_size; */
	if (netmap_verbose)
		nm_prinf("max frame size %u", vpna->mfs);

	na->na_flags |= NAF_BDG_MAYSLEEP;
	/* persistent VALE ports look like hw devices
	 * with a native netmap adapter
	 */
	if (ifp)
		na->na_flags |= NAF_NATIVE;
	na->nm_txsync = netmap_vale_vp_txsync;
	na->nm_rxsync = netmap_vp_rxsync; /* use the one provided by bdg */
	na->nm_register = netmap_vp_reg;  /* use the one provided by bdg */
	na->nm_krings_create = netmap_vale_vp_krings_create;
	na->nm_krings_delete = netmap_vale_vp_krings_delete;
	na->nm_dtor = netmap_vale_vp_dtor;
	nm_prdis("nr_mem_id %d", req->nr_mem_id);
	na->nm_mem = nmd ?
		netmap_mem_get(nmd):
		netmap_mem_private_new(
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc,
			req->nr_extra_bufs, npipes, &error);
	if (na->nm_mem == NULL)
		goto err;
	na->nm_bdg_attach = netmap_vale_vp_bdg_attach;
	/* other nmd fields are set in the common routine */
	error = netmap_attach_common(na);
	if (error)
		goto err;
	*ret = vpna;
	return 0;

err:
	if (na->nm_mem != NULL)
		netmap_mem_put(na->nm_mem);
	nm_os_free(vpna);
	return error;
}

/* nm_bdg_attach callback for VALE ports
 * The na_vp port is this same netmap_adapter. There is no host port.
 */
static int
netmap_vale_vp_bdg_attach(const char *name, struct netmap_adapter *na,
		struct nm_bridge *b)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;

	if ((b->bdg_flags & NM_BDG_NEED_BWRAP) || vpna->na_bdg) {
		return NM_NEED_BWRAP;
	}
	na->na_vp = vpna;
	strlcpy(na->name, name, sizeof(na->name));
	na->na_hostvp = NULL;
	return 0;
}

static int
netmap_vale_bwrap_krings_create(struct netmap_adapter *na)
{
	int error;

	/* impersonate a netmap_vp_adapter */
	error = netmap_vale_vp_krings_create(na);
	if (error)
		return error;
	error = netmap_bwrap_krings_create_common(na);
	if (error) {
		netmap_vale_vp_krings_delete(na);
	}
	return error;
}

static void
netmap_vale_bwrap_krings_delete(struct netmap_adapter *na)
{
	netmap_bwrap_krings_delete_common(na);
	netmap_vale_vp_krings_delete(na);
}

static int
netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
{
	struct netmap_bwrap_adapter *bna;
	struct netmap_adapter *na = NULL;
	struct netmap_adapter *hostna = NULL;
	int error;

	bna = nm_os_malloc(sizeof(*bna));
	if (bna == NULL) {
		return ENOMEM;
	}
	na = &bna->up.up;
	strlcpy(na->name, nr_name, sizeof(na->name));
	na->nm_register = netmap_bwrap_reg;
	na->nm_txsync = netmap_vale_vp_txsync;
	// na->nm_rxsync = netmap_bwrap_rxsync;
	na->nm_krings_create = netmap_vale_bwrap_krings_create;
	na->nm_krings_delete = netmap_vale_bwrap_krings_delete;
	na->nm_notify = netmap_bwrap_notify;
	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
	/* Set the mfs, needed on the VALE mismatch datapath. */
	bna->up.mfs = NM_BDG_MFS_DEFAULT;

	if (hwna->na_flags & NAF_HOST_RINGS) {
		hostna = &bna->host.up;
		hostna->nm_notify = netmap_bwrap_notify;
		bna->host.mfs = NM_BDG_MFS_DEFAULT;
	}

	error = netmap_bwrap_attach_common(na, hwna);
	if (error) {
		nm_os_free(bna);
	}
	return error;
}

int
netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na,
		struct netmap_mem_d *nmd, int create)
{
	return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops);
}


/* creates a persistent VALE port */
int
nm_vi_create(struct nmreq_header *hdr)
{
	struct nmreq_vale_newif *req =
		(struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body;
	int error = 0;
	/* Build a nmreq_register out of the nmreq_vale_newif,
	 * so that we can call netmap_get_bdg_na(). */
	struct nmreq_register regreq;
	bzero(&regreq, sizeof(regreq));
	regreq.nr_tx_slots = req->nr_tx_slots;
	regreq.nr_rx_slots = req->nr_rx_slots;
	regreq.nr_tx_rings = req->nr_tx_rings;
	regreq.nr_rx_rings = req->nr_rx_rings;
	regreq.nr_mem_id = req->nr_mem_id;
	hdr->nr_reqtype = NETMAP_REQ_REGISTER;
	hdr->nr_body = (uintptr_t)&regreq;
	error = netmap_vi_create(hdr, 0 /* no autodelete */);
	hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF;
	hdr->nr_body = (uintptr_t)req;
	/* Write back to the original struct. */
	req->nr_tx_slots = regreq.nr_tx_slots;
	req->nr_rx_slots = regreq.nr_rx_slots;
	req->nr_tx_rings = regreq.nr_tx_rings;
	req->nr_rx_rings = regreq.nr_rx_rings;
	req->nr_mem_id = regreq.nr_mem_id;
	return error;
}
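/*
 * A userspace sketch of the matching request (the port name is
 * illustrative, error handling omitted):
 *
 *	struct nmreq_vale_newif req = { 0 };
 *	struct nmreq_header hdr = { 0 };
 *
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_VALE_NEWIF;
 *	hdr.nr_body = (uintptr_t)&req;
 *	strlcpy(hdr.nr_name, "myport", sizeof(hdr.nr_name));
 *	ioctl(fd, NIOCCTRL, &hdr);	// then attach with NETMAP_REQ_VALE_ATTACH
 */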

/* remove a persistent VALE port from the system */
int
nm_vi_destroy(const char *name)
{
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	int error;

	ifp = ifunit_ref(name);
	if (!ifp)
		return ENXIO;
	NMG_LOCK();
	/* make sure this is actually a VALE port */
	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
		error = EINVAL;
		goto err;
	}

	vpna = (struct netmap_vp_adapter *)NA(ifp);

	/* we can only destroy ports that were created via NETMAP_REQ_VALE_NEWIF */
	if (vpna->autodelete) {
		error = EINVAL;
		goto err;
	}

	/* also make sure that nobody is using the interface */
	if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
	    vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
		error = EBUSY;
		goto err;
	}

	NMG_UNLOCK();

	if (netmap_verbose)
		nm_prinf("destroying a persistent vale interface %s", ifp->if_xname);
	/* Linux requires that all references be released
	 * before unregistering
	 */
	netmap_detach(ifp);
	if_rele(ifp);
	nm_os_vi_detach(ifp);
	return 0;

err:
	NMG_UNLOCK();
	if_rele(ifp);
	return error;
}

static int
nm_update_info(struct nmreq_register *req, struct netmap_adapter *na)
{
	req->nr_rx_rings = na->num_rx_rings;
	req->nr_tx_rings = na->num_tx_rings;
	req->nr_rx_slots = na->num_rx_desc;
	req->nr_tx_slots = na->num_tx_desc;
	return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL,
					&req->nr_mem_id);
}


/*
 * Create a virtual interface registered to the system.
 * The interface will be attached to a bridge later.
 */
int
netmap_vi_create(struct nmreq_header *hdr, int autodelete)
{
	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	struct netmap_mem_d *nmd = NULL;
	int error;

	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
		return EINVAL;
	}

	/* the name must not include the VALE prefix */
	if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
		return EINVAL;
	if (strlen(hdr->nr_name) >= IFNAMSIZ) {
		return EINVAL;
	}
	ifp = ifunit_ref(hdr->nr_name);
	if (ifp) { /* already exists, cannot create a new one */
		error = EEXIST;
		NMG_LOCK();
		if (NM_NA_VALID(ifp)) {
			int update_err = nm_update_info(req, NA(ifp));
			if (update_err)
				error = update_err;
		}
		NMG_UNLOCK();
		if_rele(ifp);
		return error;
	}
	error = nm_os_vi_persist(hdr->nr_name, &ifp);
	if (error)
		return error;

	NMG_LOCK();
	if (req->nr_mem_id) {
		nmd = netmap_mem_find(req->nr_mem_id);
		if (nmd == NULL) {
			error = EINVAL;
			goto err_1;
		}
	}
	/* netmap_vale_vp_create creates a struct netmap_vp_adapter */
	error = netmap_vale_vp_create(hdr, ifp, nmd, &vpna);
	if (error) {
		if (netmap_debug & NM_DEBUG_VALE)
			nm_prerr("error %d", error);
		goto err_1;
	}
	/* persist-specific routines */
	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
	if (!autodelete) {
		netmap_adapter_get(&vpna->up);
	} else {
		vpna->autodelete = 1;
	}
	NM_ATTACH_NA(ifp, &vpna->up);
	/* return the updated info */
	error = nm_update_info(req, &vpna->up);
	if (error) {
		goto err_2;
	}
	nm_prdis("returning nr_mem_id %d", req->nr_mem_id);
	if (nmd)
		netmap_mem_put(nmd);
	NMG_UNLOCK();
	nm_prdis("created %s", ifp->if_xname);
	return 0;

err_2:
	netmap_detach(ifp);
err_1:
	if (nmd)
		netmap_mem_put(nmd);
	NMG_UNLOCK();
	nm_os_vi_detach(ifp);

	return error;
}

#endif /* WITH_VALE */
