/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * This file implements multiple network backends (tap, netmap, ...),
 * to be used by network frontends such as virtio-net and e1000.
 * The API to access the backend (e.g. send/receive packets, negotiate
 * features) is exported by net_backends.h.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>		/* u_short etc */
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <net/if.h>
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sysexits.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <poll.h>

#ifdef NETGRAPH
#include <sys/param.h>
#include <sys/sysctl.h>
#include <netgraph.h>
#endif

#include "debug.h"
#include "iov.h"
#include "mevent.h"
#include "net_backends.h"

#include <sys/linker_set.h>

/*
 * Each network backend registers a set of function pointers that are
 * used to implement the net backend API.
 * This might need to be exposed if we implement backends in separate files.
 */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    const char *opts, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);

	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Get the length of the next packet that can be received from
	 * the backend. If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);

	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet of that length.
	 * The function returns 0 if the backend doesn't have a new
	 * packet to receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation. On
	 * return from a disable operation, it is guaranteed that the
	 * receive callback won't be called until receive is enabled
	 * again. Note however that it is up to the caller to make sure
	 * that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both the rx and tx directions, for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	struct pci_vtnet_softc *sc;
	int fd;

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Room for backend-specific data. */
	char opaque[];
};

SET_DECLARE(net_backend_set, struct net_backend);

#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

#define WPRINTF(params) PRINTLN params
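
/*
 * Illustrative sketch of how a new backend plugs in (the "null" backend
 * and its callbacks below are hypothetical, not part of this file):
 *
 *	static struct net_backend null_backend = {
 *		.prefix = "null",
 *		.priv_size = 0,
 *		.init = null_init,
 *		.cleanup = null_cleanup,
 *		.send = null_send,
 *		.peek_recvlen = null_peek_recvlen,
 *		.recv = null_recv,
 *		.recv_enable = null_recv_enable,
 *		.recv_disable = null_recv_disable,
 *		.get_cap = null_get_cap,
 *		.set_cap = null_set_cap,
 *	};
 *	DATA_SET(net_backend_set, null_backend);
 *
 * netbe_init() below selects a backend by matching the user-provided
 * device name against each registered prefix.
 */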

/*
 * The tap backend
 */

struct tap_priv {
	struct mevent *mevp;
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;
};
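
/*
 * Expected call pattern (illustrative): the frontend's rx handler first
 * calls the peek_recvlen callback to learn the length of the next packet
 * and size guest buffers accordingly, then calls recv, which drains the
 * packet either from the bounce buffer (if peek filled it) or directly
 * from the tap file descriptor.
 */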

static void
tap_cleanup(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (be->fd != -1) {
		close(be->fd);
		be->fd = -1;
	}
}

static int
tap_init(struct net_backend *be, const char *devname,
	 const char *opts, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	char tbuf[80];
	int opt = 1;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("TAP backend requires non-NULL callback"));
		return (-1);
	}

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	be->fd = open(tbuf, O_RDWR);
	if (be->fd == -1) {
		WPRINTF(("open of tap device %s failed", tbuf));
		goto error;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop.
	 */
	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(priv->bbuf, 0, sizeof(priv->bbuf));
	priv->bbuflen = 0;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

/*
 * Called to send a buffer chain out to the tap device.
 */
static ssize_t
tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	return (writev(be->fd, iov, iovcnt));
}

static ssize_t
tap_peek_recvlen(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * We already have a packet in the bounce buffer.
		 * Just return its length.
		 */
		return (priv->bbuflen);
	}

	/*
	 * Read the next packet (if any) into the bounce buffer, so
	 * that we get to know its length and we can return that
	 * to the caller.
	 */
	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	if (ret > 0)
		priv->bbuflen = ret;

	return (ret);
}

static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * A packet is available in the bounce buffer, so
		 * we read it from there.
		 */
		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
		    iov, iovcnt, 0);

		/* Mark the bounce buffer as empty. */
		priv->bbuflen = 0;

		return (ret);
	}

	ret = readv(be->fd, iov, iovcnt);
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	return (ret);
}

static void
tap_recv_enable(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	mevent_enable(priv->mevp);
}

static void
tap_recv_disable(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	mevent_disable(priv->mevp);
}

static uint64_t
tap_get_cap(struct net_backend *be)
{

	return (0); /* no capabilities for now */
}

static int
tap_set_cap(struct net_backend *be, uint64_t features,
		unsigned vnet_hdr_len)
{

	return ((features || vnet_hdr_len) ? -1 : 0);
}

static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);

#ifdef NETGRAPH

/*
 * Netgraph backend
 */

#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)
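
/*
 * Illustrative backend specification, as it would appear after the
 * frontend name on the bhyve command line (node and hook names are
 * examples only):
 *
 *	netgraph,path=vmbridge:,peerhook=link2,hook=vmlink,socket=bhyve
 *
 * ng_init() below parses these options: 'path' and 'peerhook' are
 * mandatory, 'hook' defaults to "vmlink", and without 'socket' an
 * anonymous socket node is created.
 */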

static int
ng_init(struct net_backend *be, const char *devname,
	 const char *opts, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *p = (struct tap_priv *)be->opaque;
	struct ngm_connect ngc;
	char *ngopts, *tofree;
	char nodename[NG_NODESIZ];
	int sbsz;
	int ctrl_sock;
	int flags;
	int path_provided;
	int peerhook_provided;
	int socket_provided;
	unsigned long maxsbsz;
	size_t msbsz;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("Netgraph backend requires non-NULL callback"));
		return (-1);
	}

	be->fd = -1;

	memset(&ngc, 0, sizeof(ngc));

	strncpy(ngc.ourhook, "vmlink", NG_HOOKSIZ - 1);

	tofree = ngopts = strdup(opts);

	if (ngopts == NULL) {
		WPRINTF(("strdup error"));
		return (-1);
	}

	socket_provided = 0;
	path_provided = 0;
	peerhook_provided = 0;

	while (ngopts != NULL) {
		char *value = ngopts;
		char *key;

		key = strsep(&value, "=");
		if (value == NULL)
			break;
		ngopts = value;
		(void) strsep(&ngopts, ",");

		if (strcmp(key, "socket") == 0) {
			strlcpy(nodename, value, sizeof(nodename));
			socket_provided = 1;
		} else if (strcmp(key, "path") == 0) {
			strncpy(ngc.path, value, NG_PATHSIZ - 1);
			path_provided = 1;
		} else if (strcmp(key, "hook") == 0) {
			strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);
		} else if (strcmp(key, "peerhook") == 0) {
			strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);
			peerhook_provided = 1;
		}
	}

	free(tofree);

	if (!path_provided) {
		WPRINTF(("path must be provided"));
		return (-1);
	}

	if (!peerhook_provided) {
		WPRINTF(("peer hook must be provided"));
		return (-1);
	}

	if (NgMkSockNode(socket_provided ? nodename : NULL,
		&ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));
		return (-1);
	}

	if (NgSendMsg(ctrl_sock, ".",
		NGM_GENERIC_COOKIE,
		NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));
		close(ctrl_sock);
		goto error;
	}

	close(ctrl_sock);

	flags = fcntl(be->fd, F_GETFL);

	if (flags < 0) {
		WPRINTF(("can't get socket flags"));
		goto error;
	}

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));
		goto error;
	}

	/*
	 * The default ng_socket(4) buffer size is too low.
	 * Use the lesser of NG_SBUF_MAX_SIZE and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
		NULL, 0) < 0) {
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
		goto error;
	}

	/*
	 * We can't set the socket buffer size to the kern.ipc.maxsockbuf
	 * value directly, as it takes the mbuf(9) overhead into account.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);
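
	/*
	 * Worked example (assuming the common amd64 values MSIZE = 256
	 * and MCLBYTES = 2048): with kern.ipc.maxsockbuf = 2097152, the
	 * scaling above yields 2097152 * 2048 / 2304 = 1864135, so sbsz
	 * ends up below the 4 MiB NG_SBUF_MAX_SIZE cap.
	 */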

	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
		sizeof(sbsz)) < 0) {
		WPRINTF(("can't set TX buffer size"));
		goto error;
	}

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
		sizeof(sbsz)) < 0) {
		WPRINTF(("can't set RX buffer size"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(p->bbuf, 0, sizeof(p->bbuf));
	p->bbuflen = 0;

	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);

#endif /* NETGRAPH */

/*
 * The netmap backend
 */

/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

struct netmap_priv {
	char ifname[IFNAMSIZ];
	struct nm_desc *nmd;
	uint16_t memid;
	struct netmap_ring *rx;
	struct netmap_ring *tx;
	struct mevent *mevp;
	net_be_rxeof_t cb;
	void *cb_param;
};

static void
nmreq_init(struct nmreq *req, char *ifname)
{

	memset(req, 0, sizeof(*req));
	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
	req->nr_version = NETMAP_API;
}

static int
netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
{
	int err;
	struct nmreq req;
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	nmreq_init(&req, priv->ifname);
	req.nr_cmd = NETMAP_BDG_VNET_HDR;
	req.nr_arg1 = vnet_hdr_len;
	err = ioctl(be->fd, NIOCREGIF, &req);
	if (err) {
		WPRINTF(("Unable to set vnet header length %d",
				vnet_hdr_len));
		return (err);
	}

	be->be_vnet_hdr_len = vnet_hdr_len;

	return (0);
}

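/*
 * Check whether the netmap port accepts a given vnet header length by
 * probing: try to set it, and on success restore the previous value so
 * that the probe leaves the port state unchanged.
 */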
static int
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
{
	int prev_hdr_len = be->be_vnet_hdr_len;
	int ret;

	if (vnet_hdr_len == prev_hdr_len) {
		return (1);
	}

	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	if (ret) {
		return (0);
	}

	netmap_set_vnet_hdr_len(be, prev_hdr_len);

	return (1);
}

static uint64_t
netmap_get_cap(struct net_backend *be)
{

	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
	    NETMAP_FEATURES : 0);
}

static int
netmap_set_cap(struct net_backend *be, uint64_t features,
	       unsigned vnet_hdr_len)
{

	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
}

static int
netmap_init(struct net_backend *be, const char *devname,
	    const char *opts, net_be_rxeof_t cb, void *param)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
	priv->ifname[sizeof(priv->ifname) - 1] = '\0';

	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
	if (priv->nmd == NULL) {
		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
			devname, strerror(errno)));
		return (-1);
	}

	priv->memid = priv->nmd->req.nr_arg2;
	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
	priv->cb = cb;
	priv->cb_param = param;
	be->fd = priv->nmd->fd;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		return (-1);
	}

	return (0);
}

static void
netmap_cleanup(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (priv->nmd) {
		nm_close(priv->nmd);
	}
	be->fd = -1;
}

static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
	    int iovcnt)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;
	int nm_buf_len;
	uint32_t head;
	void *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		int iov_frag_size = iov[j].iov_len;
		void *iov_frag_buf = iov[j].iov_base;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment across multiple netmap
		 * slots, if necessary.
		 */
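		/*
		 * Example: with nr_buf_size = 2048, a single 3000-byte
		 * fragment fills the current slot (len = 2048, NS_MOREFRAG
		 * set) and the remaining 952 bytes go into the next slot.
		 */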
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ?
			    iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				break;
			}

			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				   count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}

static ssize_t
netmap_peek_recvlen(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_ring *ring = priv->rx;
	uint32_t head = ring->head;
	ssize_t totlen = 0;

	while (head != ring->tail) {
		struct netmap_slot *slot = ring->slot + head;

		totlen += slot->len;
		if ((slot->flags & NS_MOREFRAG) == 0)
			break;
		head = nm_ring_next(ring, head);
	}

	return (totlen);
}

static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	void *iov_frag_buf;
	int iov_frag_size;
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		int nm_buf_len;
		void *nm_buf;

		if (head == ring->tail) {
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}

static void
netmap_recv_enable(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	mevent_enable(priv->mevp);
}

static void
netmap_recv_disable(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	mevent_disable(priv->mevp);
}

static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);

/*
 * Initialize a backend and attach it to the frontend.
 * This is called during frontend initialization.
 *  @ret returns a pointer to the initialized backend
 *  @opts is the backend name and options as supplied on the command
 *	line, e.g. -s 2:0,frontend-name,backend-name[,other-args]
 *  @cb is the receive callback supplied by the frontend,
 *	and it is invoked in the event loop when a receive
 *	event is generated in the hypervisor,
 *  @param is a pointer to the frontend, and normally used as
 *	the argument for the callback.
 */
int
netbe_init(struct net_backend **ret, const char *opts, net_be_rxeof_t cb,
    void *param)
{
	struct net_backend **pbe, *nbe, *tbe = NULL;
	char *devname;
	char *options;
	int err;

	devname = options = strdup(opts);

	if (devname == NULL) {
		return (-1);
	}

	devname = strsep(&options, ",");

	/*
	 * Find the network backend that matches the user-provided
	 * device name. net_backend_set is built using a linker set.
	 */
	SET_FOREACH(pbe, net_backend_set) {
		if (strncmp(devname, (*pbe)->prefix,
		    strlen((*pbe)->prefix)) == 0) {
			tbe = *pbe;
			assert(tbe->init != NULL);
			assert(tbe->cleanup != NULL);
			assert(tbe->send != NULL);
			assert(tbe->recv != NULL);
			assert(tbe->get_cap != NULL);
			assert(tbe->set_cap != NULL);
			break;
		}
	}

	*ret = NULL;
	if (tbe == NULL) {
		free(devname);
		return (EINVAL);
	}

	nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
	if (nbe == NULL) {
		free(devname);
		return (ENOMEM);
	}
	*nbe = *tbe;	/* copy the template */
	nbe->fd = -1;
	nbe->sc = param;
	nbe->be_vnet_hdr_len = 0;
	nbe->fe_vnet_hdr_len = 0;

	/* Initialize the backend. */
	err = nbe->init(nbe, devname, options, cb, param);
	if (err) {
		free(devname);
		free(nbe);
		return (err);
	}

	*ret = nbe;
	free(devname);

	return (0);
}
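
/*
 * Minimal usage sketch (illustrative; 'my_rxeof' and 'sc' stand in for
 * the frontend's rx callback and softc):
 *
 *	struct net_backend *be;
 *
 *	if (netbe_init(&be, "tap0", my_rxeof, sc) != 0)
 *		return;
 *	netbe_set_cap(be, netbe_get_cap(be), VNET_HDR_LEN);
 *	netbe_rx_enable(be);
 *
 * Transmission then goes through netbe_send(), while the rx callback
 * uses netbe_peek_recvlen() and netbe_recv() to drain packets.
 */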

void
netbe_cleanup(struct net_backend *be)
{

	if (be != NULL) {
		be->cleanup(be);
		free(be);
	}
}

uint64_t
netbe_get_cap(struct net_backend *be)
{

	assert(be != NULL);
	return (be->get_cap(be));
}

int
netbe_set_cap(struct net_backend *be, uint64_t features,
	      unsigned vnet_hdr_len)
{
	int ret;

	assert(be != NULL);

	/* There are only three valid lengths, i.e., 0, 10 and 12. */
	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
		return (-1);

	be->fe_vnet_hdr_len = vnet_hdr_len;

	ret = be->set_cap(be, features, vnet_hdr_len);
	assert(be->be_vnet_hdr_len == 0 ||
	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);

	return (ret);
}

ssize_t
netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->send(be, iov, iovcnt));
}

ssize_t
netbe_peek_recvlen(struct net_backend *be)
{

	return (be->peek_recvlen(be));
}

/*
 * Try to read a packet from the backend, without blocking.
 * If no packets are available, return 0. In case of success, return
 * the length of the packet just read. Return -1 in case of errors.
 */
ssize_t
netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->recv(be, iov, iovcnt));
}

/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was available.
 * A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov;

	iov.iov_base = dummybuf;
	iov.iov_len = sizeof(dummybuf);

	return (netbe_recv(be, &iov, 1));
}

void
netbe_rx_disable(struct net_backend *be)
{

	be->recv_disable(be);
}

void
netbe_rx_enable(struct net_backend *be)
{

	be->recv_enable(be);
}

size_t
netbe_get_vnet_hdr_len(struct net_backend *be)
{

	return (be->be_vnet_hdr_len);
}