1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * This file implements multiple network backends (tap, netmap, ...),
30 * to be used by network frontends such as virtio-net and e1000.
31 * The API to access the backend (e.g. send/receive packets, negotiate
32 * features) is exported by net_backends.h.
33 */
34
35#include <sys/types.h>
36#ifndef WITHOUT_CAPSICUM
37#include <sys/capsicum.h>
38#endif
39#include <sys/ioctl.h>
40#include <sys/mman.h>
41#include <sys/uio.h>
42
43#include <net/if.h>
44#include <net/if_tap.h>
45
46#include <assert.h>
47#ifndef WITHOUT_CAPSICUM
48#include <capsicum_helpers.h>
49#endif
50#include <err.h>
51#include <errno.h>
52#include <fcntl.h>
53#include <poll.h>
54#include <pthread.h>
55#include <pthread_np.h>
56#include <stdio.h>
57#include <stdlib.h>
58#include <stdint.h>
59#include <string.h>
60#include <sysexits.h>
61#include <unistd.h>
62
63#include "config.h"
64#include "debug.h"
65#include "iov.h"
66#include "mevent.h"
67#include "net_backends.h"
68#include "net_backends_priv.h"
69#include "pci_emul.h"
70
/*
 * Number of bytes to allocate for a backend instance: the generic
 * struct net_backend itself plus the priv_size bytes of backend-private
 * state requested by the backend.  The argument is fully parenthesized so
 * that any pointer expression can be passed.
 */
#define	NET_BE_SIZE(be)		(sizeof(*(be)) + (be)->priv_size)
72
73void
74tap_cleanup(struct net_backend *be)
75{
76	struct tap_priv *priv = NET_BE_PRIV(be);
77
78	if (priv->mevp) {
79		mevent_delete(priv->mevp);
80	}
81	if (be->fd != -1) {
82		close(be->fd);
83		be->fd = -1;
84	}
85}
86
87static int
88tap_init(struct net_backend *be, const char *devname,
89    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
90{
91	struct tap_priv *priv = NET_BE_PRIV(be);
92	char tbuf[80];
93	int opt = 1, up = IFF_UP;
94
95#ifndef WITHOUT_CAPSICUM
96	cap_rights_t rights;
97#endif
98
99	if (cb == NULL) {
100		EPRINTLN("TAP backend requires non-NULL callback");
101		return (-1);
102	}
103
104	strcpy(tbuf, "/dev/");
105	strlcat(tbuf, devname, sizeof(tbuf));
106
107	be->fd = open(tbuf, O_RDWR);
108	if (be->fd == -1) {
109		EPRINTLN("open of tap device %s failed", tbuf);
110		goto error;
111	}
112
113	/*
114	 * Set non-blocking and register for read
115	 * notifications with the event loop
116	 */
117	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
118		EPRINTLN("tap device O_NONBLOCK failed");
119		goto error;
120	}
121
122	if (ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) {
123		EPRINTLN("tap device link up failed");
124		goto error;
125	}
126
127#ifndef WITHOUT_CAPSICUM
128	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
129	if (caph_rights_limit(be->fd, &rights) == -1)
130		errx(EX_OSERR, "Unable to apply rights for sandbox");
131#endif
132
133	memset(priv->bbuf, 0, sizeof(priv->bbuf));
134	priv->bbuflen = 0;
135
136	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
137	if (priv->mevp == NULL) {
138		EPRINTLN("Could not register event");
139		goto error;
140	}
141
142	return (0);
143
144error:
145	tap_cleanup(be);
146	return (-1);
147}
148
149/*
150 * Called to send a buffer chain out to the tap device
151 */
152ssize_t
153tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
154{
155	return (writev(be->fd, iov, iovcnt));
156}
157
158ssize_t
159tap_peek_recvlen(struct net_backend *be)
160{
161	struct tap_priv *priv = NET_BE_PRIV(be);
162	ssize_t ret;
163
164	if (priv->bbuflen > 0) {
165		/*
166		 * We already have a packet in the bounce buffer.
167		 * Just return its length.
168		 */
169		return priv->bbuflen;
170	}
171
172	/*
173	 * Read the next packet (if any) into the bounce buffer, so
174	 * that we get to know its length and we can return that
175	 * to the caller.
176	 */
177	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
178	if (ret < 0 && errno == EWOULDBLOCK) {
179		return (0);
180	}
181
182	if (ret > 0)
183		priv->bbuflen = ret;
184
185	return (ret);
186}
187
188ssize_t
189tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
190{
191	struct tap_priv *priv = NET_BE_PRIV(be);
192	ssize_t ret;
193
194	if (priv->bbuflen > 0) {
195		/*
196		 * A packet is available in the bounce buffer, so
197		 * we read it from there.
198		 */
199		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
200		    iov, iovcnt, 0);
201
202		/* Mark the bounce buffer as empty. */
203		priv->bbuflen = 0;
204
205		return (ret);
206	}
207
208	ret = readv(be->fd, iov, iovcnt);
209	if (ret < 0 && errno == EWOULDBLOCK) {
210		return (0);
211	}
212
213	return (ret);
214}
215
216void
217tap_recv_enable(struct net_backend *be)
218{
219	struct tap_priv *priv = NET_BE_PRIV(be);
220
221	mevent_enable(priv->mevp);
222}
223
224void
225tap_recv_disable(struct net_backend *be)
226{
227	struct tap_priv *priv = NET_BE_PRIV(be);
228
229	mevent_disable(priv->mevp);
230}
231
/*
 * Capability query for tap-style backends.  The backend argument is not
 * consulted and the result is always the empty capability set.
 */
uint64_t
tap_get_cap(struct net_backend *be __unused)
{
	const uint64_t no_caps = 0;	/* no capabilities for now */

	return (no_caps);
}
238
/*
 * Capability negotiation for tap-style backends.  Only the empty request
 * (no feature bits, zero vnet header length) is accepted; it returns 0.
 * Any other request returns -1.
 */
int
tap_set_cap(struct net_backend *be __unused, uint64_t features,
    unsigned vnet_hdr_len)
{

	if (features != 0 || vnet_hdr_len != 0)
		return (-1);

	return (0);
}
246
247static struct net_backend tap_backend = {
248	.prefix = "tap",
249	.priv_size = sizeof(struct tap_priv),
250	.init = tap_init,
251	.cleanup = tap_cleanup,
252	.send = tap_send,
253	.peek_recvlen = tap_peek_recvlen,
254	.recv = tap_recv,
255	.recv_enable = tap_recv_enable,
256	.recv_disable = tap_recv_disable,
257	.get_cap = tap_get_cap,
258	.set_cap = tap_set_cap,
259};
260
261/* A clone of the tap backend, with a different prefix. */
262static struct net_backend vmnet_backend = {
263	.prefix = "vmnet",
264	.priv_size = sizeof(struct tap_priv),
265	.init = tap_init,
266	.cleanup = tap_cleanup,
267	.send = tap_send,
268	.peek_recvlen = tap_peek_recvlen,
269	.recv = tap_recv,
270	.recv_enable = tap_recv_enable,
271	.recv_disable = tap_recv_disable,
272	.get_cap = tap_get_cap,
273	.set_cap = tap_set_cap,
274};
275
276DATA_SET(net_backend_set, tap_backend);
277DATA_SET(net_backend_set, vmnet_backend);
278
279int
280netbe_legacy_config(nvlist_t *nvl, const char *opts)
281{
282	char *backend, *cp;
283
284	if (opts == NULL)
285		return (0);
286
287	cp = strchr(opts, ',');
288	if (cp == NULL) {
289		set_config_value_node(nvl, "backend", opts);
290		return (0);
291	}
292	backend = strndup(opts, cp - opts);
293	set_config_value_node(nvl, "backend", backend);
294	free(backend);
295	return (pci_parse_legacy_config(nvl, cp + 1));
296}
297
298/*
299 * Initialize a backend and attach to the frontend.
300 * This is called during frontend initialization.
301 *  @ret is a pointer to the backend to be initialized
302 *  @devname is the backend-name as supplied on the command line,
303 * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
304 *  @cb is the receive callback supplied by the frontend,
305 *	and it is invoked in the event loop when a receive
306 *	event is generated in the hypervisor,
307 *  @param is a pointer to the frontend, and normally used as
308 *	the argument for the callback.
309 */
310int
311netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
312    void *param)
313{
314	struct net_backend **pbe, *nbe, *tbe = NULL;
315	const char *value, *type;
316	char *devname;
317	int err;
318
319	value = get_config_value_node(nvl, "backend");
320	if (value == NULL) {
321		return (-1);
322	}
323	devname = strdup(value);
324
325	/*
326	 * Use the type given by configuration if exists; otherwise
327	 * use the prefix of the backend as the type.
328	 */
329	type = get_config_value_node(nvl, "type");
330	if (type == NULL)
331		type = devname;
332
333	/*
334	 * Find the network backend that matches the user-provided
335	 * device name. net_backend_set is built using a linker set.
336	 */
337	SET_FOREACH(pbe, net_backend_set) {
338		if (strncmp(type, (*pbe)->prefix,
339		    strlen((*pbe)->prefix)) == 0) {
340			tbe = *pbe;
341			assert(tbe->init != NULL);
342			assert(tbe->cleanup != NULL);
343			assert(tbe->send != NULL);
344			assert(tbe->recv != NULL);
345			assert(tbe->get_cap != NULL);
346			assert(tbe->set_cap != NULL);
347			break;
348		}
349	}
350
351	*ret = NULL;
352	if (tbe == NULL) {
353		free(devname);
354		return (EINVAL);
355	}
356
357	nbe = calloc(1, NET_BE_SIZE(tbe));
358	*nbe = *tbe;	/* copy the template */
359	nbe->fd = -1;
360	nbe->sc = param;
361	nbe->be_vnet_hdr_len = 0;
362	nbe->fe_vnet_hdr_len = 0;
363
364	/* Initialize the backend. */
365	err = nbe->init(nbe, devname, nvl, cb, param);
366	if (err) {
367		free(devname);
368		free(nbe);
369		return (err);
370	}
371
372	*ret = nbe;
373	free(devname);
374
375	return (0);
376}
377
378void
379netbe_cleanup(struct net_backend *be)
380{
381
382	if (be != NULL) {
383		be->cleanup(be);
384		free(be);
385	}
386}
387
388uint64_t
389netbe_get_cap(struct net_backend *be)
390{
391
392	assert(be != NULL);
393	return (be->get_cap(be));
394}
395
396int
397netbe_set_cap(struct net_backend *be, uint64_t features,
398	      unsigned vnet_hdr_len)
399{
400	int ret;
401
402	assert(be != NULL);
403
404	/* There are only three valid lengths, i.e., 0, 10 and 12. */
405	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
406		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
407		return (-1);
408
409	be->fe_vnet_hdr_len = vnet_hdr_len;
410
411	ret = be->set_cap(be, features, vnet_hdr_len);
412	assert(be->be_vnet_hdr_len == 0 ||
413	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
414
415	return (ret);
416}
417
418ssize_t
419netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
420{
421
422	return (be->send(be, iov, iovcnt));
423}
424
425ssize_t
426netbe_peek_recvlen(struct net_backend *be)
427{
428
429	return (be->peek_recvlen(be));
430}
431
432/*
433 * Try to read a packet from the backend, without blocking.
434 * If no packets are available, return 0. In case of success, return
435 * the length of the packet just read. Return -1 in case of errors.
436 */
437ssize_t
438netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
439{
440
441	return (be->recv(be, iov, iovcnt));
442}
443
/*
 * Read one packet from the backend and throw it away.
 * Returns the size of the dropped packet, zero if there was nothing to
 * read, or a negative value if the read failed.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: this scratch area only ever receives frames that are
	 * being dropped, so it does not need to be per-vtnet or locked.
	 * It is sized to hold a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec sink = {
		.iov_base = dummybuf,
		.iov_len = sizeof(dummybuf),
	};

	return (netbe_recv(be, &sink, 1));
}
465
466void
467netbe_rx_disable(struct net_backend *be)
468{
469
470	return be->recv_disable(be);
471}
472
473void
474netbe_rx_enable(struct net_backend *be)
475{
476
477	return be->recv_enable(be);
478}
479
480size_t
481netbe_get_vnet_hdr_len(struct net_backend *be)
482{
483
484	return (be->be_vnet_hdr_len);
485}
486