1/*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS 19 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 20 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT 21 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28/* 29 * This file implements multiple network backends (tap, netmap, ...), 30 * to be used by network frontends such as virtio-net and e1000. 31 * The API to access the backend (e.g. send/receive packets, negotiate 32 * features) is exported by net_backends.h. 33 */ 34 35#include <sys/types.h> 36#ifndef WITHOUT_CAPSICUM 37#include <sys/capsicum.h> 38#endif 39#include <sys/ioctl.h> 40#include <sys/mman.h> 41#include <sys/uio.h> 42 43#include <net/if.h> 44#include <net/if_tap.h> 45 46#include <assert.h> 47#ifndef WITHOUT_CAPSICUM 48#include <capsicum_helpers.h> 49#endif 50#include <err.h> 51#include <errno.h> 52#include <fcntl.h> 53#include <poll.h> 54#include <pthread.h> 55#include <pthread_np.h> 56#include <stdio.h> 57#include <stdlib.h> 58#include <stdint.h> 59#include <string.h> 60#include <sysexits.h> 61#include <unistd.h> 62 63#include "config.h" 64#include "debug.h" 65#include "iov.h" 66#include "mevent.h" 67#include "net_backends.h" 68#include "net_backends_priv.h" 69#include "pci_emul.h" 70 71#define NET_BE_SIZE(be) (sizeof(*be) + (be)->priv_size) 72 73void 74tap_cleanup(struct net_backend *be) 75{ 76 struct tap_priv *priv = NET_BE_PRIV(be); 77 78 if (priv->mevp) { 79 mevent_delete(priv->mevp); 80 } 81 if (be->fd != -1) { 82 close(be->fd); 83 be->fd = -1; 84 } 85} 86 87static int 88tap_init(struct net_backend *be, const char *devname, 89 nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param) 90{ 91 struct tap_priv *priv = NET_BE_PRIV(be); 92 char tbuf[80]; 93 int opt = 1, up = IFF_UP; 94 95#ifndef WITHOUT_CAPSICUM 96 cap_rights_t rights; 97#endif 98 99 if (cb == NULL) { 100 EPRINTLN("TAP backend requires non-NULL callback"); 101 return (-1); 102 } 103 104 strcpy(tbuf, "/dev/"); 105 strlcat(tbuf, devname, sizeof(tbuf)); 106 107 be->fd = open(tbuf, O_RDWR); 108 if (be->fd == -1) { 109 EPRINTLN("open of tap device %s failed", tbuf); 110 goto error; 111 } 112 113 /* 114 * Set non-blocking and register for read 115 * notifications with the event loop 116 */ 117 if (ioctl(be->fd, FIONBIO, &opt) < 0) { 118 EPRINTLN("tap device O_NONBLOCK failed"); 119 goto error; 120 } 121 122 if (ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) { 123 EPRINTLN("tap device link up failed"); 124 goto error; 125 } 126 127#ifndef WITHOUT_CAPSICUM 128 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); 129 if (caph_rights_limit(be->fd, &rights) == -1) 130 errx(EX_OSERR, "Unable to apply rights for sandbox"); 131#endif 132 133 memset(priv->bbuf, 0, sizeof(priv->bbuf)); 134 priv->bbuflen = 0; 135 136 priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); 137 if (priv->mevp == NULL) { 138 EPRINTLN("Could not register event"); 139 goto error; 140 } 141 142 return (0); 143 144error: 145 tap_cleanup(be); 146 return (-1); 147} 148 149/* 150 * Called to send a buffer chain out to the tap device 151 */ 152ssize_t 153tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt) 154{ 155 return (writev(be->fd, iov, iovcnt)); 156} 157 158ssize_t 159tap_peek_recvlen(struct net_backend *be) 160{ 161 struct tap_priv *priv = NET_BE_PRIV(be); 162 ssize_t ret; 163 164 if (priv->bbuflen > 0) { 165 /* 166 * We already have a packet in the bounce buffer. 167 * Just return its length. 168 */ 169 return priv->bbuflen; 170 } 171 172 /* 173 * Read the next packet (if any) into the bounce buffer, so 174 * that we get to know its length and we can return that 175 * to the caller. 176 */ 177 ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf)); 178 if (ret < 0 && errno == EWOULDBLOCK) { 179 return (0); 180 } 181 182 if (ret > 0) 183 priv->bbuflen = ret; 184 185 return (ret); 186} 187 188ssize_t 189tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) 190{ 191 struct tap_priv *priv = NET_BE_PRIV(be); 192 ssize_t ret; 193 194 if (priv->bbuflen > 0) { 195 /* 196 * A packet is available in the bounce buffer, so 197 * we read it from there. 198 */ 199 ret = buf_to_iov(priv->bbuf, priv->bbuflen, 200 iov, iovcnt, 0); 201 202 /* Mark the bounce buffer as empty. */ 203 priv->bbuflen = 0; 204 205 return (ret); 206 } 207 208 ret = readv(be->fd, iov, iovcnt); 209 if (ret < 0 && errno == EWOULDBLOCK) { 210 return (0); 211 } 212 213 return (ret); 214} 215 216void 217tap_recv_enable(struct net_backend *be) 218{ 219 struct tap_priv *priv = NET_BE_PRIV(be); 220 221 mevent_enable(priv->mevp); 222} 223 224void 225tap_recv_disable(struct net_backend *be) 226{ 227 struct tap_priv *priv = NET_BE_PRIV(be); 228 229 mevent_disable(priv->mevp); 230} 231 232uint64_t 233tap_get_cap(struct net_backend *be __unused) 234{ 235 236 return (0); /* no capabilities for now */ 237} 238 239int 240tap_set_cap(struct net_backend *be __unused, uint64_t features, 241 unsigned vnet_hdr_len) 242{ 243 244 return ((features || vnet_hdr_len) ? -1 : 0); 245} 246 247static struct net_backend tap_backend = { 248 .prefix = "tap", 249 .priv_size = sizeof(struct tap_priv), 250 .init = tap_init, 251 .cleanup = tap_cleanup, 252 .send = tap_send, 253 .peek_recvlen = tap_peek_recvlen, 254 .recv = tap_recv, 255 .recv_enable = tap_recv_enable, 256 .recv_disable = tap_recv_disable, 257 .get_cap = tap_get_cap, 258 .set_cap = tap_set_cap, 259}; 260 261/* A clone of the tap backend, with a different prefix. */ 262static struct net_backend vmnet_backend = { 263 .prefix = "vmnet", 264 .priv_size = sizeof(struct tap_priv), 265 .init = tap_init, 266 .cleanup = tap_cleanup, 267 .send = tap_send, 268 .peek_recvlen = tap_peek_recvlen, 269 .recv = tap_recv, 270 .recv_enable = tap_recv_enable, 271 .recv_disable = tap_recv_disable, 272 .get_cap = tap_get_cap, 273 .set_cap = tap_set_cap, 274}; 275 276DATA_SET(net_backend_set, tap_backend); 277DATA_SET(net_backend_set, vmnet_backend); 278 279int 280netbe_legacy_config(nvlist_t *nvl, const char *opts) 281{ 282 char *backend, *cp; 283 284 if (opts == NULL) 285 return (0); 286 287 cp = strchr(opts, ','); 288 if (cp == NULL) { 289 set_config_value_node(nvl, "backend", opts); 290 return (0); 291 } 292 backend = strndup(opts, cp - opts); 293 set_config_value_node(nvl, "backend", backend); 294 free(backend); 295 return (pci_parse_legacy_config(nvl, cp + 1)); 296} 297 298/* 299 * Initialize a backend and attach to the frontend. 300 * This is called during frontend initialization. 301 * @ret is a pointer to the backend to be initialized 302 * @devname is the backend-name as supplied on the command line, 303 * e.g. -s 2:0,frontend-name,backend-name[,other-args] 304 * @cb is the receive callback supplied by the frontend, 305 * and it is invoked in the event loop when a receive 306 * event is generated in the hypervisor, 307 * @param is a pointer to the frontend, and normally used as 308 * the argument for the callback. 309 */ 310int 311netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb, 312 void *param) 313{ 314 struct net_backend **pbe, *nbe, *tbe = NULL; 315 const char *value, *type; 316 char *devname; 317 int err; 318 319 value = get_config_value_node(nvl, "backend"); 320 if (value == NULL) { 321 return (-1); 322 } 323 devname = strdup(value); 324 325 /* 326 * Use the type given by configuration if exists; otherwise 327 * use the prefix of the backend as the type. 328 */ 329 type = get_config_value_node(nvl, "type"); 330 if (type == NULL) 331 type = devname; 332 333 /* 334 * Find the network backend that matches the user-provided 335 * device name. net_backend_set is built using a linker set. 336 */ 337 SET_FOREACH(pbe, net_backend_set) { 338 if (strncmp(type, (*pbe)->prefix, 339 strlen((*pbe)->prefix)) == 0) { 340 tbe = *pbe; 341 assert(tbe->init != NULL); 342 assert(tbe->cleanup != NULL); 343 assert(tbe->send != NULL); 344 assert(tbe->recv != NULL); 345 assert(tbe->get_cap != NULL); 346 assert(tbe->set_cap != NULL); 347 break; 348 } 349 } 350 351 *ret = NULL; 352 if (tbe == NULL) { 353 free(devname); 354 return (EINVAL); 355 } 356 357 nbe = calloc(1, NET_BE_SIZE(tbe)); 358 *nbe = *tbe; /* copy the template */ 359 nbe->fd = -1; 360 nbe->sc = param; 361 nbe->be_vnet_hdr_len = 0; 362 nbe->fe_vnet_hdr_len = 0; 363 364 /* Initialize the backend. */ 365 err = nbe->init(nbe, devname, nvl, cb, param); 366 if (err) { 367 free(devname); 368 free(nbe); 369 return (err); 370 } 371 372 *ret = nbe; 373 free(devname); 374 375 return (0); 376} 377 378void 379netbe_cleanup(struct net_backend *be) 380{ 381 382 if (be != NULL) { 383 be->cleanup(be); 384 free(be); 385 } 386} 387 388uint64_t 389netbe_get_cap(struct net_backend *be) 390{ 391 392 assert(be != NULL); 393 return (be->get_cap(be)); 394} 395 396int 397netbe_set_cap(struct net_backend *be, uint64_t features, 398 unsigned vnet_hdr_len) 399{ 400 int ret; 401 402 assert(be != NULL); 403 404 /* There are only three valid lengths, i.e., 0, 10 and 12. */ 405 if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN 406 && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) 407 return (-1); 408 409 be->fe_vnet_hdr_len = vnet_hdr_len; 410 411 ret = be->set_cap(be, features, vnet_hdr_len); 412 assert(be->be_vnet_hdr_len == 0 || 413 be->be_vnet_hdr_len == be->fe_vnet_hdr_len); 414 415 return (ret); 416} 417 418ssize_t 419netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt) 420{ 421 422 return (be->send(be, iov, iovcnt)); 423} 424 425ssize_t 426netbe_peek_recvlen(struct net_backend *be) 427{ 428 429 return (be->peek_recvlen(be)); 430} 431 432/* 433 * Try to read a packet from the backend, without blocking. 434 * If no packets are available, return 0. In case of success, return 435 * the length of the packet just read. Return -1 in case of errors. 436 */ 437ssize_t 438netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) 439{ 440 441 return (be->recv(be, iov, iovcnt)); 442} 443 444/* 445 * Read a packet from the backend and discard it. 446 * Returns the size of the discarded packet or zero if no packet was available. 447 * A negative error code is returned in case of read error. 448 */ 449ssize_t 450netbe_rx_discard(struct net_backend *be) 451{ 452 /* 453 * MP note: the dummybuf is only used to discard frames, 454 * so there is no need for it to be per-vtnet or locked. 455 * We only make it large enough for TSO-sized segment. 456 */ 457 static uint8_t dummybuf[65536 + 64]; 458 struct iovec iov; 459 460 iov.iov_base = dummybuf; 461 iov.iov_len = sizeof(dummybuf); 462 463 return netbe_recv(be, &iov, 1); 464} 465 466void 467netbe_rx_disable(struct net_backend *be) 468{ 469 470 return be->recv_disable(be); 471} 472 473void 474netbe_rx_enable(struct net_backend *be) 475{ 476 477 return be->recv_enable(be); 478} 479 480size_t 481netbe_get_vnet_hdr_len(struct net_backend *be) 482{ 483 484 return (be->be_vnet_hdr_len); 485} 486