1/* 2 * Copyright (C) 2013-2016 Universita` di Pisa 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 28#if defined(__FreeBSD__) 29#include <sys/cdefs.h> /* prerequisite */ 30__FBSDID("$FreeBSD: stable/11/sys/dev/netmap/netmap_vale.c 344047 2019-02-12 09:26:05Z vmaffione $"); 31 32#include <sys/types.h> 33#include <sys/errno.h> 34#include <sys/param.h> /* defines used in kernel.h */ 35#include <sys/kernel.h> /* types used in module initialization */ 36#include <sys/conf.h> /* cdevsw struct, UID, GID */ 37#include <sys/sockio.h> 38#include <sys/socketvar.h> /* struct socket */ 39#include <sys/malloc.h> 40#include <sys/poll.h> 41#include <sys/rwlock.h> 42#include <sys/socket.h> /* sockaddrs */ 43#include <sys/selinfo.h> 44#include <sys/sysctl.h> 45#include <net/if.h> 46#include <net/if_var.h> 47#include <net/bpf.h> /* BIOCIMMEDIATE */ 48#include <machine/bus.h> /* bus_dmamap_* */ 49#include <sys/endian.h> 50#include <sys/refcount.h> 51#include <sys/smp.h> 52 53 54#elif defined(linux) 55 56#include "bsd_glue.h" 57 58#elif defined(__APPLE__) 59 60#warning OSX support is only partial 61#include "osx_glue.h" 62 63#elif defined(_WIN32) 64#include "win_glue.h" 65 66#else 67 68#error Unsupported platform 69 70#endif /* unsupported */ 71 72/* 73 * common headers 74 */ 75 76#include <net/netmap.h> 77#include <dev/netmap/netmap_kern.h> 78#include <dev/netmap/netmap_mem2.h> 79#include <dev/netmap/netmap_bdg.h> 80 81#ifdef WITH_VALE 82 83/* 84 * system parameters (most of them in netmap_kern.h) 85 * NM_BDG_NAME prefix for switch port names, default "vale" 86 * NM_BDG_MAXPORTS number of ports 87 * NM_BRIDGES max number of switches in the system. 88 * XXX should become a sysctl or tunable 89 * 90 * Switch ports are named valeX:Y where X is the switch name and Y 91 * is the port. If Y matches a physical interface name, the port is 92 * connected to a physical device. 93 * 94 * Unlike physical interfaces, switch ports use their own memory region 95 * for rings and buffers. 96 * The virtual interfaces use per-queue lock instead of core lock. 97 * In the tx loop, we aggregate traffic in batches to make all operations 98 * faster. The batch size is bridge_batch. 99 */ 100#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ 101#define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ 102#define NM_BRIDGE_RINGSIZE 1024 /* in the device */ 103#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ 104/* actual size of the tables */ 105#define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NETMAP_MAX_FRAGS) 106/* NM_FT_NULL terminates a list of slots in the ft */ 107#define NM_FT_NULL NM_BDG_BATCH_MAX 108 109 110/* 111 * bridge_batch is set via sysctl to the max batch size to be 112 * used in the bridge. The actual value may be larger as the 113 * last packet in the block may overflow the size. 114 */ 115static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ 116SYSBEGIN(vars_vale); 117SYSCTL_DECL(_dev_netmap); 118SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, 119 "Max batch size to be used in the bridge"); 120SYSEND; 121 122static int netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *, 123 struct netmap_mem_d *nmd, struct netmap_vp_adapter **); 124static int netmap_vale_vp_bdg_attach(const char *, struct netmap_adapter *, 125 struct nm_bridge *); 126static int netmap_vale_bwrap_attach(const char *, struct netmap_adapter *); 127 128/* 129 * For each output interface, nm_vale_q is used to construct a list. 130 * bq_len is the number of output buffers (we can have coalescing 131 * during the copy). 132 */ 133struct nm_vale_q { 134 uint16_t bq_head; 135 uint16_t bq_tail; 136 uint32_t bq_len; /* number of buffers */ 137}; 138 139/* Holds the default callbacks */ 140struct netmap_bdg_ops vale_bdg_ops = { 141 .lookup = netmap_vale_learning, 142 .config = NULL, 143 .dtor = NULL, 144 .vp_create = netmap_vale_vp_create, 145 .bwrap_attach = netmap_vale_bwrap_attach, 146 .name = NM_BDG_NAME, 147}; 148 149/* 150 * this is a slightly optimized copy routine which rounds 151 * to multiple of 64 bytes and is often faster than dealing 152 * with other odd sizes. We assume there is enough room 153 * in the source and destination buffers. 154 * 155 * XXX only for multiples of 64 bytes, non overlapped. 156 */ 157static inline void 158pkt_copy(void *_src, void *_dst, int l) 159{ 160 uint64_t *src = _src; 161 uint64_t *dst = _dst; 162 if (unlikely(l >= 1024)) { 163 memcpy(dst, src, l); 164 return; 165 } 166 for (; likely(l > 0); l-=64) { 167 *dst++ = *src++; 168 *dst++ = *src++; 169 *dst++ = *src++; 170 *dst++ = *src++; 171 *dst++ = *src++; 172 *dst++ = *src++; 173 *dst++ = *src++; 174 *dst++ = *src++; 175 } 176} 177 178 179/* 180 * Free the forwarding tables for rings attached to switch ports. 181 */ 182static void 183nm_free_bdgfwd(struct netmap_adapter *na) 184{ 185 int nrings, i; 186 struct netmap_kring **kring; 187 188 NMG_LOCK_ASSERT(); 189 nrings = na->num_tx_rings; 190 kring = na->tx_rings; 191 for (i = 0; i < nrings; i++) { 192 if (kring[i]->nkr_ft) { 193 nm_os_free(kring[i]->nkr_ft); 194 kring[i]->nkr_ft = NULL; /* protect from freeing twice */ 195 } 196 } 197} 198 199 200/* 201 * Allocate the forwarding tables for the rings attached to the bridge ports. 202 */ 203static int 204nm_alloc_bdgfwd(struct netmap_adapter *na) 205{ 206 int nrings, l, i, num_dstq; 207 struct netmap_kring **kring; 208 209 NMG_LOCK_ASSERT(); 210 /* all port:rings + broadcast */ 211 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; 212 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; 213 l += sizeof(struct nm_vale_q) * num_dstq; 214 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; 215 216 nrings = netmap_real_rings(na, NR_TX); 217 kring = na->tx_rings; 218 for (i = 0; i < nrings; i++) { 219 struct nm_bdg_fwd *ft; 220 struct nm_vale_q *dstq; 221 int j; 222 223 ft = nm_os_malloc(l); 224 if (!ft) { 225 nm_free_bdgfwd(na); 226 return ENOMEM; 227 } 228 dstq = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX); 229 for (j = 0; j < num_dstq; j++) { 230 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; 231 dstq[j].bq_len = 0; 232 } 233 kring[i]->nkr_ft = ft; 234 } 235 return 0; 236} 237 238/* Allows external modules to create bridges in exclusive mode, 239 * returns an authentication token that the external module will need 240 * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(), 241 * and nm_bdg_update_private_data() operations. 242 * Successfully executed if ret != NULL and *return_status == 0. 243 */ 244void * 245netmap_vale_create(const char *bdg_name, int *return_status) 246{ 247 struct nm_bridge *b = NULL; 248 void *ret = NULL; 249 250 NMG_LOCK(); 251 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL); 252 if (b) { 253 *return_status = EEXIST; 254 goto unlock_bdg_create; 255 } 256 257 b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops); 258 if (!b) { 259 *return_status = ENOMEM; 260 goto unlock_bdg_create; 261 } 262 263 b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE; 264 ret = nm_bdg_get_auth_token(b); 265 *return_status = 0; 266 267unlock_bdg_create: 268 NMG_UNLOCK(); 269 return ret; 270} 271 272/* Allows external modules to destroy a bridge created through 273 * netmap_bdg_create(), the bridge must be empty. 274 */ 275int 276netmap_vale_destroy(const char *bdg_name, void *auth_token) 277{ 278 struct nm_bridge *b = NULL; 279 int ret = 0; 280 281 NMG_LOCK(); 282 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL); 283 if (!b) { 284 ret = ENXIO; 285 goto unlock_bdg_free; 286 } 287 288 if (!nm_bdg_valid_auth_token(b, auth_token)) { 289 ret = EACCES; 290 goto unlock_bdg_free; 291 } 292 if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) { 293 ret = EINVAL; 294 goto unlock_bdg_free; 295 } 296 297 b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE); 298 ret = netmap_bdg_free(b); 299 if (ret) { 300 b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE; 301 } 302 303unlock_bdg_free: 304 NMG_UNLOCK(); 305 return ret; 306} 307 308/* Process NETMAP_REQ_VALE_LIST. */ 309int 310netmap_vale_list(struct nmreq_header *hdr) 311{ 312 struct nmreq_vale_list *req = 313 (struct nmreq_vale_list *)(uintptr_t)hdr->nr_body; 314 int namelen = strlen(hdr->nr_name); 315 struct nm_bridge *b, *bridges; 316 struct netmap_vp_adapter *vpna; 317 int error = 0, i, j; 318 u_int num_bridges; 319 320 netmap_bns_getbridges(&bridges, &num_bridges); 321 322 /* this is used to enumerate bridges and ports */ 323 if (namelen) { /* look up indexes of bridge and port */ 324 if (strncmp(hdr->nr_name, NM_BDG_NAME, 325 strlen(NM_BDG_NAME))) { 326 return EINVAL; 327 } 328 NMG_LOCK(); 329 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL); 330 if (!b) { 331 NMG_UNLOCK(); 332 return ENOENT; 333 } 334 335 req->nr_bridge_idx = b - bridges; /* bridge index */ 336 req->nr_port_idx = NM_BDG_NOPORT; 337 for (j = 0; j < b->bdg_active_ports; j++) { 338 i = b->bdg_port_index[j]; 339 vpna = b->bdg_ports[i]; 340 if (vpna == NULL) { 341 nm_prerr("This should not happen"); 342 continue; 343 } 344 /* the former and the latter identify a 345 * virtual port and a NIC, respectively 346 */ 347 if (!strcmp(vpna->up.name, hdr->nr_name)) { 348 req->nr_port_idx = i; /* port index */ 349 break; 350 } 351 } 352 NMG_UNLOCK(); 353 } else { 354 /* return the first non-empty entry starting from 355 * bridge nr_arg1 and port nr_arg2. 356 * 357 * Users can detect the end of the same bridge by 358 * seeing the new and old value of nr_arg1, and can 359 * detect the end of all the bridge by error != 0 360 */ 361 i = req->nr_bridge_idx; 362 j = req->nr_port_idx; 363 364 NMG_LOCK(); 365 for (error = ENOENT; i < NM_BRIDGES; i++) { 366 b = bridges + i; 367 for ( ; j < NM_BDG_MAXPORTS; j++) { 368 if (b->bdg_ports[j] == NULL) 369 continue; 370 vpna = b->bdg_ports[j]; 371 /* write back the VALE switch name */ 372 strlcpy(hdr->nr_name, vpna->up.name, 373 sizeof(hdr->nr_name)); 374 error = 0; 375 goto out; 376 } 377 j = 0; /* following bridges scan from 0 */ 378 } 379 out: 380 req->nr_bridge_idx = i; 381 req->nr_port_idx = j; 382 NMG_UNLOCK(); 383 } 384 385 return error; 386} 387 388/* Process NETMAP_REQ_VALE_ATTACH. 389 */ 390int 391netmap_vale_attach(struct nmreq_header *hdr, void *auth_token) 392{ 393 struct nmreq_vale_attach *req = 394 (struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body; 395 struct netmap_vp_adapter * vpna; 396 struct netmap_adapter *na = NULL; 397 struct netmap_mem_d *nmd = NULL; 398 struct nm_bridge *b = NULL; 399 int error; 400 401 NMG_LOCK(); 402 /* permission check for modified bridges */ 403 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL); 404 if (b && !nm_bdg_valid_auth_token(b, auth_token)) { 405 error = EACCES; 406 goto unlock_exit; 407 } 408 409 if (req->reg.nr_mem_id) { 410 nmd = netmap_mem_find(req->reg.nr_mem_id); 411 if (nmd == NULL) { 412 error = EINVAL; 413 goto unlock_exit; 414 } 415 } 416 417 /* check for existing one */ 418 error = netmap_get_vale_na(hdr, &na, nmd, 0); 419 if (na) { 420 error = EBUSY; 421 goto unref_exit; 422 } 423 error = netmap_get_vale_na(hdr, &na, 424 nmd, 1 /* create if not exists */); 425 if (error) { /* no device */ 426 goto unlock_exit; 427 } 428 429 if (na == NULL) { /* VALE prefix missing */ 430 error = EINVAL; 431 goto unlock_exit; 432 } 433 434 if (NETMAP_OWNED_BY_ANY(na)) { 435 error = EBUSY; 436 goto unref_exit; 437 } 438 439 if (na->nm_bdg_ctl) { 440 /* nop for VALE ports. The bwrap needs to put the hwna 441 * in netmap mode (see netmap_bwrap_bdg_ctl) 442 */ 443 error = na->nm_bdg_ctl(hdr, na); 444 if (error) 445 goto unref_exit; 446 nm_prdis("registered %s to netmap-mode", na->name); 447 } 448 vpna = (struct netmap_vp_adapter *)na; 449 req->port_index = vpna->bdg_port; 450 451 if (nmd) 452 netmap_mem_put(nmd); 453 454 NMG_UNLOCK(); 455 return 0; 456 457unref_exit: 458 netmap_adapter_put(na); 459unlock_exit: 460 if (nmd) 461 netmap_mem_put(nmd); 462 463 NMG_UNLOCK(); 464 return error; 465} 466 467/* Process NETMAP_REQ_VALE_DETACH. 468 */ 469int 470netmap_vale_detach(struct nmreq_header *hdr, void *auth_token) 471{ 472 struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body; 473 struct netmap_vp_adapter *vpna; 474 struct netmap_adapter *na; 475 struct nm_bridge *b = NULL; 476 int error; 477 478 NMG_LOCK(); 479 /* permission check for modified bridges */ 480 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL); 481 if (b && !nm_bdg_valid_auth_token(b, auth_token)) { 482 error = EACCES; 483 goto unlock_exit; 484 } 485 486 error = netmap_get_vale_na(hdr, &na, NULL, 0 /* don't create */); 487 if (error) { /* no device, or another bridge or user owns the device */ 488 goto unlock_exit; 489 } 490 491 if (na == NULL) { /* VALE prefix missing */ 492 error = EINVAL; 493 goto unlock_exit; 494 } else if (nm_is_bwrap(na) && 495 ((struct netmap_bwrap_adapter *)na)->na_polling_state) { 496 /* Don't detach a NIC with polling */ 497 error = EBUSY; 498 goto unref_exit; 499 } 500 501 vpna = (struct netmap_vp_adapter *)na; 502 if (na->na_vp != vpna) { 503 /* trying to detach first attach of VALE persistent port attached 504 * to 2 bridges 505 */ 506 error = EBUSY; 507 goto unref_exit; 508 } 509 nmreq_det->port_index = vpna->bdg_port; 510 511 if (na->nm_bdg_ctl) { 512 /* remove the port from bridge. The bwrap 513 * also needs to put the hwna in normal mode 514 */ 515 error = na->nm_bdg_ctl(hdr, na); 516 } 517 518unref_exit: 519 netmap_adapter_put(na); 520unlock_exit: 521 NMG_UNLOCK(); 522 return error; 523 524} 525 526 527/* nm_dtor callback for ephemeral VALE ports */ 528static void 529netmap_vale_vp_dtor(struct netmap_adapter *na) 530{ 531 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; 532 struct nm_bridge *b = vpna->na_bdg; 533 534 nm_prdis("%s has %d references", na->name, na->na_refcount); 535 536 if (b) { 537 netmap_bdg_detach_common(b, vpna->bdg_port, -1); 538 } 539 540 if (na->ifp != NULL && !nm_iszombie(na)) { 541 NM_DETACH_NA(na->ifp); 542 if (vpna->autodelete) { 543 nm_prdis("releasing %s", na->ifp->if_xname); 544 NMG_UNLOCK(); 545 nm_os_vi_detach(na->ifp); 546 NMG_LOCK(); 547 } 548 } 549} 550 551 552 553/* nm_krings_create callback for VALE ports. 554 * Calls the standard netmap_krings_create, then adds leases on rx 555 * rings and bdgfwd on tx rings. 556 */ 557static int 558netmap_vale_vp_krings_create(struct netmap_adapter *na) 559{ 560 u_int tailroom; 561 int error, i; 562 uint32_t *leases; 563 u_int nrx = netmap_real_rings(na, NR_RX); 564 565 /* 566 * Leases are attached to RX rings on vale ports 567 */ 568 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; 569 570 error = netmap_krings_create(na, tailroom); 571 if (error) 572 return error; 573 574 leases = na->tailroom; 575 576 for (i = 0; i < nrx; i++) { /* Receive rings */ 577 na->rx_rings[i]->nkr_leases = leases; 578 leases += na->num_rx_desc; 579 } 580 581 error = nm_alloc_bdgfwd(na); 582 if (error) { 583 netmap_krings_delete(na); 584 return error; 585 } 586 587 return 0; 588} 589 590 591/* nm_krings_delete callback for VALE ports. */ 592static void 593netmap_vale_vp_krings_delete(struct netmap_adapter *na) 594{ 595 nm_free_bdgfwd(na); 596 netmap_krings_delete(na); 597} 598 599 600static int 601nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, 602 struct netmap_vp_adapter *na, u_int ring_nr); 603 604 605/* 606 * main dispatch routine for the bridge. 607 * Grab packets from a kring, move them into the ft structure 608 * associated to the tx (input) port. Max one instance per port, 609 * filtered on input (ioctl, poll or XXX). 610 * Returns the next position in the ring. 611 */ 612static int 613nm_vale_preflush(struct netmap_kring *kring, u_int end) 614{ 615 struct netmap_vp_adapter *na = 616 (struct netmap_vp_adapter*)kring->na; 617 struct netmap_ring *ring = kring->ring; 618 struct nm_bdg_fwd *ft; 619 u_int ring_nr = kring->ring_id; 620 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; 621 u_int ft_i = 0; /* start from 0 */ 622 u_int frags = 1; /* how many frags ? */ 623 struct nm_bridge *b = na->na_bdg; 624 625 /* To protect against modifications to the bridge we acquire a 626 * shared lock, waiting if we can sleep (if the source port is 627 * attached to a user process) or with a trylock otherwise (NICs). 628 */ 629 nm_prdis("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); 630 if (na->up.na_flags & NAF_BDG_MAYSLEEP) 631 BDG_RLOCK(b); 632 else if (!BDG_RTRYLOCK(b)) 633 return j; 634 nm_prdis(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); 635 ft = kring->nkr_ft; 636 637 for (; likely(j != end); j = nm_next(j, lim)) { 638 struct netmap_slot *slot = &ring->slot[j]; 639 char *buf; 640 641 ft[ft_i].ft_len = slot->len; 642 ft[ft_i].ft_flags = slot->flags; 643 ft[ft_i].ft_offset = 0; 644 645 nm_prdis("flags is 0x%x", slot->flags); 646 /* we do not use the buf changed flag, but we still need to reset it */ 647 slot->flags &= ~NS_BUF_CHANGED; 648 649 /* this slot goes into a list so initialize the link field */ 650 ft[ft_i].ft_next = NM_FT_NULL; 651 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 652 (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot); 653 if (unlikely(buf == NULL)) { 654 nm_prlim(5, "NULL %s buffer pointer from %s slot %d len %d", 655 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT", 656 kring->name, j, ft[ft_i].ft_len); 657 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up); 658 ft[ft_i].ft_len = 0; 659 ft[ft_i].ft_flags = 0; 660 } 661 __builtin_prefetch(buf); 662 ++ft_i; 663 if (slot->flags & NS_MOREFRAG) { 664 frags++; 665 continue; 666 } 667 if (unlikely(netmap_verbose && frags > 1)) 668 nm_prlim(5, "%d frags at %d", frags, ft_i - frags); 669 ft[ft_i - frags].ft_frags = frags; 670 frags = 1; 671 if (unlikely((int)ft_i >= bridge_batch)) 672 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr); 673 } 674 if (frags > 1) { 675 /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we 676 * have to fix frags count. */ 677 frags--; 678 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG; 679 ft[ft_i - frags].ft_frags = frags; 680 nm_prlim(5, "Truncate incomplete fragment at %d (%d frags)", ft_i, frags); 681 } 682 if (ft_i) 683 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr); 684 BDG_RUNLOCK(b); 685 return j; 686} 687 688 689/* ----- FreeBSD if_bridge hash function ------- */ 690 691/* 692 * The following hash function is adapted from "Hash Functions" by Bob Jenkins 693 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 694 * 695 * http://www.burtleburtle.net/bob/hash/spooky.html 696 */ 697#define mix(a, b, c) \ 698do { \ 699 a -= b; a -= c; a ^= (c >> 13); \ 700 b -= c; b -= a; b ^= (a << 8); \ 701 c -= a; c -= b; c ^= (b >> 13); \ 702 a -= b; a -= c; a ^= (c >> 12); \ 703 b -= c; b -= a; b ^= (a << 16); \ 704 c -= a; c -= b; c ^= (b >> 5); \ 705 a -= b; a -= c; a ^= (c >> 3); \ 706 b -= c; b -= a; b ^= (a << 10); \ 707 c -= a; c -= b; c ^= (b >> 15); \ 708} while (/*CONSTCOND*/0) 709 710 711static __inline uint32_t 712nm_vale_rthash(const uint8_t *addr) 713{ 714 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key 715 716 b += addr[5] << 8; 717 b += addr[4]; 718 a += addr[3] << 24; 719 a += addr[2] << 16; 720 a += addr[1] << 8; 721 a += addr[0]; 722 723 mix(a, b, c); 724#define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 725 return (c & BRIDGE_RTHASH_MASK); 726} 727 728#undef mix 729 730 731/* 732 * Lookup function for a learning bridge. 733 * Update the hash table with the source address, 734 * and then returns the destination port index, and the 735 * ring in *dst_ring (at the moment, always use ring 0) 736 */ 737uint32_t 738netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, 739 struct netmap_vp_adapter *na, void *private_data) 740{ 741 uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset; 742 u_int buf_len = ft->ft_len - ft->ft_offset; 743 struct nm_hash_ent *ht = private_data; 744 uint32_t sh, dh; 745 u_int dst, mysrc = na->bdg_port; 746 uint64_t smac, dmac; 747 uint8_t indbuf[12]; 748 749 if (buf_len < 14) { 750 return NM_BDG_NOPORT; 751 } 752 753 if (ft->ft_flags & NS_INDIRECT) { 754 if (copyin(buf, indbuf, sizeof(indbuf))) { 755 return NM_BDG_NOPORT; 756 } 757 buf = indbuf; 758 } 759 760 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; 761 smac = le64toh(*(uint64_t *)(buf + 4)); 762 smac >>= 16; 763 764 /* 765 * The hash is somewhat expensive, there might be some 766 * worthwhile optimizations here. 767 */ 768 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */ 769 uint8_t *s = buf+6; 770 sh = nm_vale_rthash(s); /* hash of source */ 771 /* update source port forwarding entry */ 772 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */ 773 ht[sh].ports = mysrc; 774 if (netmap_debug & NM_DEBUG_VALE) 775 nm_prinf("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", 776 s[0], s[1], s[2], s[3], s[4], s[5], mysrc); 777 } 778 dst = NM_BDG_BROADCAST; 779 if ((buf[0] & 1) == 0) { /* unicast */ 780 dh = nm_vale_rthash(buf); /* hash of dst */ 781 if (ht[dh].mac == dmac) { /* found dst */ 782 dst = ht[dh].ports; 783 } 784 } 785 return dst; 786} 787 788 789/* 790 * Available space in the ring. Only used in VALE code 791 * and only with is_rx = 1 792 */ 793static inline uint32_t 794nm_kr_space(struct netmap_kring *k, int is_rx) 795{ 796 int space; 797 798 if (is_rx) { 799 int busy = k->nkr_hwlease - k->nr_hwcur; 800 if (busy < 0) 801 busy += k->nkr_num_slots; 802 space = k->nkr_num_slots - 1 - busy; 803 } else { 804 /* XXX never used in this branch */ 805 space = k->nr_hwtail - k->nkr_hwlease; 806 if (space < 0) 807 space += k->nkr_num_slots; 808 } 809#if 0 810 // sanity check 811 if (k->nkr_hwlease >= k->nkr_num_slots || 812 k->nr_hwcur >= k->nkr_num_slots || 813 k->nr_tail >= k->nkr_num_slots || 814 busy < 0 || 815 busy >= k->nkr_num_slots) { 816 nm_prerr("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", 817 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, 818 k->nkr_lease_idx, k->nkr_num_slots); 819 } 820#endif 821 return space; 822} 823 824 825 826 827/* make a lease on the kring for N positions. return the 828 * lease index 829 * XXX only used in VALE code and with is_rx = 1 830 */ 831static inline uint32_t 832nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) 833{ 834 uint32_t lim = k->nkr_num_slots - 1; 835 uint32_t lease_idx = k->nkr_lease_idx; 836 837 k->nkr_leases[lease_idx] = NR_NOSLOT; 838 k->nkr_lease_idx = nm_next(lease_idx, lim); 839 840#ifdef CONFIG_NETMAP_DEBUG 841 if (n > nm_kr_space(k, is_rx)) { 842 nm_prerr("invalid request for %d slots", n); 843 panic("x"); 844 } 845#endif /* CONFIG NETMAP_DEBUG */ 846 /* XXX verify that there are n slots */ 847 k->nkr_hwlease += n; 848 if (k->nkr_hwlease > lim) 849 k->nkr_hwlease -= lim + 1; 850 851#ifdef CONFIG_NETMAP_DEBUG 852 if (k->nkr_hwlease >= k->nkr_num_slots || 853 k->nr_hwcur >= k->nkr_num_slots || 854 k->nr_hwtail >= k->nkr_num_slots || 855 k->nkr_lease_idx >= k->nkr_num_slots) { 856 nm_prerr("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d", 857 k->na->name, 858 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, 859 k->nkr_lease_idx, k->nkr_num_slots); 860 } 861#endif /* CONFIG_NETMAP_DEBUG */ 862 return lease_idx; 863} 864 865/* 866 * 867 * This flush routine supports only unicast and broadcast but a large 868 * number of ports, and lets us replace the learn and dispatch functions. 869 */ 870int 871nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, 872 u_int ring_nr) 873{ 874 struct nm_vale_q *dst_ents, *brddst; 875 uint16_t num_dsts = 0, *dsts; 876 struct nm_bridge *b = na->na_bdg; 877 u_int i, me = na->bdg_port; 878 879 /* 880 * The work area (pointed by ft) is followed by an array of 881 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS 882 * queues per port plus one for the broadcast traffic. 883 * Then we have an array of destination indexes. 884 */ 885 dst_ents = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX); 886 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); 887 888 /* first pass: find a destination for each packet in the batch */ 889 for (i = 0; likely(i < n); i += ft[i].ft_frags) { 890 uint8_t dst_ring = ring_nr; /* default, same ring as origin */ 891 uint16_t dst_port, d_i; 892 struct nm_vale_q *d; 893 struct nm_bdg_fwd *start_ft = NULL; 894 895 nm_prdis("slot %d frags %d", i, ft[i].ft_frags); 896 897 if (na->up.virt_hdr_len < ft[i].ft_len) { 898 ft[i].ft_offset = na->up.virt_hdr_len; 899 start_ft = &ft[i]; 900 } else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) { 901 ft[i].ft_offset = ft[i].ft_len; 902 start_ft = &ft[i+1]; 903 } else { 904 /* Drop the packet if the virtio-net header is not into the first 905 * fragment nor at the very beginning of the second. 906 */ 907 continue; 908 } 909 dst_port = b->bdg_ops.lookup(start_ft, &dst_ring, na, b->private_data); 910 if (netmap_verbose > 255) 911 nm_prlim(5, "slot %d port %d -> %d", i, me, dst_port); 912 if (dst_port >= NM_BDG_NOPORT) 913 continue; /* this packet is identified to be dropped */ 914 else if (dst_port == NM_BDG_BROADCAST) 915 dst_ring = 0; /* broadcasts always go to ring 0 */ 916 else if (unlikely(dst_port == me || 917 !b->bdg_ports[dst_port])) 918 continue; 919 920 /* get a position in the scratch pad */ 921 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; 922 d = dst_ents + d_i; 923 924 /* append the first fragment to the list */ 925 if (d->bq_head == NM_FT_NULL) { /* new destination */ 926 d->bq_head = d->bq_tail = i; 927 /* remember this position to be scanned later */ 928 if (dst_port != NM_BDG_BROADCAST) 929 dsts[num_dsts++] = d_i; 930 } else { 931 ft[d->bq_tail].ft_next = i; 932 d->bq_tail = i; 933 } 934 d->bq_len += ft[i].ft_frags; 935 } 936 937 /* 938 * Broadcast traffic goes to ring 0 on all destinations. 939 * So we need to add these rings to the list of ports to scan. 940 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is 941 * expensive. We should keep a compact list of active destinations 942 * so we could shorten this loop. 943 */ 944 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; 945 if (brddst->bq_head != NM_FT_NULL) { 946 u_int j; 947 for (j = 0; likely(j < b->bdg_active_ports); j++) { 948 uint16_t d_i; 949 i = b->bdg_port_index[j]; 950 if (unlikely(i == me)) 951 continue; 952 d_i = i * NM_BDG_MAXRINGS; 953 if (dst_ents[d_i].bq_head == NM_FT_NULL) 954 dsts[num_dsts++] = d_i; 955 } 956 } 957 958 nm_prdis(5, "pass 1 done %d pkts %d dsts", n, num_dsts); 959 /* second pass: scan destinations */ 960 for (i = 0; i < num_dsts; i++) { 961 struct netmap_vp_adapter *dst_na; 962 struct netmap_kring *kring; 963 struct netmap_ring *ring; 964 u_int dst_nr, lim, j, d_i, next, brd_next; 965 u_int needed, howmany; 966 int retry = netmap_txsync_retry; 967 struct nm_vale_q *d; 968 uint32_t my_start = 0, lease_idx = 0; 969 int nrings; 970 int virt_hdr_mismatch = 0; 971 972 d_i = dsts[i]; 973 nm_prdis("second pass %d port %d", i, d_i); 974 d = dst_ents + d_i; 975 // XXX fix the division 976 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; 977 /* protect from the lookup function returning an inactive 978 * destination port 979 */ 980 if (unlikely(dst_na == NULL)) 981 goto cleanup; 982 if (dst_na->up.na_flags & NAF_SW_ONLY) 983 goto cleanup; 984 /* 985 * The interface may be in !netmap mode in two cases: 986 * - when na is attached but not activated yet; 987 * - when na is being deactivated but is still attached. 988 */ 989 if (unlikely(!nm_netmap_on(&dst_na->up))) { 990 nm_prdis("not in netmap mode!"); 991 goto cleanup; 992 } 993 994 /* there is at least one either unicast or broadcast packet */ 995 brd_next = brddst->bq_head; 996 next = d->bq_head; 997 /* we need to reserve this many slots. If fewer are 998 * available, some packets will be dropped. 999 * Packets may have multiple fragments, so we may not use 1000 * there is a chance that we may not use all of the slots 1001 * we have claimed, so we will need to handle the leftover 1002 * ones when we regain the lock. 1003 */ 1004 needed = d->bq_len + brddst->bq_len; 1005 1006 if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) { 1007 if (netmap_verbose) { 1008 nm_prlim(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len, 1009 dst_na->up.virt_hdr_len); 1010 } 1011 /* There is a virtio-net header/offloadings mismatch between 1012 * source and destination. The slower mismatch datapath will 1013 * be used to cope with all the mismatches. 1014 */ 1015 virt_hdr_mismatch = 1; 1016 if (dst_na->mfs < na->mfs) { 1017 /* We may need to do segmentation offloadings, and so 1018 * we may need a number of destination slots greater 1019 * than the number of input slots ('needed'). 1020 * We look for the smallest integer 'x' which satisfies: 1021 * needed * na->mfs + x * H <= x * na->mfs 1022 * where 'H' is the length of the longest header that may 1023 * be replicated in the segmentation process (e.g. for 1024 * TCPv4 we must account for ethernet header, IP header 1025 * and TCPv4 header). 1026 */ 1027 KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0")); 1028 needed = (needed * na->mfs) / 1029 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1; 1030 nm_prdis(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed); 1031 } 1032 } 1033 1034 nm_prdis(5, "pass 2 dst %d is %x %s", 1035 i, d_i, is_vp ? "virtual" : "nic/host"); 1036 dst_nr = d_i & (NM_BDG_MAXRINGS-1); 1037 nrings = dst_na->up.num_rx_rings; 1038 if (dst_nr >= nrings) 1039 dst_nr = dst_nr % nrings; 1040 kring = dst_na->up.rx_rings[dst_nr]; 1041 ring = kring->ring; 1042 /* the destination ring may have not been opened for RX */ 1043 if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON)) 1044 goto cleanup; 1045 lim = kring->nkr_num_slots - 1; 1046 1047retry: 1048 1049 if (dst_na->retry && retry) { 1050 /* try to get some free slot from the previous run */ 1051 kring->nm_notify(kring, NAF_FORCE_RECLAIM); 1052 /* actually useful only for bwraps, since there 1053 * the notify will trigger a txsync on the hwna. VALE ports 1054 * have dst_na->retry == 0 1055 */ 1056 } 1057 /* reserve the buffers in the queue and an entry 1058 * to report completion, and drop lock. 1059 * XXX this might become a helper function. 1060 */ 1061 mtx_lock(&kring->q_lock); 1062 if (kring->nkr_stopped) { 1063 mtx_unlock(&kring->q_lock); 1064 goto cleanup; 1065 } 1066 my_start = j = kring->nkr_hwlease; 1067 howmany = nm_kr_space(kring, 1); 1068 if (needed < howmany) 1069 howmany = needed; 1070 lease_idx = nm_kr_lease(kring, howmany, 1); 1071 mtx_unlock(&kring->q_lock); 1072 1073 /* only retry if we need more than available slots */ 1074 if (retry && needed <= howmany) 1075 retry = 0; 1076 1077 /* copy to the destination queue */ 1078 while (howmany > 0) { 1079 struct netmap_slot *slot; 1080 struct nm_bdg_fwd *ft_p, *ft_end; 1081 u_int cnt; 1082 1083 /* find the queue from which we pick next packet. 1084 * NM_FT_NULL is always higher than valid indexes 1085 * so we never dereference it if the other list 1086 * has packets (and if both are empty we never 1087 * get here). 1088 */ 1089 if (next < brd_next) { 1090 ft_p = ft + next; 1091 next = ft_p->ft_next; 1092 } else { /* insert broadcast */ 1093 ft_p = ft + brd_next; 1094 brd_next = ft_p->ft_next; 1095 } 1096 cnt = ft_p->ft_frags; // cnt > 0 1097 if (unlikely(cnt > howmany)) 1098 break; /* no more space */ 1099 if (netmap_verbose && cnt > 1) 1100 nm_prlim(5, "rx %d frags to %d", cnt, j); 1101 ft_end = ft_p + cnt; 1102 if (unlikely(virt_hdr_mismatch)) { 1103 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); 1104 } else { 1105 howmany -= cnt; 1106 do { 1107 char *dst, *src = ft_p->ft_buf; 1108 size_t copy_len = ft_p->ft_len, dst_len = copy_len; 1109 1110 slot = &ring->slot[j]; 1111 dst = NMB(&dst_na->up, slot); 1112 1113 nm_prdis("send [%d] %d(%d) bytes at %s:%d", 1114 i, (int)copy_len, (int)dst_len, 1115 NM_IFPNAME(dst_ifp), j); 1116 /* round to a multiple of 64 */ 1117 copy_len = (copy_len + 63) & ~63; 1118 1119 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) || 1120 copy_len > NETMAP_BUF_SIZE(&na->up))) { 1121 nm_prlim(5, "invalid len %d, down to 64", (int)copy_len); 1122 copy_len = dst_len = 64; // XXX 1123 } 1124 if (ft_p->ft_flags & NS_INDIRECT) { 1125 if (copyin(src, dst, copy_len)) { 1126 // invalid user pointer, pretend len is 0 1127 dst_len = 0; 1128 } 1129 } else { 1130 //memcpy(dst, src, copy_len); 1131 pkt_copy(src, dst, (int)copy_len); 1132 } 1133 slot->len = dst_len; 1134 slot->flags = (cnt << 8)| NS_MOREFRAG; 1135 j = nm_next(j, lim); 1136 needed--; 1137 ft_p++; 1138 } while (ft_p != ft_end); 1139 slot->flags = (cnt << 8); /* clear flag on last entry */ 1140 } 1141 /* are we done ? */ 1142 if (next == NM_FT_NULL && brd_next == NM_FT_NULL) 1143 break; 1144 } 1145 { 1146 /* current position */ 1147 uint32_t *p = kring->nkr_leases; /* shorthand */ 1148 uint32_t update_pos; 1149 int still_locked = 1; 1150 1151 mtx_lock(&kring->q_lock); 1152 if (unlikely(howmany > 0)) { 1153 /* not used all bufs. If i am the last one 1154 * i can recover the slots, otherwise must 1155 * fill them with 0 to mark empty packets. 1156 */ 1157 nm_prdis("leftover %d bufs", howmany); 1158 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { 1159 /* yes i am the last one */ 1160 nm_prdis("roll back nkr_hwlease to %d", j); 1161 kring->nkr_hwlease = j; 1162 } else { 1163 while (howmany-- > 0) { 1164 ring->slot[j].len = 0; 1165 ring->slot[j].flags = 0; 1166 j = nm_next(j, lim); 1167 } 1168 } 1169 } 1170 p[lease_idx] = j; /* report I am done */ 1171 1172 update_pos = kring->nr_hwtail; 1173 1174 if (my_start == update_pos) { 1175 /* all slots before my_start have been reported, 1176 * so scan subsequent leases to see if other ranges 1177 * have been completed, and to a selwakeup or txsync. 1178 */ 1179 while (lease_idx != kring->nkr_lease_idx && 1180 p[lease_idx] != NR_NOSLOT) { 1181 j = p[lease_idx]; 1182 p[lease_idx] = NR_NOSLOT; 1183 lease_idx = nm_next(lease_idx, lim); 1184 } 1185 /* j is the new 'write' position. j != my_start 1186 * means there are new buffers to report 1187 */ 1188 if (likely(j != my_start)) { 1189 kring->nr_hwtail = j; 1190 still_locked = 0; 1191 mtx_unlock(&kring->q_lock); 1192 kring->nm_notify(kring, 0); 1193 /* this is netmap_notify for VALE ports and 1194 * netmap_bwrap_notify for bwrap. The latter will 1195 * trigger a txsync on the underlying hwna 1196 */ 1197 if (dst_na->retry && retry--) { 1198 /* XXX this is going to call nm_notify again. 1199 * Only useful for bwrap in virtual machines 1200 */ 1201 goto retry; 1202 } 1203 } 1204 } 1205 if (still_locked) 1206 mtx_unlock(&kring->q_lock); 1207 } 1208cleanup: 1209 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ 1210 d->bq_len = 0; 1211 } 1212 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ 1213 brddst->bq_len = 0; 1214 return 0; 1215} 1216 1217/* nm_txsync callback for VALE ports */ 1218static int 1219netmap_vale_vp_txsync(struct netmap_kring *kring, int flags) 1220{ 1221 struct netmap_vp_adapter *na = 1222 (struct netmap_vp_adapter *)kring->na; 1223 u_int done; 1224 u_int const lim = kring->nkr_num_slots - 1; 1225 u_int const head = kring->rhead; 1226 1227 if (bridge_batch <= 0) { /* testing only */ 1228 done = head; // used all 1229 goto done; 1230 } 1231 if (!na->na_bdg) { 1232 done = head; 1233 goto done; 1234 } 1235 if (bridge_batch > NM_BDG_BATCH) 1236 bridge_batch = NM_BDG_BATCH; 1237 1238 done = nm_vale_preflush(kring, head); 1239done: 1240 if (done != head) 1241 nm_prerr("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail); 1242 /* 1243 * packets between 'done' and 'cur' are left unsent. 1244 */ 1245 kring->nr_hwcur = done; 1246 kring->nr_hwtail = nm_prev(done, lim); 1247 if (netmap_debug & NM_DEBUG_TXSYNC) 1248 nm_prinf("%s ring %d flags %d", na->up.name, kring->ring_id, flags); 1249 return 0; 1250} 1251 1252 1253/* create a netmap_vp_adapter that describes a VALE port. 1254 * Only persistent VALE ports have a non-null ifp. 1255 */ 1256static int 1257netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *ifp, 1258 struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret) 1259{ 1260 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body; 1261 struct netmap_vp_adapter *vpna; 1262 struct netmap_adapter *na; 1263 int error = 0; 1264 u_int npipes = 0; 1265 u_int extrabufs = 0; 1266 1267 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) { 1268 return EINVAL; 1269 } 1270 1271 vpna = nm_os_malloc(sizeof(*vpna)); 1272 if (vpna == NULL) 1273 return ENOMEM; 1274 1275 na = &vpna->up; 1276 1277 na->ifp = ifp; 1278 strlcpy(na->name, hdr->nr_name, sizeof(na->name)); 1279 1280 /* bound checking */ 1281 na->num_tx_rings = req->nr_tx_rings; 1282 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1283 req->nr_tx_rings = na->num_tx_rings; /* write back */ 1284 na->num_rx_rings = req->nr_rx_rings; 1285 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1286 req->nr_rx_rings = na->num_rx_rings; /* write back */ 1287 nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE, 1288 1, NM_BDG_MAXSLOTS, NULL); 1289 na->num_tx_desc = req->nr_tx_slots; 1290 nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1291 1, NM_BDG_MAXSLOTS, NULL); 1292 /* validate number of pipes. We want at least 1, 1293 * but probably can do with some more. 1294 * So let's use 2 as default (when 0 is supplied) 1295 */ 1296 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); 1297 /* validate extra bufs */ 1298 extrabufs = req->nr_extra_bufs; 1299 nm_bound_var(&extrabufs, 0, 0, 1300 128*NM_BDG_MAXSLOTS, NULL); 1301 req->nr_extra_bufs = extrabufs; /* write back */ 1302 na->num_rx_desc = req->nr_rx_slots; 1303 /* Set the mfs to a default value, as it is needed on the VALE 1304 * mismatch datapath. XXX We should set it according to the MTU 1305 * known to the kernel. */ 1306 vpna->mfs = NM_BDG_MFS_DEFAULT; 1307 vpna->last_smac = ~0llu; 1308 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 1309 vpna->mfs = netmap_buf_size; */ 1310 if (netmap_verbose) 1311 nm_prinf("max frame size %u", vpna->mfs); 1312 1313 na->na_flags |= NAF_BDG_MAYSLEEP; 1314 /* persistent VALE ports look like hw devices 1315 * with a native netmap adapter 1316 */ 1317 if (ifp) 1318 na->na_flags |= NAF_NATIVE; 1319 na->nm_txsync = netmap_vale_vp_txsync; 1320 na->nm_rxsync = netmap_vp_rxsync; /* use the one provided by bdg */ 1321 na->nm_register = netmap_vp_reg; /* use the one provided by bdg */ 1322 na->nm_krings_create = netmap_vale_vp_krings_create; 1323 na->nm_krings_delete = netmap_vale_vp_krings_delete; 1324 na->nm_dtor = netmap_vale_vp_dtor; 1325 nm_prdis("nr_mem_id %d", req->nr_mem_id); 1326 na->nm_mem = nmd ? 1327 netmap_mem_get(nmd): 1328 netmap_mem_private_new( 1329 na->num_tx_rings, na->num_tx_desc, 1330 na->num_rx_rings, na->num_rx_desc, 1331 req->nr_extra_bufs, npipes, &error); 1332 if (na->nm_mem == NULL) 1333 goto err; 1334 na->nm_bdg_attach = netmap_vale_vp_bdg_attach; 1335 /* other nmd fields are set in the common routine */ 1336 error = netmap_attach_common(na); 1337 if (error) 1338 goto err; 1339 *ret = vpna; 1340 return 0; 1341 1342err: 1343 if (na->nm_mem != NULL) 1344 netmap_mem_put(na->nm_mem); 1345 nm_os_free(vpna); 1346 return error; 1347} 1348 1349/* nm_bdg_attach callback for VALE ports 1350 * The na_vp port is this same netmap_adapter. There is no host port. 1351 */ 1352static int 1353netmap_vale_vp_bdg_attach(const char *name, struct netmap_adapter *na, 1354 struct nm_bridge *b) 1355{ 1356 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; 1357 1358 if ((b->bdg_flags & NM_BDG_NEED_BWRAP) || vpna->na_bdg) { 1359 return NM_NEED_BWRAP; 1360 } 1361 na->na_vp = vpna; 1362 strlcpy(na->name, name, sizeof(na->name)); 1363 na->na_hostvp = NULL; 1364 return 0; 1365} 1366 1367static int 1368netmap_vale_bwrap_krings_create(struct netmap_adapter *na) 1369{ 1370 int error; 1371 1372 /* impersonate a netmap_vp_adapter */ 1373 error = netmap_vale_vp_krings_create(na); 1374 if (error) 1375 return error; 1376 error = netmap_bwrap_krings_create_common(na); 1377 if (error) { 1378 netmap_vale_vp_krings_delete(na); 1379 } 1380 return error; 1381} 1382 1383static void 1384netmap_vale_bwrap_krings_delete(struct netmap_adapter *na) 1385{ 1386 netmap_bwrap_krings_delete_common(na); 1387 netmap_vale_vp_krings_delete(na); 1388} 1389 1390static int 1391netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) 1392{ 1393 struct netmap_bwrap_adapter *bna; 1394 struct netmap_adapter *na = NULL; 1395 struct netmap_adapter *hostna = NULL; 1396 int error; 1397 1398 bna = nm_os_malloc(sizeof(*bna)); 1399 if (bna == NULL) { 1400 return ENOMEM; 1401 } 1402 na = &bna->up.up; 1403 strlcpy(na->name, nr_name, sizeof(na->name)); 1404 na->nm_register = netmap_bwrap_reg; 1405 na->nm_txsync = netmap_vale_vp_txsync; 1406 // na->nm_rxsync = netmap_bwrap_rxsync; 1407 na->nm_krings_create = netmap_vale_bwrap_krings_create; 1408 na->nm_krings_delete = netmap_vale_bwrap_krings_delete; 1409 na->nm_notify = netmap_bwrap_notify; 1410 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ 1411 /* Set the mfs, needed on the VALE mismatch datapath. */ 1412 bna->up.mfs = NM_BDG_MFS_DEFAULT; 1413 1414 if (hwna->na_flags & NAF_HOST_RINGS) { 1415 hostna = &bna->host.up; 1416 hostna->nm_notify = netmap_bwrap_notify; 1417 bna->host.mfs = NM_BDG_MFS_DEFAULT; 1418 } 1419 1420 error = netmap_bwrap_attach_common(na, hwna); 1421 if (error) { 1422 nm_os_free(bna); 1423 } 1424 return error; 1425} 1426 1427int 1428netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na, 1429 struct netmap_mem_d *nmd, int create) 1430{ 1431 return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops); 1432} 1433 1434 1435/* creates a persistent VALE port */ 1436int 1437nm_vi_create(struct nmreq_header *hdr) 1438{ 1439 struct nmreq_vale_newif *req = 1440 (struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body; 1441 int error = 0; 1442 /* Build a nmreq_register out of the nmreq_vale_newif, 1443 * so that we can call netmap_get_bdg_na(). */ 1444 struct nmreq_register regreq; 1445 bzero(®req, sizeof(regreq)); 1446 regreq.nr_tx_slots = req->nr_tx_slots; 1447 regreq.nr_rx_slots = req->nr_rx_slots; 1448 regreq.nr_tx_rings = req->nr_tx_rings; 1449 regreq.nr_rx_rings = req->nr_rx_rings; 1450 regreq.nr_mem_id = req->nr_mem_id; 1451 hdr->nr_reqtype = NETMAP_REQ_REGISTER; 1452 hdr->nr_body = (uintptr_t)®req; 1453 error = netmap_vi_create(hdr, 0 /* no autodelete */); 1454 hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF; 1455 hdr->nr_body = (uintptr_t)req; 1456 /* Write back to the original struct. */ 1457 req->nr_tx_slots = regreq.nr_tx_slots; 1458 req->nr_rx_slots = regreq.nr_rx_slots; 1459 req->nr_tx_rings = regreq.nr_tx_rings; 1460 req->nr_rx_rings = regreq.nr_rx_rings; 1461 req->nr_mem_id = regreq.nr_mem_id; 1462 return error; 1463} 1464 1465/* remove a persistent VALE port from the system */ 1466int 1467nm_vi_destroy(const char *name) 1468{ 1469 struct ifnet *ifp; 1470 struct netmap_vp_adapter *vpna; 1471 int error; 1472 1473 ifp = ifunit_ref(name); 1474 if (!ifp) 1475 return ENXIO; 1476 NMG_LOCK(); 1477 /* make sure this is actually a VALE port */ 1478 if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { 1479 error = EINVAL; 1480 goto err; 1481 } 1482 1483 vpna = (struct netmap_vp_adapter *)NA(ifp); 1484 1485 /* we can only destroy ports that were created via NETMAP_BDG_NEWIF */ 1486 if (vpna->autodelete) { 1487 error = EINVAL; 1488 goto err; 1489 } 1490 1491 /* also make sure that nobody is using the inferface */ 1492 if (NETMAP_OWNED_BY_ANY(&vpna->up) || 1493 vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) { 1494 error = EBUSY; 1495 goto err; 1496 } 1497 1498 NMG_UNLOCK(); 1499 1500 if (netmap_verbose) 1501 nm_prinf("destroying a persistent vale interface %s", ifp->if_xname); 1502 /* Linux requires all the references are released 1503 * before unregister 1504 */ 1505 netmap_detach(ifp); 1506 if_rele(ifp); 1507 nm_os_vi_detach(ifp); 1508 return 0; 1509 1510err: 1511 NMG_UNLOCK(); 1512 if_rele(ifp); 1513 return error; 1514} 1515 1516static int 1517nm_update_info(struct nmreq_register *req, struct netmap_adapter *na) 1518{ 1519 req->nr_rx_rings = na->num_rx_rings; 1520 req->nr_tx_rings = na->num_tx_rings; 1521 req->nr_rx_slots = na->num_rx_desc; 1522 req->nr_tx_slots = na->num_tx_desc; 1523 return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL, 1524 &req->nr_mem_id); 1525} 1526 1527 1528/* 1529 * Create a virtual interface registered to the system. 1530 * The interface will be attached to a bridge later. 1531 */ 1532int 1533netmap_vi_create(struct nmreq_header *hdr, int autodelete) 1534{ 1535 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body; 1536 struct ifnet *ifp; 1537 struct netmap_vp_adapter *vpna; 1538 struct netmap_mem_d *nmd = NULL; 1539 int error; 1540 1541 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) { 1542 return EINVAL; 1543 } 1544 1545 /* don't include VALE prefix */ 1546 if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME))) 1547 return EINVAL; 1548 if (strlen(hdr->nr_name) >= IFNAMSIZ) { 1549 return EINVAL; 1550 } 1551 ifp = ifunit_ref(hdr->nr_name); 1552 if (ifp) { /* already exist, cannot create new one */ 1553 error = EEXIST; 1554 NMG_LOCK(); 1555 if (NM_NA_VALID(ifp)) { 1556 int update_err = nm_update_info(req, NA(ifp)); 1557 if (update_err) 1558 error = update_err; 1559 } 1560 NMG_UNLOCK(); 1561 if_rele(ifp); 1562 return error; 1563 } 1564 error = nm_os_vi_persist(hdr->nr_name, &ifp); 1565 if (error) 1566 return error; 1567 1568 NMG_LOCK(); 1569 if (req->nr_mem_id) { 1570 nmd = netmap_mem_find(req->nr_mem_id); 1571 if (nmd == NULL) { 1572 error = EINVAL; 1573 goto err_1; 1574 } 1575 } 1576 /* netmap_vp_create creates a struct netmap_vp_adapter */ 1577 error = netmap_vale_vp_create(hdr, ifp, nmd, &vpna); 1578 if (error) { 1579 if (netmap_debug & NM_DEBUG_VALE) 1580 nm_prerr("error %d", error); 1581 goto err_1; 1582 } 1583 /* persist-specific routines */ 1584 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl; 1585 if (!autodelete) { 1586 netmap_adapter_get(&vpna->up); 1587 } else { 1588 vpna->autodelete = 1; 1589 } 1590 NM_ATTACH_NA(ifp, &vpna->up); 1591 /* return the updated info */ 1592 error = nm_update_info(req, &vpna->up); 1593 if (error) { 1594 goto err_2; 1595 } 1596 nm_prdis("returning nr_mem_id %d", req->nr_mem_id); 1597 if (nmd) 1598 netmap_mem_put(nmd); 1599 NMG_UNLOCK(); 1600 nm_prdis("created %s", ifp->if_xname); 1601 return 0; 1602 1603err_2: 1604 netmap_detach(ifp); 1605err_1: 1606 if (nmd) 1607 netmap_mem_put(nmd); 1608 NMG_UNLOCK(); 1609 nm_os_vi_detach(ifp); 1610 1611 return error; 1612} 1613 1614#endif /* WITH_VALE */ 1615