pkt-gen.c revision 342035
1/* 2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 3 * Copyright (C) 2013-2015 Universita` di Pisa. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27/* 28 * $FreeBSD: stable/11/tools/tools/netmap/pkt-gen.c 342035 2018-12-13 10:18:31Z vmaffione $ 29 * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $ 30 * 31 * Example program to show how to build a multithreaded packet 32 * source/sink using the netmap device. 33 * 34 * In this example we create a programmable number of threads 35 * to take care of all the queues of the interface used to 36 * send or receive traffic. 
37 * 38 */ 39 40#define _GNU_SOURCE /* for CPU_SET() */ 41#include <stdio.h> 42#define NETMAP_WITH_LIBS 43#include <net/netmap_user.h> 44 45 46#include <ctype.h> // isprint() 47#include <unistd.h> // sysconf() 48#include <sys/poll.h> 49#include <arpa/inet.h> /* ntohs */ 50#ifndef _WIN32 51#include <sys/sysctl.h> /* sysctl */ 52#endif 53#include <ifaddrs.h> /* getifaddrs */ 54#include <net/ethernet.h> 55#include <netinet/in.h> 56#include <netinet/ip.h> 57#include <netinet/udp.h> 58#include <netinet/ip6.h> 59#ifdef linux 60#define IPV6_VERSION 0x60 61#define IPV6_DEFHLIM 64 62#endif 63#include <assert.h> 64#include <math.h> 65 66#include <pthread.h> 67 68#ifndef NO_PCAP 69#include <pcap/pcap.h> 70#endif 71 72#include "ctrs.h" 73 74static void usage(int); 75 76#ifdef _WIN32 77#define cpuset_t DWORD_PTR //uint64_t 78static inline void CPU_ZERO(cpuset_t *p) 79{ 80 *p = 0; 81} 82 83static inline void CPU_SET(uint32_t i, cpuset_t *p) 84{ 85 *p |= 1<< (i & 0x3f); 86} 87 88#define pthread_setaffinity_np(a, b, c) !SetThreadAffinityMask(a, *c) //((void)a, 0) 89#define TAP_CLONEDEV "/dev/tap" 90#define AF_LINK 18 //defined in winsocks.h 91#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME 92#include <net/if_dl.h> 93 94/* 95 * Convert an ASCII representation of an ethernet address to 96 * binary form. 97 */ 98struct ether_addr * 99ether_aton(const char *a) 100{ 101 int i; 102 static struct ether_addr o; 103 unsigned int o0, o1, o2, o3, o4, o5; 104 105 i = sscanf(a, "%x:%x:%x:%x:%x:%x", &o0, &o1, &o2, &o3, &o4, &o5); 106 107 if (i != 6) 108 return (NULL); 109 110 o.octet[0]=o0; 111 o.octet[1]=o1; 112 o.octet[2]=o2; 113 o.octet[3]=o3; 114 o.octet[4]=o4; 115 o.octet[5]=o5; 116 117 return ((struct ether_addr *)&o); 118} 119 120/* 121 * Convert a binary representation of an ethernet address to 122 * an ASCII string. 
123 */ 124char * 125ether_ntoa(const struct ether_addr *n) 126{ 127 int i; 128 static char a[18]; 129 130 i = sprintf(a, "%02x:%02x:%02x:%02x:%02x:%02x", 131 n->octet[0], n->octet[1], n->octet[2], 132 n->octet[3], n->octet[4], n->octet[5]); 133 return (i < 17 ? NULL : (char *)&a); 134} 135#endif /* _WIN32 */ 136 137#ifdef linux 138 139#define cpuset_t cpu_set_t 140 141#define ifr_flagshigh ifr_flags /* only the low 16 bits here */ 142#define IFF_PPROMISC IFF_PROMISC /* IFF_PPROMISC does not exist */ 143#include <linux/ethtool.h> 144#include <linux/sockios.h> 145 146#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME 147#include <netinet/ether.h> /* ether_aton */ 148#include <linux/if_packet.h> /* sockaddr_ll */ 149#endif /* linux */ 150 151#ifdef __FreeBSD__ 152#include <sys/endian.h> /* le64toh */ 153#include <machine/param.h> 154 155#include <pthread_np.h> /* pthread w/ affinity */ 156#include <sys/cpuset.h> /* cpu_set */ 157#include <net/if_dl.h> /* LLADDR */ 158#endif /* __FreeBSD__ */ 159 160#ifdef __APPLE__ 161 162#define cpuset_t uint64_t // XXX 163static inline void CPU_ZERO(cpuset_t *p) 164{ 165 *p = 0; 166} 167 168static inline void CPU_SET(uint32_t i, cpuset_t *p) 169{ 170 *p |= 1<< (i & 0x3f); 171} 172 173#define pthread_setaffinity_np(a, b, c) ((void)a, 0) 174 175#define ifr_flagshigh ifr_flags // XXX 176#define IFF_PPROMISC IFF_PROMISC 177#include <net/if_dl.h> /* LLADDR */ 178#define clock_gettime(a,b) \ 179 do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) 180#endif /* __APPLE__ */ 181 182const char *default_payload="netmap pkt-gen DIRECT payload\n" 183 "http://info.iet.unipi.it/~luigi/netmap/ "; 184 185const char *indirect_payload="netmap pkt-gen indirect payload\n" 186 "http://info.iet.unipi.it/~luigi/netmap/ "; 187 188int verbose = 0; 189int normalize = 1; 190 191#define VIRT_HDR_1 10 /* length of a base vnet-hdr */ 192#define VIRT_HDR_2 12 /* length of the extenede vnet-hdr */ 193#define VIRT_HDR_MAX VIRT_HDR_2 194struct virt_header { 195 uint8_t 
fields[VIRT_HDR_MAX]; 196}; 197 198#define MAX_BODYSIZE 65536 199 200struct pkt { 201 struct virt_header vh; 202 struct ether_header eh; 203 union { 204 struct { 205 struct ip ip; 206 struct udphdr udp; 207 uint8_t body[MAX_BODYSIZE]; /* hardwired */ 208 } ipv4; 209 struct { 210 struct ip6_hdr ip; 211 struct udphdr udp; 212 uint8_t body[MAX_BODYSIZE]; /* hardwired */ 213 } ipv6; 214 }; 215} __attribute__((__packed__)); 216 217#define PKT(p, f, af) \ 218 ((af) == AF_INET ? (p)->ipv4.f: (p)->ipv6.f) 219 220struct ip_range { 221 char *name; 222 union { 223 struct { 224 uint32_t start, end; /* same as struct in_addr */ 225 } ipv4; 226 struct { 227 struct in6_addr start, end; 228 uint8_t sgroup, egroup; 229 } ipv6; 230 }; 231 uint16_t port0, port1; 232}; 233 234struct mac_range { 235 char *name; 236 struct ether_addr start, end; 237}; 238 239/* ifname can be netmap:foo-xxxx */ 240#define MAX_IFNAMELEN 64 /* our buffer for ifname */ 241#define MAX_PKTSIZE MAX_BODYSIZE /* XXX: + IP_HDR + ETH_HDR */ 242 243/* compact timestamp to fit into 60 byte packet. 
(enough to obtain RTT) */ 244struct tstamp { 245 uint32_t sec; 246 uint32_t nsec; 247}; 248 249/* 250 * global arguments for all threads 251 */ 252 253struct glob_arg { 254 int af; /* address family AF_INET/AF_INET6 */ 255 struct ip_range src_ip; 256 struct ip_range dst_ip; 257 struct mac_range dst_mac; 258 struct mac_range src_mac; 259 int pkt_size; 260 int pkt_min_size; 261 int burst; 262 int forever; 263 uint64_t npackets; /* total packets to send */ 264 int frags; /* fragments per packet */ 265 u_int frag_size; /* size of each fragment */ 266 int nthreads; 267 int cpus; /* cpus used for running */ 268 int system_cpus; /* cpus on the system */ 269 270 int options; /* testing */ 271#define OPT_PREFETCH 1 272#define OPT_ACCESS 2 273#define OPT_COPY 4 274#define OPT_MEMCPY 8 275#define OPT_TS 16 /* add a timestamp */ 276#define OPT_INDIRECT 32 /* use indirect buffers, tx only */ 277#define OPT_DUMP 64 /* dump rx/tx traffic */ 278#define OPT_RUBBISH 256 /* send wathever the buffers contain */ 279#define OPT_RANDOM_SRC 512 280#define OPT_RANDOM_DST 1024 281#define OPT_PPS_STATS 2048 282 int dev_type; 283#ifndef NO_PCAP 284 pcap_t *p; 285#endif 286 287 int tx_rate; 288 struct timespec tx_period; 289 290 int affinity; 291 int main_fd; 292 struct nm_desc *nmd; 293 int report_interval; /* milliseconds between prints */ 294 void *(*td_body)(void *); 295 int td_type; 296 void *mmap_addr; 297 char ifname[MAX_IFNAMELEN]; 298 char *nmr_config; 299 int dummy_send; 300 int virt_header; /* send also the virt_header */ 301 char *packet_file; /* -P option */ 302#define STATS_WIN 15 303 int win_idx; 304 int64_t win[STATS_WIN]; 305 int wait_link; 306 int framing; /* #bits of framing (for bw output) */ 307}; 308enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP }; 309 310enum { 311 TD_TYPE_SENDER = 1, 312 TD_TYPE_RECEIVER, 313 TD_TYPE_OTHER, 314}; 315 316/* 317 * Arguments for a new thread. 
The same structure is used by
 * the source and the sink
 */
struct targ {
	struct glob_arg *g;	/* shared global configuration */
	int used;
	int completed;
	int cancel;		/* set by sigint_h to stop the thread */
	int fd;
	struct nm_desc *nmd;
	/* these ought to be volatile, but they are
	 * only sampled and errors should not accumulate
	 */
	struct my_ctrs ctr;

	struct timespec tic, toc;	/* start/stop times for rate reports */
	int me;				/* thread index */
	pthread_t thread;
	int affinity;

	struct pkt pkt;			/* template packet for this thread */
	void *frame;			/* start of the frame to transmit */
	uint16_t seed[3];		/* per-thread nrand48() state */
	u_int frags;
	u_int frag_size;
};

/*
 * One step of one's-complement checksum folding: add a to sum and
 * fold the carry back in (res < a detects the 16-bit overflow).
 */
static __inline uint16_t
cksum_add(uint16_t sum, uint16_t a)
{
	uint16_t res;

	res = sum + a;
	return (res + (res < a));
}

/*
 * Parse "a.b.c.d[:port]" into host-order *addr and, when a ':' is
 * present, *port. The ':' in name is overwritten with a NUL.
 * NOTE(review): inet_pton() failure is not reported; *addr is left
 * unchanged on bad input.
 */
static void
extract_ipv4_addr(char *name, uint32_t *addr, uint16_t *port)
{
	struct in_addr a;
	char *pp;

	pp = strchr(name, ':');
	if (pp != NULL) {	/* do we have ports ? */
		*pp++ = '\0';
		*port = (uint16_t)strtol(pp, NULL, 0);
	}

	inet_pton(AF_INET, name, &a);
	*addr = ntohl(a.s_addr);	/* host order */
}

/*
 * Parse an IPv6 address with optional group prefix and port,
 * destructively (separators in name are overwritten with NULs).
 */
static void
extract_ipv6_addr(char *name, struct in6_addr *addr, uint16_t *port,
    uint8_t *group)
{
	char *pp;

	/*
	 * We accept IPv6 address in the following form:
	 *  group@[2001:DB8::1001]:port	(w/ brackets and port)
	 *  group@[2001:DB8::1]		(w/ brackets and w/o port)
	 *  group@2001:DB8::1234	(w/o brackets and w/o port)
	 */
	pp = strchr(name, '@');
	if (pp != NULL) {
		*pp++ = '\0';
		*group = (uint8_t)strtol(name, NULL, 0);
		if (*group > 7)		/* clamp: only 8 16-bit groups */
			*group = 7;
		name = pp;
	}
	if (name[0] == '[')
		name++;
	pp = strchr(name, ']');
	if (pp != NULL)
		*pp++ = '\0';
	/* a port is only accepted right after a closing bracket */
	if (pp != NULL && *pp != ':')
		pp = NULL;
	if (pp != NULL) {	/* do we have ports ? */
		*pp++ = '\0';
		*port = (uint16_t)strtol(pp, NULL, 0);
	}
	inet_pton(AF_INET6, name, addr);
}
/*
 * extract the extremes from a range of ipv4 addresses.
404 * addr_lo[-addr_hi][:port_lo[-port_hi]] 405 */ 406static int 407extract_ip_range(struct ip_range *r, int af) 408{ 409 char *name, *ap, start[INET6_ADDRSTRLEN]; 410 char end[INET6_ADDRSTRLEN]; 411 struct in_addr a; 412 uint32_t tmp; 413 414 if (verbose) 415 D("extract IP range from %s", r->name); 416 417 name = strdup(r->name); 418 if (name == NULL) { 419 D("strdup failed"); 420 usage(-1); 421 } 422 /* the first - splits start/end of range */ 423 ap = strchr(name, '-'); 424 if (ap != NULL) 425 *ap++ = '\0'; 426 r->port0 = 1234; /* default port */ 427 if (af == AF_INET6) { 428 r->ipv6.sgroup = 7; /* default group */ 429 extract_ipv6_addr(name, &r->ipv6.start, &r->port0, 430 &r->ipv6.sgroup); 431 } else 432 extract_ipv4_addr(name, &r->ipv4.start, &r->port0); 433 434 r->port1 = r->port0; 435 if (af == AF_INET6) { 436 if (ap != NULL) { 437 r->ipv6.egroup = r->ipv6.sgroup; 438 extract_ipv6_addr(ap, &r->ipv6.end, &r->port1, 439 &r->ipv6.egroup); 440 } else { 441 r->ipv6.end = r->ipv6.start; 442 r->ipv6.egroup = r->ipv6.sgroup; 443 } 444 } else { 445 if (ap != NULL) { 446 extract_ipv4_addr(ap, &r->ipv4.end, &r->port1); 447 if (r->ipv4.start > r->ipv4.end) { 448 tmp = r->ipv4.end; 449 r->ipv4.end = r->ipv4.start; 450 r->ipv4.start = tmp; 451 } 452 } else 453 r->ipv4.end = r->ipv4.start; 454 } 455 456 if (r->port0 > r->port1) { 457 tmp = r->port0; 458 r->port0 = r->port1; 459 r->port1 = tmp; 460 } 461 if (af == AF_INET) { 462 a.s_addr = htonl(r->ipv4.start); 463 inet_ntop(af, &a, start, sizeof(start)); 464 a.s_addr = htonl(r->ipv4.end); 465 inet_ntop(af, &a, end, sizeof(end)); 466 } else { 467 inet_ntop(af, &r->ipv6.start, start, sizeof(start)); 468 inet_ntop(af, &r->ipv6.end, end, sizeof(end)); 469 } 470 if (af == AF_INET) 471 D("range is %s:%d to %s:%d", start, r->port0, end, r->port1); 472 else 473 D("range is %d@[%s]:%d to %d@[%s]:%d", r->ipv6.sgroup, 474 start, r->port0, r->ipv6.egroup, end, r->port1); 475 476 free(name); 477 if (r->port0 != r->port1 || 478 (af == 
AF_INET && r->ipv4.start != r->ipv4.end) || 479 (af == AF_INET6 && 480 !IN6_ARE_ADDR_EQUAL(&r->ipv6.start, &r->ipv6.end))) 481 return (OPT_COPY); 482 return (0); 483} 484 485static int 486extract_mac_range(struct mac_range *r) 487{ 488 struct ether_addr *e; 489 if (verbose) 490 D("extract MAC range from %s", r->name); 491 492 e = ether_aton(r->name); 493 if (e == NULL) { 494 D("invalid MAC address '%s'", r->name); 495 return 1; 496 } 497 bcopy(e, &r->start, 6); 498 bcopy(e, &r->end, 6); 499#if 0 500 bcopy(targ->src_mac, eh->ether_shost, 6); 501 p = index(targ->g->src_mac, '-'); 502 if (p) 503 targ->src_mac_range = atoi(p+1); 504 505 bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6); 506 bcopy(targ->dst_mac, eh->ether_dhost, 6); 507 p = index(targ->g->dst_mac, '-'); 508 if (p) 509 targ->dst_mac_range = atoi(p+1); 510#endif 511 if (verbose) 512 D("%s starts at %s", r->name, ether_ntoa(&r->start)); 513 return 0; 514} 515 516static int 517get_if_mtu(const struct glob_arg *g) 518{ 519 char ifname[IFNAMSIZ]; 520 struct ifreq ifreq; 521 int s, ret; 522 523 if (!strncmp(g->ifname, "netmap:", 7) && !strchr(g->ifname, '{') 524 && !strchr(g->ifname, '}')) { 525 /* Parse the interface name and ask the kernel for the 526 * MTU value. */ 527 strncpy(ifname, g->ifname+7, IFNAMSIZ-1); 528 ifname[strcspn(ifname, "-*^{}/@")] = '\0'; 529 530 s = socket(AF_INET, SOCK_DGRAM, 0); 531 if (s < 0) { 532 D("socket() failed: %s", strerror(errno)); 533 return s; 534 } 535 536 memset(&ifreq, 0, sizeof(ifreq)); 537 strncpy(ifreq.ifr_name, ifname, IFNAMSIZ); 538 539 ret = ioctl(s, SIOCGIFMTU, &ifreq); 540 if (ret) { 541 D("ioctl(SIOCGIFMTU) failed: %s", strerror(errno)); 542 } 543 544 return ifreq.ifr_mtu; 545 } 546 547 /* This is a pipe or a VALE port, where the MTU is very large, 548 * so we use some practical limit. 
*/ 549 return 65536; 550} 551 552static struct targ *targs; 553static int global_nthreads; 554 555/* control-C handler */ 556static void 557sigint_h(int sig) 558{ 559 int i; 560 561 (void)sig; /* UNUSED */ 562 D("received control-C on thread %p", (void *)pthread_self()); 563 for (i = 0; i < global_nthreads; i++) { 564 targs[i].cancel = 1; 565 } 566} 567 568/* sysctl wrapper to return the number of active CPUs */ 569static int 570system_ncpus(void) 571{ 572 int ncpus; 573#if defined (__FreeBSD__) 574 int mib[2] = { CTL_HW, HW_NCPU }; 575 size_t len = sizeof(mib); 576 sysctl(mib, 2, &ncpus, &len, NULL, 0); 577#elif defined(linux) 578 ncpus = sysconf(_SC_NPROCESSORS_ONLN); 579#elif defined(_WIN32) 580 { 581 SYSTEM_INFO sysinfo; 582 GetSystemInfo(&sysinfo); 583 ncpus = sysinfo.dwNumberOfProcessors; 584 } 585#else /* others */ 586 ncpus = 1; 587#endif /* others */ 588 return (ncpus); 589} 590 591#ifdef __linux__ 592#define sockaddr_dl sockaddr_ll 593#define sdl_family sll_family 594#define AF_LINK AF_PACKET 595#define LLADDR(s) s->sll_addr; 596#include <linux/if_tun.h> 597#define TAP_CLONEDEV "/dev/net/tun" 598#endif /* __linux__ */ 599 600#ifdef __FreeBSD__ 601#include <net/if_tun.h> 602#define TAP_CLONEDEV "/dev/tap" 603#endif /* __FreeBSD */ 604 605#ifdef __APPLE__ 606// #warning TAP not supported on apple ? 607#include <net/if_utun.h> 608#define TAP_CLONEDEV "/dev/tap" 609#endif /* __APPLE__ */ 610 611 612/* 613 * parse the vale configuration in conf and put it in nmr. 614 * Return the flag set if necessary. 615 * The configuration may consist of 1 to 4 numbers separated 616 * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings. 617 * Missing numbers or zeroes stand for default values. 618 * As an additional convenience, if exactly one number 619 * is specified, then this is assigned to both #tx-slots and #rx-slots. 620 * If there is no 4th number, then the 3rd is assigned to both #tx-rings 621 * and #rx-rings. 
622 */ 623int 624parse_nmr_config(const char* conf, struct nmreq *nmr) 625{ 626 char *w, *tok; 627 int i, v; 628 629 nmr->nr_tx_rings = nmr->nr_rx_rings = 0; 630 nmr->nr_tx_slots = nmr->nr_rx_slots = 0; 631 if (conf == NULL || ! *conf) 632 return 0; 633 w = strdup(conf); 634 for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) { 635 v = atoi(tok); 636 switch (i) { 637 case 0: 638 nmr->nr_tx_slots = nmr->nr_rx_slots = v; 639 break; 640 case 1: 641 nmr->nr_rx_slots = v; 642 break; 643 case 2: 644 nmr->nr_tx_rings = nmr->nr_rx_rings = v; 645 break; 646 case 3: 647 nmr->nr_rx_rings = v; 648 break; 649 default: 650 D("ignored config: %s", tok); 651 break; 652 } 653 } 654 D("txr %d txd %d rxr %d rxd %d", 655 nmr->nr_tx_rings, nmr->nr_tx_slots, 656 nmr->nr_rx_rings, nmr->nr_rx_slots); 657 free(w); 658 return (nmr->nr_tx_rings || nmr->nr_tx_slots || 659 nmr->nr_rx_rings || nmr->nr_rx_slots) ? 660 NM_OPEN_RING_CFG : 0; 661} 662 663 664/* 665 * locate the src mac address for our interface, put it 666 * into the user-supplied buffer. return 0 if ok, -1 on error. 667 */ 668static int 669source_hwaddr(const char *ifname, char *buf) 670{ 671 struct ifaddrs *ifaphead, *ifap; 672 673 if (getifaddrs(&ifaphead) != 0) { 674 D("getifaddrs %s failed", ifname); 675 return (-1); 676 } 677 678 for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) { 679 struct sockaddr_dl *sdl = 680 (struct sockaddr_dl *)ifap->ifa_addr; 681 uint8_t *mac; 682 683 if (!sdl || sdl->sdl_family != AF_LINK) 684 continue; 685 if (strncmp(ifap->ifa_name, ifname, IFNAMSIZ) != 0) 686 continue; 687 mac = (uint8_t *)LLADDR(sdl); 688 sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x", 689 mac[0], mac[1], mac[2], 690 mac[3], mac[4], mac[5]); 691 if (verbose) 692 D("source hwaddr %s", buf); 693 break; 694 } 695 freeifaddrs(ifaphead); 696 return ifap ? 0 : 1; 697} 698 699 700/* set the thread affinity. 
*/ 701static int 702setaffinity(pthread_t me, int i) 703{ 704 cpuset_t cpumask; 705 706 if (i == -1) 707 return 0; 708 709 /* Set thread affinity affinity.*/ 710 CPU_ZERO(&cpumask); 711 CPU_SET(i, &cpumask); 712 713 if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) { 714 D("Unable to set affinity: %s", strerror(errno)); 715 return 1; 716 } 717 return 0; 718} 719 720 721/* Compute the checksum of the given ip header. */ 722static uint32_t 723checksum(const void *data, uint16_t len, uint32_t sum) 724{ 725 const uint8_t *addr = data; 726 uint32_t i; 727 728 /* Checksum all the pairs of bytes first... */ 729 for (i = 0; i < (len & ~1U); i += 2) { 730 sum += (u_int16_t)ntohs(*((u_int16_t *)(addr + i))); 731 if (sum > 0xFFFF) 732 sum -= 0xFFFF; 733 } 734 /* 735 * If there's a single byte left over, checksum it, too. 736 * Network byte order is big-endian, so the remaining byte is 737 * the high byte. 738 */ 739 if (i < len) { 740 sum += addr[i] << 8; 741 if (sum > 0xFFFF) 742 sum -= 0xFFFF; 743 } 744 return sum; 745} 746 747static uint16_t 748wrapsum(uint32_t sum) 749{ 750 sum = ~sum & 0xFFFF; 751 return (htons(sum)); 752} 753 754/* Check the payload of the packet for errors (use it for debug). 755 * Look for consecutive ascii representations of the size of the packet. 756 */ 757static void 758dump_payload(const char *_p, int len, struct netmap_ring *ring, int cur) 759{ 760 char buf[128]; 761 int i, j, i0; 762 const unsigned char *p = (const unsigned char *)_p; 763 764 /* get the length in ASCII of the length of the packet. 
*/ 765 766 printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n", 767 ring, cur, ring->slot[cur].buf_idx, 768 ring->slot[cur].flags, len); 769 /* hexdump routine */ 770 for (i = 0; i < len; ) { 771 memset(buf, ' ', sizeof(buf)); 772 sprintf(buf, "%5d: ", i); 773 i0 = i; 774 for (j=0; j < 16 && i < len; i++, j++) 775 sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i])); 776 i = i0; 777 for (j=0; j < 16 && i < len; i++, j++) 778 sprintf(buf+7+j + 48, "%c", 779 isprint(p[i]) ? p[i] : '.'); 780 printf("%s\n", buf); 781 } 782} 783 784/* 785 * Fill a packet with some payload. 786 * We create a UDP packet so the payload starts at 787 * 14+20+8 = 42 bytes. 788 */ 789#ifdef __linux__ 790#define uh_sport source 791#define uh_dport dest 792#define uh_ulen len 793#define uh_sum check 794#endif /* linux */ 795 796static void 797update_ip(struct pkt *pkt, struct targ *t) 798{ 799 struct glob_arg *g = t->g; 800 struct ip ip; 801 struct udphdr udp; 802 uint32_t oaddr, naddr; 803 uint16_t oport, nport; 804 uint16_t ip_sum, udp_sum; 805 806 memcpy(&ip, &pkt->ipv4.ip, sizeof(ip)); 807 memcpy(&udp, &pkt->ipv4.udp, sizeof(udp)); 808 do { 809 ip_sum = udp_sum = 0; 810 naddr = oaddr = ntohl(ip.ip_src.s_addr); 811 nport = oport = ntohs(udp.uh_sport); 812 if (g->options & OPT_RANDOM_SRC) { 813 ip.ip_src.s_addr = nrand48(t->seed); 814 udp.uh_sport = nrand48(t->seed); 815 naddr = ntohl(ip.ip_src.s_addr); 816 nport = ntohs(udp.uh_sport); 817 break; 818 } 819 if (oport < g->src_ip.port1) { 820 nport = oport + 1; 821 udp.uh_sport = htons(nport); 822 break; 823 } 824 nport = g->src_ip.port0; 825 udp.uh_sport = htons(nport); 826 if (oaddr < g->src_ip.ipv4.end) { 827 naddr = oaddr + 1; 828 ip.ip_src.s_addr = htonl(naddr); 829 break; 830 } 831 naddr = g->src_ip.ipv4.start; 832 ip.ip_src.s_addr = htonl(naddr); 833 } while (0); 834 /* update checksums if needed */ 835 if (oaddr != naddr) { 836 ip_sum = cksum_add(ip_sum, ~oaddr >> 16); 837 ip_sum = cksum_add(ip_sum, ~oaddr & 0xffff); 838 ip_sum = 
cksum_add(ip_sum, naddr >> 16); 839 ip_sum = cksum_add(ip_sum, naddr & 0xffff); 840 } 841 if (oport != nport) { 842 udp_sum = cksum_add(udp_sum, ~oport); 843 udp_sum = cksum_add(udp_sum, nport); 844 } 845 do { 846 naddr = oaddr = ntohl(ip.ip_dst.s_addr); 847 nport = oport = ntohs(udp.uh_dport); 848 if (g->options & OPT_RANDOM_DST) { 849 ip.ip_dst.s_addr = nrand48(t->seed); 850 udp.uh_dport = nrand48(t->seed); 851 naddr = ntohl(ip.ip_dst.s_addr); 852 nport = ntohs(udp.uh_dport); 853 break; 854 } 855 if (oport < g->dst_ip.port1) { 856 nport = oport + 1; 857 udp.uh_dport = htons(nport); 858 break; 859 } 860 nport = g->dst_ip.port0; 861 udp.uh_dport = htons(nport); 862 if (oaddr < g->dst_ip.ipv4.end) { 863 naddr = oaddr + 1; 864 ip.ip_dst.s_addr = htonl(naddr); 865 break; 866 } 867 naddr = g->dst_ip.ipv4.start; 868 ip.ip_dst.s_addr = htonl(naddr); 869 } while (0); 870 /* update checksums */ 871 if (oaddr != naddr) { 872 ip_sum = cksum_add(ip_sum, ~oaddr >> 16); 873 ip_sum = cksum_add(ip_sum, ~oaddr & 0xffff); 874 ip_sum = cksum_add(ip_sum, naddr >> 16); 875 ip_sum = cksum_add(ip_sum, naddr & 0xffff); 876 } 877 if (oport != nport) { 878 udp_sum = cksum_add(udp_sum, ~oport); 879 udp_sum = cksum_add(udp_sum, nport); 880 } 881 if (udp_sum != 0) 882 udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(udp_sum)); 883 if (ip_sum != 0) { 884 ip.ip_sum = ~cksum_add(~ip.ip_sum, htons(ip_sum)); 885 udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(ip_sum)); 886 } 887 memcpy(&pkt->ipv4.ip, &ip, sizeof(ip)); 888 memcpy(&pkt->ipv4.udp, &udp, sizeof(udp)); 889} 890 891#ifndef s6_addr16 892#define s6_addr16 __u6_addr.__u6_addr16 893#endif 894static void 895update_ip6(struct pkt *pkt, struct targ *t) 896{ 897 struct glob_arg *g = t->g; 898 struct ip6_hdr ip6; 899 struct udphdr udp; 900 uint16_t udp_sum; 901 uint16_t oaddr, naddr; 902 uint16_t oport, nport; 903 uint8_t group; 904 905 memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6)); 906 memcpy(&udp, &pkt->ipv6.udp, sizeof(udp)); 907 do { 908 udp_sum = 0; 909 
group = g->src_ip.ipv6.sgroup;
		naddr = oaddr = ntohs(ip6.ip6_src.s6_addr16[group]);
		nport = oport = ntohs(udp.uh_sport);
		/* randomize the selected 16-bit group and the port */
		if (g->options & OPT_RANDOM_SRC) {
			ip6.ip6_src.s6_addr16[group] = nrand48(t->seed);
			udp.uh_sport = nrand48(t->seed);
			naddr = ntohs(ip6.ip6_src.s6_addr16[group]);
			nport = ntohs(udp.uh_sport);
			break;
		}
		/* ports cycle fastest ... */
		if (oport < g->src_ip.port1) {
			nport = oport + 1;
			udp.uh_sport = htons(nport);
			break;
		}
		nport = g->src_ip.port0;
		udp.uh_sport = htons(nport);
		/* ... then the address group, wrapping to start */
		if (oaddr < ntohs(g->src_ip.ipv6.end.s6_addr16[group])) {
			naddr = oaddr + 1;
			ip6.ip6_src.s6_addr16[group] = htons(naddr);
			break;
		}
		naddr = ntohs(g->src_ip.ipv6.start.s6_addr16[group]);
		ip6.ip6_src.s6_addr16[group] = htons(naddr);
	} while (0);
	/* update checksums if needed (incremental, one's complement) */
	if (oaddr != naddr)
		udp_sum = cksum_add(~oaddr, naddr);
	if (oport != nport)
		udp_sum = cksum_add(udp_sum,
		    cksum_add(~oport, nport));
	do {	/* same logic for the destination side */
		group = g->dst_ip.ipv6.egroup;
		naddr = oaddr = ntohs(ip6.ip6_dst.s6_addr16[group]);
		nport = oport = ntohs(udp.uh_dport);
		if (g->options & OPT_RANDOM_DST) {
			ip6.ip6_dst.s6_addr16[group] = nrand48(t->seed);
			udp.uh_dport = nrand48(t->seed);
			naddr = ntohs(ip6.ip6_dst.s6_addr16[group]);
			nport = ntohs(udp.uh_dport);
			break;
		}
		if (oport < g->dst_ip.port1) {
			nport = oport + 1;
			udp.uh_dport = htons(nport);
			break;
		}
		nport = g->dst_ip.port0;
		udp.uh_dport = htons(nport);
		if (oaddr < ntohs(g->dst_ip.ipv6.end.s6_addr16[group])) {
			naddr = oaddr + 1;
			ip6.ip6_dst.s6_addr16[group] = htons(naddr);
			break;
		}
		naddr = ntohs(g->dst_ip.ipv6.start.s6_addr16[group]);
		ip6.ip6_dst.s6_addr16[group] = htons(naddr);
	} while (0);
	/* update checksums */
	if (oaddr != naddr)
		udp_sum = cksum_add(udp_sum,
		    cksum_add(~oaddr, naddr));
	if (oport != nport)
		udp_sum = cksum_add(udp_sum,
		    cksum_add(~oport,
nport)); 973 if (udp_sum != 0) 974 udp.uh_sum = ~cksum_add(~udp.uh_sum, udp_sum); 975 memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6)); 976 memcpy(&pkt->ipv6.udp, &udp, sizeof(udp)); 977} 978 979static void 980update_addresses(struct pkt *pkt, struct targ *t) 981{ 982 983 if (t->g->af == AF_INET) 984 update_ip(pkt, t); 985 else 986 update_ip6(pkt, t); 987} 988/* 989 * initialize one packet and prepare for the next one. 990 * The copy could be done better instead of repeating it each time. 991 */ 992static void 993initialize_packet(struct targ *targ) 994{ 995 struct pkt *pkt = &targ->pkt; 996 struct ether_header *eh; 997 struct ip6_hdr ip6; 998 struct ip ip; 999 struct udphdr udp; 1000 void *udp_ptr; 1001 uint16_t paylen; 1002 uint32_t csum = 0; 1003 const char *payload = targ->g->options & OPT_INDIRECT ? 1004 indirect_payload : default_payload; 1005 int i, l0 = strlen(payload); 1006 1007#ifndef NO_PCAP 1008 char errbuf[PCAP_ERRBUF_SIZE]; 1009 pcap_t *file; 1010 struct pcap_pkthdr *header; 1011 const unsigned char *packet; 1012 1013 /* Read a packet from a PCAP file if asked. */ 1014 if (targ->g->packet_file != NULL) { 1015 if ((file = pcap_open_offline(targ->g->packet_file, 1016 errbuf)) == NULL) 1017 D("failed to open pcap file %s", 1018 targ->g->packet_file); 1019 if (pcap_next_ex(file, &header, &packet) < 0) 1020 D("failed to read packet from %s", 1021 targ->g->packet_file); 1022 if ((targ->frame = malloc(header->caplen)) == NULL) 1023 D("out of memory"); 1024 bcopy(packet, (unsigned char *)targ->frame, header->caplen); 1025 targ->g->pkt_size = header->caplen; 1026 pcap_close(file); 1027 return; 1028 } 1029#endif 1030 1031 paylen = targ->g->pkt_size - sizeof(*eh) - 1032 (targ->g->af == AF_INET ? 
sizeof(ip): sizeof(ip6)); 1033 1034 /* create a nice NUL-terminated string */ 1035 for (i = 0; i < paylen; i += l0) { 1036 if (l0 > paylen - i) 1037 l0 = paylen - i; // last round 1038 bcopy(payload, PKT(pkt, body, targ->g->af) + i, l0); 1039 } 1040 PKT(pkt, body, targ->g->af)[i - 1] = '\0'; 1041 1042 /* prepare the headers */ 1043 eh = &pkt->eh; 1044 bcopy(&targ->g->src_mac.start, eh->ether_shost, 6); 1045 bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6); 1046 1047 if (targ->g->af == AF_INET) { 1048 eh->ether_type = htons(ETHERTYPE_IP); 1049 memcpy(&ip, &pkt->ipv4.ip, sizeof(ip)); 1050 udp_ptr = &pkt->ipv4.udp; 1051 ip.ip_v = IPVERSION; 1052 ip.ip_hl = sizeof(ip) >> 2; 1053 ip.ip_id = 0; 1054 ip.ip_tos = IPTOS_LOWDELAY; 1055 ip.ip_len = htons(targ->g->pkt_size - sizeof(*eh)); 1056 ip.ip_id = 0; 1057 ip.ip_off = htons(IP_DF); /* Don't fragment */ 1058 ip.ip_ttl = IPDEFTTL; 1059 ip.ip_p = IPPROTO_UDP; 1060 ip.ip_dst.s_addr = htonl(targ->g->dst_ip.ipv4.start); 1061 ip.ip_src.s_addr = htonl(targ->g->src_ip.ipv4.start); 1062 ip.ip_sum = wrapsum(checksum(&ip, sizeof(ip), 0)); 1063 memcpy(&pkt->ipv4.ip, &ip, sizeof(ip)); 1064 } else { 1065 eh->ether_type = htons(ETHERTYPE_IPV6); 1066 memcpy(&ip6, &pkt->ipv4.ip, sizeof(ip6)); 1067 udp_ptr = &pkt->ipv6.udp; 1068 ip6.ip6_flow = 0; 1069 ip6.ip6_plen = htons(paylen); 1070 ip6.ip6_vfc = IPV6_VERSION; 1071 ip6.ip6_nxt = IPPROTO_UDP; 1072 ip6.ip6_hlim = IPV6_DEFHLIM; 1073 ip6.ip6_src = targ->g->src_ip.ipv6.start; 1074 ip6.ip6_dst = targ->g->dst_ip.ipv6.start; 1075 } 1076 memcpy(&udp, udp_ptr, sizeof(udp)); 1077 1078 udp.uh_sport = htons(targ->g->src_ip.port0); 1079 udp.uh_dport = htons(targ->g->dst_ip.port0); 1080 udp.uh_ulen = htons(paylen); 1081 if (targ->g->af == AF_INET) { 1082 /* Magic: taken from sbin/dhclient/packet.c */ 1083 udp.uh_sum = wrapsum( 1084 checksum(&udp, sizeof(udp), /* udp header */ 1085 checksum(pkt->ipv4.body, /* udp payload */ 1086 paylen - sizeof(udp), 1087 checksum(&pkt->ipv4.ip.ip_src, /* pseudo 
header */ 1088 2 * sizeof(pkt->ipv4.ip.ip_src), 1089 IPPROTO_UDP + (u_int32_t)ntohs(udp.uh_ulen))))); 1090 memcpy(&pkt->ipv4.ip, &ip, sizeof(ip)); 1091 } else { 1092 /* Save part of pseudo header checksum into csum */ 1093 csum = IPPROTO_UDP << 24; 1094 csum = checksum(&csum, sizeof(csum), paylen); 1095 udp.uh_sum = wrapsum( 1096 checksum(udp_ptr, sizeof(udp), /* udp header */ 1097 checksum(pkt->ipv6.body, /* udp payload */ 1098 paylen - sizeof(udp), 1099 checksum(&pkt->ipv6.ip.ip6_src, /* pseudo header */ 1100 2 * sizeof(pkt->ipv6.ip.ip6_src), csum)))); 1101 memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6)); 1102 } 1103 memcpy(udp_ptr, &udp, sizeof(udp)); 1104 1105 bzero(&pkt->vh, sizeof(pkt->vh)); 1106 // dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0); 1107} 1108 1109static void 1110get_vnet_hdr_len(struct glob_arg *g) 1111{ 1112 struct nmreq req; 1113 int err; 1114 1115 memset(&req, 0, sizeof(req)); 1116 bcopy(g->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name)); 1117 req.nr_version = NETMAP_API; 1118 req.nr_cmd = NETMAP_VNET_HDR_GET; 1119 err = ioctl(g->main_fd, NIOCREGIF, &req); 1120 if (err) { 1121 D("Unable to get virtio-net header length"); 1122 return; 1123 } 1124 1125 g->virt_header = req.nr_arg1; 1126 if (g->virt_header) { 1127 D("Port requires virtio-net header, length = %d", 1128 g->virt_header); 1129 } 1130} 1131 1132static void 1133set_vnet_hdr_len(struct glob_arg *g) 1134{ 1135 int err, l = g->virt_header; 1136 struct nmreq req; 1137 1138 if (l == 0) 1139 return; 1140 1141 memset(&req, 0, sizeof(req)); 1142 bcopy(g->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name)); 1143 req.nr_version = NETMAP_API; 1144 req.nr_cmd = NETMAP_BDG_VNET_HDR; 1145 req.nr_arg1 = l; 1146 err = ioctl(g->main_fd, NIOCREGIF, &req); 1147 if (err) { 1148 D("Unable to set virtio-net header length %d", l); 1149 } 1150} 1151 1152/* 1153 * create and enqueue a batch of packets on a ring. 
 * On the last one set NS_REPORT to tell the driver to generate
 * an interrupt when done.
 */
static int
send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
		int size, struct targ *t, u_int count, int options)
{
	u_int n, sent, cur = ring->cur;
	u_int frags = t->frags;
	u_int frag_size = t->frag_size;
	struct netmap_slot *slot = &ring->slot[cur];

	n = nm_ring_space(ring);	/* free slots available */
#if 0
	if (options & (OPT_COPY | OPT_PREFETCH) ) {
		for (sent = 0; sent < count; sent++) {
			struct netmap_slot *slot = &ring->slot[cur];
			char *p = NETMAP_BUF(ring, slot->buf_idx);

			__builtin_prefetch(p);
			cur = nm_ring_next(ring, cur);
		}
		cur = ring->cur;
	}
#endif
	/* stop early if there is not room for a whole (multi-frag) packet */
	for (sent = 0; sent < count && n >= frags; sent++, n--) {
		char *p;
		int buf_changed;
		u_int tosend = size;

		slot = &ring->slot[cur];
		p = NETMAP_BUF(ring, slot->buf_idx);
		buf_changed = slot->flags & NS_BUF_CHANGED;

		slot->flags = 0;
		if (options & OPT_RUBBISH) {
			/* do nothing */
		} else if (options & OPT_INDIRECT) {
			/* zero-copy: point the slot at our frame */
			slot->flags |= NS_INDIRECT;
			slot->ptr = (uint64_t)((uintptr_t)frame);
		} else if (frags > 1) {
			/* split the frame over `frags` slots; all but the
			 * last carry frag_size bytes and NS_MOREFRAG */
			u_int i;
			const char *f = frame;
			char *fp = p;

			for (i = 0; i < frags - 1; i++) {
				memcpy(fp, f, frag_size);
				slot->len = frag_size;
				slot->flags = NS_MOREFRAG;
				if (options & OPT_DUMP)
					dump_payload(fp, frag_size, ring, cur);
				tosend -= frag_size;
				f += frag_size;
				cur = nm_ring_next(ring, cur);
				slot = &ring->slot[cur];
				fp = NETMAP_BUF(ring, slot->buf_idx);
			}
			n -= (frags - 1);	/* extra slots consumed */
			p = fp;
			slot->flags = 0;	/* last fragment: no MOREFRAG */
			memcpy(p, f, tosend);
			update_addresses(pkt, t);
		} else if ((options & (OPT_COPY | OPT_MEMCPY)) || buf_changed) {
			if (options & OPT_COPY)
				nm_pkt_copy(frame, p, size);
			else
				memcpy(p, frame, size);
			update_addresses(pkt, t);
		} else if (options & OPT_PREFETCH) {
			__builtin_prefetch(p);
		}
		slot->len = tosend;	/* length of the last fragment */
		if (options & OPT_DUMP)
			dump_payload(p, tosend, ring, cur);
		cur = nm_ring_next(ring, cur);
	}
	if (sent) {
		/* ask for an interrupt on the last slot of the batch */
		slot->flags |= NS_REPORT;
		ring->head = ring->cur = cur;
	}
	if (sent < count) {
		/* tell netmap that we need more slots */
		ring->cur = ring->tail;
	}

	return (sent);
}

/*
 * Index of the highest bit set
 */
uint32_t
msb64(uint64_t x)
{
	uint64_t m = 1ULL << 63;
	int i;

	for (i = 63; i >= 0; i--, m >>=1)
		if (m & x)
			return i;
	return 0;	/* also returned for x == 0 */
}

/*
 * wait until ts, either busy or sleeping if more than 1ms.
 * Return wakeup time.
 */
static struct timespec
wait_time(struct timespec ts)
{
	for (;;) {
		struct timespec w, cur;
		clock_gettime(CLOCK_REALTIME_PRECISE, &cur);
		w = timespec_sub(ts, cur);
		if (w.tv_sec < 0)		/* deadline passed */
			return cur;
		else if (w.tv_sec > 0 || w.tv_nsec > 1000000)
			poll(NULL, 0, 1);	/* sleep ~1ms, else busy-wait */
	}
}

/*
 * Send a packet, and wait for a response.
 * The payload (after UDP header, ofs 42) has a 4-byte sequence
 * followed by a struct timeval (or bintime?)
 */

static void *
ping_body(void *data)
{
	struct targ *targ = (struct targ *) data;
	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
	struct netmap_if *nifp = targ->nmd->nifp;
	int i, m, rx = 0;
	void *frame;
	int size;
	struct timespec ts, now, last_print;
	struct timespec nexttime = {0, 0}; /* silence compiler */
	uint64_t sent = 0, n = targ->g->npackets;
	uint64_t count = 0, t_cur, t_min = ~0, av = 0;
	uint64_t g_min = ~0, g_av = 0;
	uint64_t buckets[64];	/* bins for delays, ns; bin index is msb64(delay) */
	int rate_limit = targ->g->tx_rate, tosend = 0;

	/* the frame starts virt_header bytes before the (optional) vh field */
	frame = (char*)&targ->pkt + sizeof(targ->pkt.vh) - targ->g->virt_header;
	size = targ->g->pkt_size + targ->g->virt_header;


	if (targ->g->nthreads > 1) {
		D("can only ping with 1 thread");
		return NULL;
	}

	bzero(&buckets, sizeof(buckets));
	clock_gettime(CLOCK_REALTIME_PRECISE, &last_print);
	now = last_print;
	if (rate_limit) {
		/* align the start time to a whole second, 2s in the future */
		targ->tic = timespec_add(now, (struct timespec){2,0});
		targ->tic.tv_nsec = 0;
		wait_time(targ->tic);
		nexttime = targ->tic;
	}
	while (!targ->cancel && (n == 0 || sent < n)) {
		struct netmap_ring *ring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
		struct netmap_slot *slot;
		char *p;
		int rv;
		uint64_t limit, event = 0;

		if (rate_limit && tosend <= 0) {
			/* refill the per-period budget and wait for the next slot */
			tosend = targ->g->burst;
			nexttime = timespec_add(nexttime, targ->g->tx_period);
			wait_time(nexttime);
		}

		limit = rate_limit ? tosend : targ->g->burst;
		if (n > 0 && n - sent < limit)
			limit = n - sent;
		/* fill TX slots: sequence number at offset 42, timestamp at 46 */
		for (m = 0; (unsigned)m < limit; m++) {
			slot = &ring->slot[ring->cur];
			slot->len = size;
			p = NETMAP_BUF(ring, slot->buf_idx);

			if (nm_ring_empty(ring)) {
				D("-- ouch, cannot send");
				break;
			} else {
				struct tstamp *tp;
				nm_pkt_copy(frame, p, size);
				clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
				bcopy(&sent, p+42, sizeof(sent));
				tp = (struct tstamp *)(p+46);
				tp->sec = (uint32_t)ts.tv_sec;
				tp->nsec = (uint32_t)ts.tv_nsec;
				sent++;
				ring->head = ring->cur = nm_ring_next(ring, ring->cur);
			}
		}
		if (m > 0)
			event++;
		targ->ctr.pkts = sent;
		targ->ctr.bytes = sent*size;
		targ->ctr.events = event;
		if (rate_limit)
			tosend -= m;
#ifdef BUSYWAIT
		rv = ioctl(pfd.fd, NIOCTXSYNC, NULL);
		if (rv < 0) {
			D("TXSYNC error on queue %d: %s", targ->me,
				strerror(errno));
		}
	again:
		ioctl(pfd.fd, NIOCRXSYNC, NULL);
#else
		/* should use a parameter to decide how often to send */
		if ( (rv = poll(&pfd, 1, 3000)) <= 0) {
			D("poll error on queue %d: %s", targ->me,
				(rv ? strerror(errno) : "timeout"));
			continue;
		}
#endif /* BUSYWAIT */
		/* see what we got back */
		rx = 0;
		for (i = targ->nmd->first_rx_ring;
			i <= targ->nmd->last_rx_ring; i++) {
			ring = NETMAP_RXRING(nifp, i);
			while (!nm_ring_empty(ring)) {
				uint32_t seq;
				struct tstamp *tp;
				int pos;

				slot = &ring->slot[ring->cur];
				p = NETMAP_BUF(ring, slot->buf_idx);

				clock_gettime(CLOCK_REALTIME_PRECISE, &now);
				bcopy(p+42, &seq, sizeof(seq));
				tp = (struct tstamp *)(p+46);
				ts.tv_sec = (time_t)tp->sec;
				ts.tv_nsec = (long)tp->nsec;
				/* compute the RTT: now - echoed timestamp */
				ts.tv_sec = now.tv_sec - ts.tv_sec;
				ts.tv_nsec = now.tv_nsec - ts.tv_nsec;
				if (ts.tv_nsec < 0) {
					ts.tv_nsec += 1000000000;
					ts.tv_sec--;
				}
				if (0) D("seq %d/%llu delta %d.%09d", seq,
					(unsigned long long)sent,
					(int)ts.tv_sec, (int)ts.tv_nsec);
				t_cur = ts.tv_sec * 1000000000UL + ts.tv_nsec;
				if (t_cur < t_min)
					t_min = t_cur;
				count ++;
				av += t_cur;
				pos = msb64(t_cur);
				buckets[pos]++;
				/* now store it in a bucket */
				ring->head = ring->cur = nm_ring_next(ring, ring->cur);
				rx++;
			}
		}
		//D("tx %d rx %d", sent, rx);
		//usleep(100000);
		/* print a report (and the delay histogram) once per second */
		ts.tv_sec = now.tv_sec - last_print.tv_sec;
		ts.tv_nsec = now.tv_nsec - last_print.tv_nsec;
		if (ts.tv_nsec < 0) {
			ts.tv_nsec += 1000000000;
			ts.tv_sec--;
		}
		if (ts.tv_sec >= 1) {
			/*
			 * NOTE(review): if no replies arrived in this interval
			 * count is 0 and (av/count) divides by zero — confirm
			 * this path is unreachable with count == 0.
			 */
			D("count %d RTT: min %d av %d ns",
				(int)count, (int)t_min, (int)(av/count));
			int k, j, kmin, off;
			char buf[512];

			for (kmin = 0; kmin < 64; kmin ++)
				if (buckets[kmin])
					break;
			for (k = 63; k >= kmin; k--)
				if (buckets[k])
					break;
			buf[0] = '\0';
			off = 0;
			for (j = kmin; j <= k; j++) {
				off += sprintf(buf + off, " %5d", (int)buckets[j]);
			}
			/*
			 * NOTE(review): 1<<kmin / 1<<k shift a plain int, which
			 * is undefined for k >= 31 even though bucket indices go
			 * up to 63 — verify delays stay below 2^31 ns in practice.
			 */
			D("k: %d .. %d\n\t%s", 1<<kmin, 1<<k, buf);
			bzero(&buckets, sizeof(buckets));
			count = 0;
			g_av += av;
			av = 0;
			if (t_min < g_min)
				g_min = t_min;
			t_min = ~0;
			last_print = now;
		}
#ifdef BUSYWAIT
		if (rx < m && ts.tv_sec <= 3 && !targ->cancel)
			goto again;
#endif /* BUSYWAIT */
	}

	if (sent > 0) {
		D("RTT over %llu packets: min %d av %d ns",
			(long long unsigned)sent, (int)g_min,
			(int)((double)g_av/sent));
	}
	targ->completed = 1;

	/* reset the ``used`` flag. */
	targ->used = 0;

	return NULL;
}


/*
 * reply to ping requests:
 * echo each received frame back on the first TX ring with the
 * source/destination MAC addresses swapped.
 */
static void *
pong_body(void *data)
{
	struct targ *targ = (struct targ *) data;
	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
	struct netmap_if *nifp = targ->nmd->nifp;
	struct netmap_ring *txring, *rxring;
	int i, rx = 0;
	uint64_t sent = 0, n = targ->g->npackets;

	if (targ->g->nthreads > 1) {
		D("can only reply ping with 1 thread");
		return NULL;
	}
	if (n > 0)
		D("understood ponger %llu but don't know how to do it",
			(unsigned long long)n);
	while (!targ->cancel && (n == 0 || sent < n)) {
		uint32_t txcur, txavail;
//#define BUSYWAIT
#ifdef BUSYWAIT
		ioctl(pfd.fd, NIOCRXSYNC, NULL);
#else
		int rv;
		if ( (rv = poll(&pfd, 1, 1000)) <= 0) {
			D("poll error on queue %d: %s", targ->me,
				rv ? strerror(errno) : "timeout");
			continue;
		}
#endif
		txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
		txcur = txring->cur;
		txavail = nm_ring_space(txring);
		/* see what we got back */
		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
			rxring = NETMAP_RXRING(nifp, i);
			while (!nm_ring_empty(rxring)) {
				uint16_t *spkt, *dpkt;
				uint32_t cur = rxring->cur;
				struct netmap_slot *slot = &rxring->slot[cur];
				char *src, *dst;
				src = NETMAP_BUF(rxring, slot->buf_idx);
				//D("got pkt %p of size %d", src, slot->len);
				rxring->head = rxring->cur = nm_ring_next(rxring, cur);
				rx++;
				/*
				 * NOTE(review): when the TX ring is full the
				 * received packet is consumed and dropped
				 * silently (no reply, no counter).
				 */
				if (txavail == 0)
					continue;
				dst = NETMAP_BUF(txring,
				    txring->slot[txcur].buf_idx);
				/* copy... */
				dpkt = (uint16_t *)dst;
				spkt = (uint16_t *)src;
				nm_pkt_copy(src, dst, slot->len);
				/* swap source and destination MAC (3 16-bit words each) */
				dpkt[0] = spkt[3];
				dpkt[1] = spkt[4];
				dpkt[2] = spkt[5];
				dpkt[3] = spkt[0];
				dpkt[4] = spkt[1];
				dpkt[5] = spkt[2];
				txring->slot[txcur].len = slot->len;
				txcur = nm_ring_next(txring, txcur);
				txavail--;
				sent++;
			}
		}
		/* publish all the replies queued in this round */
		txring->head = txring->cur = txcur;
		targ->ctr.pkts = sent;
#ifdef BUSYWAIT
		ioctl(pfd.fd, NIOCTXSYNC, NULL);
#endif
		//D("tx %d rx %d", sent, rx);
	}

	targ->completed = 1;

	/* reset the ``used`` flag.
 */
	targ->used = 0;

	return NULL;
}


/*
 * Per-thread transmitter: sends `npackets / nthreads` frames (0 = infinite)
 * on a TAP fd, via pcap, or on the thread's netmap TX rings, with optional
 * rate limiting.
 */
static void *
sender_body(void *data)
{
	struct targ *targ = (struct targ *) data;
	struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
	struct netmap_if *nifp;
	struct netmap_ring *txring = NULL;
	int i;
	uint64_t n = targ->g->npackets / targ->g->nthreads;
	uint64_t sent = 0;
	uint64_t event = 0;
	int options = targ->g->options | OPT_COPY;
	struct timespec nexttime = { 0, 0}; // XXX silence compiler
	int rate_limit = targ->g->tx_rate;
	struct pkt *pkt = &targ->pkt;
	void *frame;
	int size;

	if (targ->frame == NULL) {
		/* use the locally built packet, adjusted for the virtio header */
		frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header;
		size = targ->g->pkt_size + targ->g->virt_header;
	} else {
		/* frame loaded externally (e.g. from a pcap file) */
		frame = targ->frame;
		size = targ->g->pkt_size;
	}

	D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
	if (setaffinity(targ->thread, targ->affinity))
		goto quit;

	/* main loop.*/
	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
	if (rate_limit) {
		/* align the start time to a whole second, 2s in the future */
		targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
		targ->tic.tv_nsec = 0;
		wait_time(targ->tic);
		nexttime = targ->tic;
	}
	if (targ->g->dev_type == DEV_TAP) {
		D("writing to file desc %d", targ->g->main_fd);

		for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
			if (write(targ->g->main_fd, frame, size) != -1)
				sent++;
			update_addresses(pkt, targ);
			/* refresh the shared counters every ~10000 iterations */
			if (i > 10000) {
				targ->ctr.pkts = sent;
				targ->ctr.bytes = sent*size;
				targ->ctr.events = sent;
				i = 0;
			}
		}
#ifndef NO_PCAP
	} else if (targ->g->dev_type == DEV_PCAP) {
		pcap_t *p = targ->g->p;

		for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
			if (pcap_inject(p, frame, size) != -1)
				sent++;
			update_addresses(pkt, targ);
			if (i > 10000) {
				targ->ctr.pkts = sent;
				targ->ctr.bytes = sent*size;
				targ->ctr.events = sent;
				i = 0;
			}
		}
#endif /* NO_PCAP */
	} else {
		/* DEV_NETMAP */
		int tosend = 0;
		u_int bufsz, frag_size = targ->g->frag_size;

		nifp = targ->nmd->nifp;
		txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
		/* clamp the fragment size to the netmap buffer size and
		 * recompute frags/frag_size so the whole packet fits */
		bufsz = txring->nr_buf_size;
		if (bufsz < frag_size)
			frag_size = bufsz;
		targ->frag_size = targ->g->pkt_size / targ->frags;
		if (targ->frag_size > frag_size) {
			targ->frags = targ->g->pkt_size / frag_size;
			targ->frag_size = frag_size;
			if (targ->g->pkt_size % frag_size != 0)
				targ->frags++;
		}
		D("frags %u frag_size %u", targ->frags, targ->frag_size);
		while (!targ->cancel && (n == 0 || sent < n)) {
			int rv;

			if (rate_limit && tosend <= 0) {
				tosend = targ->g->burst;
				nexttime = timespec_add(nexttime, targ->g->tx_period);
				wait_time(nexttime);
			}

			/*
			 * wait for available room in the send queue(s)
			 */
#ifdef BUSYWAIT
			(void)rv;
			if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) {
				D("ioctl error on queue %d: %s", targ->me,
						strerror(errno));
				goto quit;
			}
#else /* !BUSYWAIT */
			if ( (rv = poll(&pfd, 1, 2000)) <= 0) {
				if (targ->cancel)
					break;
				D("poll error on queue %d: %s", targ->me,
					rv ? strerror(errno) : "timeout");
				// goto quit;
			}
			if (pfd.revents & POLLERR) {
				D("poll error on %d ring %d-%d", pfd.fd,
					targ->nmd->first_tx_ring, targ->nmd->last_tx_ring);
				goto quit;
			}
#endif /* !BUSYWAIT */
			/*
			 * scan our queues and send on those with room
			 */
			/* after the first 100k packets switch to zero-copy
			 * unless the user explicitly asked for copies */
			if (options & OPT_COPY && sent > 100000 && !(targ->g->options & OPT_COPY) ) {
				D("drop copy");
				options &= ~OPT_COPY;
			}
			for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
				int m;
				uint64_t limit = rate_limit ? tosend : targ->g->burst;

				if (n > 0 && n == sent)
					break;

				if (n > 0 && n - sent < limit)
					limit = n - sent;
				txring = NETMAP_TXRING(nifp, i);
				if (nm_ring_empty(txring))
					continue;

				if (targ->g->pkt_min_size > 0) {
					/* random size in [pkt_min_size, pkt_size) */
					size = nrand48(targ->seed) %
						(targ->g->pkt_size - targ->g->pkt_min_size) +
						targ->g->pkt_min_size;
				}
				m = send_packets(txring, pkt, frame, size, targ,
						 limit, options);
				ND("limit %lu tail %d m %d",
					limit, txring->tail, m);
				sent += m;
				if (m > 0) //XXX-ste: can m be 0?
					event++;
				targ->ctr.pkts = sent;
				targ->ctr.bytes += m*size;
				targ->ctr.events = event;
				if (rate_limit) {
					tosend -= m;
					if (tosend <= 0)
						break;
				}
			}
		}
		/* flush any remaining packets */
		if (txring != NULL) {
			D("flush tail %d head %d on thread %p",
				txring->tail, txring->head,
				(void *)pthread_self());
			ioctl(pfd.fd, NIOCTXSYNC, NULL);
		}

		/* final part: wait all the TX queues to be empty. */
		for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
			txring = NETMAP_TXRING(nifp, i);
			while (!targ->cancel && nm_tx_pending(txring)) {
				RD(5, "pending tx tail %d head %d on ring %d",
					txring->tail, txring->head, i);
				ioctl(pfd.fd, NIOCTXSYNC, NULL);
				usleep(1); /* wait 1 tick */
			}
		}
	} /* end DEV_NETMAP */

	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
	targ->completed = 1;
	targ->ctr.pkts = sent;
	targ->ctr.bytes = sent*size;
	targ->ctr.events = event;
quit:
	/* reset the ``used`` flag.
 */
	targ->used = 0;

	return (NULL);
}


#ifndef NO_PCAP
/* pcap_dispatch() callback: accumulate packet/byte counters in `user`. */
static void
receive_pcap(u_char *user, const struct pcap_pkthdr * h,
	const u_char * bytes)
{
	struct my_ctrs *ctr = (struct my_ctrs *)user;
	(void)bytes;	/* UNUSED */
	ctr->bytes += h->len;
	ctr->pkts++;
}
#endif /* !NO_PCAP */


/*
 * Drain up to `limit` slots from `ring`, adding slot lengths to *bytes
 * (or to a local dummy if bytes is NULL). Returns the number of complete
 * packets, i.e. slots without NS_MOREFRAG set.
 */
static int
receive_packets(struct netmap_ring *ring, u_int limit, int dump, uint64_t *bytes)
{
	u_int cur, rx, n;
	uint64_t b = 0;
	u_int complete = 0;

	if (bytes == NULL)
		bytes = &b;

	cur = ring->cur;
	n = nm_ring_space(ring);
	if (n < limit)
		limit = n;
	for (rx = 0; rx < limit; rx++) {
		struct netmap_slot *slot = &ring->slot[cur];
		char *p = NETMAP_BUF(ring, slot->buf_idx);

		*bytes += slot->len;
		if (dump)
			dump_payload(p, slot->len, ring, cur);
		if (!(slot->flags & NS_MOREFRAG))
			complete++;

		cur = nm_ring_next(ring, cur);
	}
	ring->head = ring->cur = cur;

	return (complete);
}

/*
 * Per-thread receiver: counts packets/bytes from a TAP fd, via pcap,
 * or from the thread's netmap RX rings. Exits after 1 second of
 * silence unless -w/forever is set.
 */
static void *
receiver_body(void *data)
{
	struct targ *targ = (struct targ *) data;
	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
	struct netmap_if *nifp;
	struct netmap_ring *rxring;
	int i;
	struct my_ctrs cur;

	memset(&cur, 0, sizeof(cur));

	if (setaffinity(targ->thread, targ->affinity))
		goto quit;

	D("reading from %s fd %d main_fd %d",
		targ->g->ifname, targ->fd, targ->g->main_fd);
	/* unbounded wait for the first packet. */
	for (;!targ->cancel;) {
		i = poll(&pfd, 1, 1000);
		if (i > 0 && !(pfd.revents & POLLERR))
			break;
		if (i < 0) {
			D("poll() error: %s", strerror(errno));
			goto quit;
		}
		if (pfd.revents & POLLERR) {
			D("fd error");
			goto quit;
		}
		RD(1, "waiting for initial packets, poll returns %d %d",
			i, pfd.revents);
	}
	/* main loop, exit after 1s silence */
	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
	if (targ->g->dev_type == DEV_TAP) {
		while (!targ->cancel) {
			char buf[MAX_BODYSIZE];
			/* XXX should we poll ? */
			i = read(targ->g->main_fd, buf, sizeof(buf));
			if (i > 0) {
				targ->ctr.pkts++;
				targ->ctr.bytes += i;
				targ->ctr.events++;
			}
		}
#ifndef NO_PCAP
	} else if (targ->g->dev_type == DEV_PCAP) {
		while (!targ->cancel) {
			/* XXX should we poll ? */
			pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap,
				(u_char *)&targ->ctr);
			targ->ctr.events++;
		}
#endif /* !NO_PCAP */
	} else {
		int dump = targ->g->options & OPT_DUMP;

		nifp = targ->nmd->nifp;
		while (!targ->cancel) {
			/* Once we started to receive packets, wait at most 1 seconds
			   before quitting. */
#ifdef BUSYWAIT
			if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) {
				D("ioctl error on queue %d: %s", targ->me,
						strerror(errno));
				goto quit;
			}
#else /* !BUSYWAIT */
			if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
				clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
				targ->toc.tv_sec -= 1; /* Subtract timeout time. */
				goto out;
			}

			if (pfd.revents & POLLERR) {
				D("poll err");
				goto quit;
			}
#endif /* !BUSYWAIT */
			uint64_t cur_space = 0;
			for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
				int m;

				rxring = NETMAP_RXRING(nifp, i);
				/* compute free space in the ring */
				m = rxring->head + rxring->num_slots - rxring->tail;
				if (m >= (int) rxring->num_slots)
					m -= rxring->num_slots;
				cur_space += m;
				if (nm_ring_empty(rxring))
					continue;

				m = receive_packets(rxring, targ->g->burst, dump, &cur.bytes);
				cur.pkts += m;
				if (m > 0)
					cur.events++;
			}
			/* track the minimum free space seen across iterations */
			cur.min_space = targ->ctr.min_space;
			if (cur_space < cur.min_space)
				cur.min_space = cur_space;
			targ->ctr = cur;
		}
	}

	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);

#if !defined(BUSYWAIT)
out:
#endif
	targ->completed = 1;
	targ->ctr = cur;

quit:
	/* reset the ``used`` flag.
 */
	targ->used = 0;

	return (NULL);
}

/*
 * Transmit packets carrying a 32-bit sequence number in the first four
 * payload bytes, patching the UDP checksum incrementally. Single thread,
 * first TX queue only; runs until cancelled (-n is ignored).
 */
static void *
txseq_body(void *data)
{
	struct targ *targ = (struct targ *) data;
	struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
	struct netmap_ring *ring;
	int64_t sent = 0;
	uint64_t event = 0;
	int options = targ->g->options | OPT_COPY;
	struct timespec nexttime = {0, 0};
	int rate_limit = targ->g->tx_rate;
	struct pkt *pkt = &targ->pkt;
	int frags = targ->g->frags;
	uint32_t sequence = 0;
	int budget = 0;
	void *frame;
	int size;

	if (targ->g->nthreads > 1) {
		D("can only txseq ping with 1 thread");
		return NULL;
	}

	if (targ->g->npackets > 0) {
		D("Ignoring -n argument");
	}

	frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header;
	size = targ->g->pkt_size + targ->g->virt_header;

	D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
	if (setaffinity(targ->thread, targ->affinity))
		goto quit;

	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
	if (rate_limit) {
		/* align the start time to a whole second, 2s in the future */
		targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
		targ->tic.tv_nsec = 0;
		wait_time(targ->tic);
		nexttime = targ->tic;
	}

	/* Only use the first queue. */
	ring = NETMAP_TXRING(targ->nmd->nifp, targ->nmd->first_tx_ring);

	while (!targ->cancel) {
		int64_t limit;
		unsigned int space;
		unsigned int head;
		int fcnt;
		uint16_t sum = 0;
		int rv;

		if (!rate_limit) {
			budget = targ->g->burst;

		} else if (budget <= 0) {
			budget = targ->g->burst;
			nexttime = timespec_add(nexttime, targ->g->tx_period);
			wait_time(nexttime);
		}

		/* wait for available room in the send queue */
#ifdef BUSYWAIT
		(void)rv;
		if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) {
			D("ioctl error on queue %d: %s", targ->me,
					strerror(errno));
			goto quit;
		}
#else /* !BUSYWAIT */
		if ( (rv = poll(&pfd, 1, 2000)) <= 0) {
			if (targ->cancel)
				break;
			D("poll error on queue %d: %s", targ->me,
				rv ? strerror(errno) : "timeout");
			// goto quit;
		}
		if (pfd.revents & POLLERR) {
			D("poll error on %d ring %d-%d", pfd.fd,
				targ->nmd->first_tx_ring, targ->nmd->last_tx_ring);
			goto quit;
		}
#endif /* !BUSYWAIT */

		/* If no room poll() again. */
		space = nm_ring_space(ring);
		if (!space) {
			continue;
		}

		limit = budget;

		if (space < limit) {
			limit = space;
		}

		/* Cut off ``limit`` to make sure is multiple of ``frags``. */
		if (frags > 1) {
			limit = (limit / frags) * frags;
		}

		limit = sent + limit; /* Convert to absolute. */

		for (fcnt = frags, head = ring->head;
				sent < limit; sent++, sequence++) {
			struct netmap_slot *slot = &ring->slot[head];
			char *p = NETMAP_BUF(ring, slot->buf_idx);
			uint16_t *w = (uint16_t *)PKT(pkt, body, targ->g->af), t;

			/* current UDP checksum of the template packet */
			memcpy(&sum, targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, sizeof(sum));

			slot->flags = 0;
			/*
			 * Write the 32-bit sequence into the first 4 payload
			 * bytes (big-endian) and patch the checksum
			 * incrementally, one 16-bit word at a time:
			 * new = ~(~old + ~old_word + new_word).
			 */
			t = *w;
			PKT(pkt, body, targ->g->af)[0] = sequence >> 24;
			PKT(pkt, body, targ->g->af)[1] = (sequence >> 16) & 0xff;
			sum = ~cksum_add(~sum, cksum_add(~t, *w));
			t = *++w;
			PKT(pkt, body, targ->g->af)[2] = (sequence >> 8) & 0xff;
			PKT(pkt, body, targ->g->af)[3] = sequence & 0xff;
			sum = ~cksum_add(~sum, cksum_add(~t, *w));
			memcpy(targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, &sum, sizeof(sum));
			nm_pkt_copy(frame, p, size);
			/* update addresses only at the start of each packet */
			if (fcnt == frags) {
				update_addresses(pkt, targ);
			}

			if (options & OPT_DUMP) {
				dump_payload(p, size, ring, head);
			}

			slot->len = size;

			if (--fcnt > 0) {
				slot->flags |= NS_MOREFRAG;
			} else {
				fcnt = frags;
			}

			if (sent == limit - 1) {
				/* Make sure we don't push an incomplete
				 * packet. */
				assert(!(slot->flags & NS_MOREFRAG));
				slot->flags |= NS_REPORT;
			}

			head = nm_ring_next(ring, head);
			if (rate_limit) {
				budget--;
			}
		}

		ring->cur = ring->head = head;

		event ++;
		targ->ctr.pkts = sent;
		targ->ctr.bytes = sent * size;
		targ->ctr.events = event;
	}

	/* flush any remaining packets */
	D("flush tail %d head %d on thread %p",
		ring->tail, ring->head,
		(void *)pthread_self());
	ioctl(pfd.fd, NIOCTXSYNC, NULL);

	/* final part: wait the TX queues to become empty.
 */
	while (!targ->cancel && nm_tx_pending(ring)) {
		RD(5, "pending tx tail %d head %d on ring %d",
			ring->tail, ring->head, targ->nmd->first_tx_ring);
		ioctl(pfd.fd, NIOCTXSYNC, NULL);
		usleep(1); /* wait 1 tick */
	}

	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
	targ->completed = 1;
	targ->ctr.pkts = sent;
	targ->ctr.bytes = sent * size;
	targ->ctr.events = event;
quit:
	/* reset the ``used`` flag. */
	targ->used = 0;

	return (NULL);
}


/*
 * Format `nfrags` consecutive slots starting at `head` as "|len,flags|..."
 * into strbuf (truncating when it fills up). Returns strbuf.
 */
static char *
multi_slot_to_string(struct netmap_ring *ring, unsigned int head,
		unsigned int nfrags, char *strbuf, size_t strbuflen)
{
	unsigned int f;
	char *ret = strbuf;

	for (f = 0; f < nfrags; f++) {
		struct netmap_slot *slot = &ring->slot[head];
		int m = snprintf(strbuf, strbuflen, "|%u,%x|", slot->len,
				slot->flags);
		if (m >= (int)strbuflen) {
			break;
		}
		strbuf += m;
		strbuflen -= m;

		head = nm_ring_next(ring, head);
	}

	return ret;
}

/*
 * Receive packets produced by txseq_body and check the per-ring
 * sequence numbers, reporting gaps, reordering and unexpected
 * fragment counts.
 */
static void *
rxseq_body(void *data)
{
	struct targ *targ = (struct targ *) data;
	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
	int dump = targ->g->options & OPT_DUMP;
	struct netmap_ring *ring;
	unsigned int frags_exp = 1;	/* expected frags per packet, learned */
	struct my_ctrs cur;
	unsigned int frags = 0;		/* frags seen in the current packet */
	int first_packet = 1;
	int first_slot = 1;
	int i, j, af, nrings;
	uint32_t seq, *seq_exp = NULL;	/* next expected sequence, per ring */

	memset(&cur, 0, sizeof(cur));

	if (setaffinity(targ->thread, targ->affinity))
		goto quit;

	nrings = targ->nmd->last_rx_ring - targ->nmd->first_rx_ring + 1;
	seq_exp = calloc(nrings, sizeof(uint32_t));
	if (seq_exp == NULL) {
		D("failed to allocate seq array");
		goto quit;
	}

	D("reading from %s fd %d main_fd %d",
		targ->g->ifname, targ->fd, targ->g->main_fd);
	/* unbounded wait for the first packet. */
	for (;!targ->cancel;) {
		i = poll(&pfd, 1, 1000);
		if (i > 0 && !(pfd.revents & POLLERR))
			break;
		RD(1, "waiting for initial packets, poll returns %d %d",
			i, pfd.revents);
	}

	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);


	while (!targ->cancel) {
		unsigned int head;
		int limit;

#ifdef BUSYWAIT
		if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) {
			D("ioctl error on queue %d: %s", targ->me,
					strerror(errno));
			goto quit;
		}
#else /* !BUSYWAIT */
		if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
			clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
			targ->toc.tv_sec -= 1; /* Subtract timeout time. */
			goto out;
		}

		if (pfd.revents & POLLERR) {
			D("poll err");
			goto quit;
		}
#endif /* !BUSYWAIT */

		for (j = targ->nmd->first_rx_ring; j <= targ->nmd->last_rx_ring; j++) {
			ring = NETMAP_RXRING(targ->nmd->nifp, j);
			if (nm_ring_empty(ring))
				continue;

			limit = nm_ring_space(ring);
			if (limit > targ->g->burst)
				limit = targ->g->burst;

#if 0
			/* Enable this if
			 *     1) we remove the early-return optimization from
			 *        the netmap poll implementation, or
			 *     2) pipes get NS_MOREFRAG support.
			 * With the current netmap implementation, an experiment like
			 *     pkt-gen -i vale:1{1 -f txseq -F 9
			 *     pkt-gen -i vale:1}1 -f rxseq
			 * would get stuck as soon as we find nm_ring_space(ring) < 9,
			 * since here limit is rounded to 0 and
			 * pipe rxsync is not called anymore by the poll() of this loop.
			 */
			if (frags_exp > 1) {
				int o = limit;
				/* Cut off to the closest smaller multiple. */
				limit = (limit / frags_exp) * frags_exp;
				RD(2, "LIMIT %d --> %d", o, limit);
			}
#endif

			for (head = ring->head, i = 0; i < limit; i++) {
				struct netmap_slot *slot = &ring->slot[head];
				char *p = NETMAP_BUF(ring, slot->buf_idx);
				int len = slot->len;
				struct pkt *pkt;

				if (dump) {
					dump_payload(p, slot->len, ring, head);
				}

				frags++;
				if (!(slot->flags & NS_MOREFRAG)) {
					/* last fragment: validate the frag count */
					if (first_packet) {
						first_packet = 0;
					} else if (frags != frags_exp) {
						char prbuf[512];
						/*
						 * NOTE(review): head-frags+1 can wrap
						 * below 0 when the packet straddles the
						 * ring boundary — confirm slot indices
						 * are handled (head is unsigned).
						 */
						RD(1, "Received packets with %u frags, "
								"expected %u, '%s'", frags, frags_exp,
								multi_slot_to_string(ring, head-frags+1,
							       	frags,
									prbuf, sizeof(prbuf)));
					}
					first_packet = 0;
					frags_exp = frags;
					frags = 0;
				}

				/* step back to include the virtio-net header, if any */
				p -= sizeof(pkt->vh) - targ->g->virt_header;
				len += sizeof(pkt->vh) - targ->g->virt_header;
				pkt = (struct pkt *)p;
				if (ntohs(pkt->eh.ether_type) == ETHERTYPE_IP)
					af = AF_INET;
				else
					af = AF_INET6;

				if ((char *)pkt + len < ((char *)PKT(pkt, body, af)) +
						sizeof(seq)) {
					RD(1, "%s: packet too small (len=%u)", __func__,
							slot->len);
				} else {
					/* sequence number is big-endian in the payload */
					seq = (PKT(pkt, body, af)[0] << 24) |
						(PKT(pkt, body, af)[1] << 16) |
						(PKT(pkt, body, af)[2] << 8) |
						PKT(pkt, body, af)[3];
					if (first_slot) {
						/* Grab the first one, whatever it
						   is.
 */
						seq_exp[j] = seq;
						first_slot = 0;
					} else if (seq != seq_exp[j]) {
						uint32_t delta = seq - seq_exp[j];

						/* small forward delta = gap, large = reordering */
						if (delta < (0xFFFFFFFF >> 1)) {
							RD(2, "Sequence GAP: exp %u found %u",
									seq_exp[j], seq);
						} else {
							RD(2, "Sequence OUT OF ORDER: "
									"exp %u found %u", seq_exp[j], seq);
						}
						seq_exp[j] = seq;
					}
					seq_exp[j]++;
				}

				cur.bytes += slot->len;
				head = nm_ring_next(ring, head);
				cur.pkts++;
			}

			ring->cur = ring->head = head;

			cur.events++;
			targ->ctr = cur;
		}
	}
	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);

#ifndef BUSYWAIT
out:
#endif /* !BUSYWAIT */
	targ->completed = 1;
	targ->ctr = cur;

quit:
	if (seq_exp != NULL)
		free(seq_exp);
	/* reset the ``used`` flag. */
	targ->used = 0;

	return (NULL);
}


/*
 * Print the final statistics for a run: packet/byte/event totals,
 * packet rate, bandwidth (payload and raw, including framing overhead)
 * and average batch size.
 */
static void
tx_output(struct glob_arg *g, struct my_ctrs *cur, double delta, const char *msg)
{
	double bw, raw_bw, pps, abs;
	char b1[40], b2[80], b3[80];
	int size;

	if (cur->pkts == 0) {
		printf("%s nothing.\n", msg);
		return;
	}

	size = (int)(cur->bytes / cur->pkts);

	printf("%s %llu packets %llu bytes %llu events %d bytes each in %.2f seconds.\n",
		msg,
		(unsigned long long)cur->pkts,
		(unsigned long long)cur->bytes,
		(unsigned long long)cur->events, size, delta);
	if (delta == 0)
		delta = 1e-6;
	if (size < 60) /* correct for min packet size */
		size = 60;
	pps = cur->pkts / delta;
	bw = (8.0 * cur->bytes) / delta;
	raw_bw = (8.0 * cur->bytes + cur->pkts * g->framing) / delta;
	abs = cur->pkts / (double)(cur->events);

	printf("Speed: %spps Bandwidth: %sbps (raw %sbps). 
Average batch: %.2f pkts\n", 2336 norm(b1, pps, normalize), norm(b2, bw, normalize), norm(b3, raw_bw, normalize), abs); 2337} 2338 2339static void 2340usage(int errcode) 2341{ 2342/* This usage is generated from the pkt-gen man page: 2343 * $ man pkt-gen > x 2344 * and pasted here adding the string terminators and endlines with simple 2345 * regular expressions. */ 2346 const char *cmd = "pkt-gen"; 2347 fprintf(stderr, 2348 "Usage:\n" 2349 "%s arguments\n" 2350" -h Show program usage and exit.\n" 2351"\n" 2352" -i interface\n" 2353" Name of the network interface that pkt-gen operates on. It can be a system network interface\n" 2354" (e.g., em0), the name of a vale(4) port (e.g., valeSSS:PPP), the name of a netmap pipe or\n" 2355" monitor, or any valid netmap port name accepted by the nm_open library function, as docu-\n" 2356" mented in netmap(4) (NIOCREGIF section).\n" 2357"\n" 2358" -f function\n" 2359" The function to be executed by pkt-gen. Specify tx for transmission, rx for reception, ping\n" 2360" for client-side ping-pong operation, and pong for server-side ping-pong operation.\n" 2361"\n" 2362" -n count\n" 2363" Number of iterations of the pkt-gen function, with 0 meaning infinite). In case of tx or rx,\n" 2364" count is the number of packets to receive or transmit. In case of ping or pong, count is the\n" 2365" number of ping-pong transactions.\n" 2366"\n" 2367" -l pkt_size\n" 2368" Packet size in bytes excluding CRC. 
If passed a second time, use random sizes larger or\n" 2369" equal than the second one and lower than the first one.\n" 2370"\n" 2371" -b burst_size\n" 2372" Transmit or receive up to burst_size packets at a time.\n" 2373"\n" 2374" -4 Use IPv4 addresses.\n" 2375"\n" 2376" -6 Use IPv6 addresses.\n" 2377"\n" 2378" -d dst_ip[:port[-dst_ip:port]]\n" 2379" Destination IPv4/IPv6 address and port, single or range.\n" 2380"\n" 2381" -s src_ip[:port[-src_ip:port]]\n" 2382" Source IPv4/IPv6 address and port, single or range.\n" 2383"\n" 2384" -D dst_mac\n" 2385" Destination MAC address in colon notation (e.g., aa:bb:cc:dd:ee:00).\n" 2386"\n" 2387" -S src_mac\n" 2388" Source MAC address in colon notation.\n" 2389"\n" 2390" -a cpu_id\n" 2391" Pin the first thread of pkt-gen to a particular CPU using pthread_setaffinity_np(3). If more\n" 2392" threads are used, they are pinned to the subsequent CPUs, one per thread.\n" 2393"\n" 2394" -c cpus\n" 2395" Maximum number of CPUs to use (0 means to use all the available ones).\n" 2396"\n" 2397" -p threads\n" 2398" Number of threads to use. By default, only a single thread is used to handle all the netmap\n" 2399" rings. If threads is larger than one, each thread handles a single TX ring (in tx mode), a\n" 2400" single RX ring (in rx mode), or a TX/RX ring couple. The number of threads must be less or\n" 2401" equal than the number of TX (or RX) ring available in the device specified by interface.\n" 2402"\n" 2403" -T report_ms\n" 2404" Number of milliseconds between reports.\n" 2405"\n" 2406" -w wait_for_link_time\n" 2407" Number of seconds to wait before starting the pkt-gen function, useuful to make sure that the\n" 2408" network link is up. A network device driver may take some time to enter netmap mode, or to\n" 2409" create a new transmit/receive ring pair when netmap(4) requests one.\n" 2410"\n" 2411" -R rate\n" 2412" Packet transmission rate. 
Not setting the packet transmission rate tells pkt-gen to transmit\n" 2413" packets as quickly as possible. On servers from 2010 on-wards netmap(4) is able to com-\n" 2414" pletely use all of the bandwidth of a 10 or 40Gbps link, so this option should be used unless\n" 2415" your intention is to saturate the link.\n" 2416"\n" 2417" -X Dump payload of each packet transmitted or received.\n" 2418"\n" 2419" -H len Add empty virtio-net-header with size 'len'. Valid sizes are 0, 10 and 12. This option is\n" 2420" only used with Virtual Machine technologies that use virtio as a network interface.\n" 2421"\n" 2422" -P file\n" 2423" Load the packet to be transmitted from a pcap file rather than constructing it within\n" 2424" pkt-gen.\n" 2425"\n" 2426" -z Use random IPv4/IPv6 src address/port.\n" 2427"\n" 2428" -Z Use random IPv4/IPv6 dst address/port.\n" 2429"\n" 2430" -N Do not normalize units (i.e., use bps, pps instead of Mbps, Kpps, etc.).\n" 2431"\n" 2432" -F num_frags\n" 2433" Send multi-slot packets, each one with num_frags fragments. A multi-slot packet is repre-\n" 2434" sented by two or more consecutive netmap slots with the NS_MOREFRAG flag set (except for the\n" 2435" last slot). This is useful to transmit or receive packets larger than the netmap buffer\n" 2436" size.\n" 2437"\n" 2438" -M frag_size\n" 2439" In multi-slot mode, frag_size specifies the size of each fragment, if smaller than the packet\n" 2440" length divided by num_frags.\n" 2441"\n" 2442" -I Use indirect buffers. 
It is only valid for transmitting on VALE ports, and it is implemented\n" 2443" by setting the NS_INDIRECT flag in the netmap slots.\n" 2444"\n" 2445" -W Exit immediately if all the RX rings are empty the first time they are examined.\n" 2446"\n" 2447" -v Increase the verbosity level.\n" 2448"\n" 2449" -r In tx mode, do not initialize packets, but send whatever the content of the uninitialized\n" 2450" netmap buffers is (rubbish mode).\n" 2451"\n" 2452" -A Compute mean and standard deviation (over a sliding window) for the transmit or receive rate.\n" 2453"\n" 2454" -B Take Ethernet framing and CRC into account when computing the average bps. This adds 4 bytes\n" 2455" of CRC and 20 bytes of framing to each packet.\n" 2456"\n" 2457" -C tx_slots[,rx_slots[,tx_rings[,rx_rings]]]\n" 2458" Configuration in terms of number of rings and slots to be used when opening the netmap port.\n" 2459" Such configuration has effect on software ports created on the fly, such as VALE ports and\n" 2460" netmap pipes. The configuration may consist of 1 to 4 numbers separated by commas: tx_slots,\n" 2461" rx_slots, tx_rings, rx_rings. Missing numbers or zeroes stand for default values. As an\n" 2462" additional convenience, if exactly one number is specified, then this is assigned to both\n" 2463" tx_slots and rx_slots. 
If there is no fourth number, then the third one is assigned to both\n" 2464" tx_rings and rx_rings.\n" 2465"\n" 2466" -o options data generation options (parsed using atoi)\n" 2467" OPT_PREFETCH 1\n" 2468" OPT_ACCESS 2\n" 2469" OPT_COPY 4\n" 2470" OPT_MEMCPY 8\n" 2471" OPT_TS 16 (add a timestamp)\n" 2472" OPT_INDIRECT 32 (use indirect buffers)\n" 2473" OPT_DUMP 64 (dump rx/tx traffic)\n" 2474" OPT_RUBBISH 256\n" 2475" (send wathever the buffers contain)\n" 2476" OPT_RANDOM_SRC 512\n" 2477" OPT_RANDOM_DST 1024\n" 2478" OPT_PPS_STATS 2048\n" 2479 "", 2480 cmd); 2481 exit(errcode); 2482} 2483 2484static void 2485start_threads(struct glob_arg *g) { 2486 int i; 2487 2488 targs = calloc(g->nthreads, sizeof(*targs)); 2489 struct targ *t; 2490 /* 2491 * Now create the desired number of threads, each one 2492 * using a single descriptor. 2493 */ 2494 for (i = 0; i < g->nthreads; i++) { 2495 uint64_t seed = time(0) | (time(0) << 32); 2496 t = &targs[i]; 2497 2498 bzero(t, sizeof(*t)); 2499 t->fd = -1; /* default, with pcap */ 2500 t->g = g; 2501 memcpy(t->seed, &seed, sizeof(t->seed)); 2502 2503 if (g->dev_type == DEV_NETMAP) { 2504 struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */ 2505 uint64_t nmd_flags = 0; 2506 nmd.self = &nmd; 2507 2508 if (i > 0) { 2509 /* the first thread uses the fd opened by the main 2510 * thread, the other threads re-open /dev/netmap 2511 */ 2512 if (g->nthreads > 1) { 2513 nmd.req.nr_flags = 2514 g->nmd->req.nr_flags & ~NR_REG_MASK; 2515 nmd.req.nr_flags |= NR_REG_ONE_NIC; 2516 nmd.req.nr_ringid = i; 2517 } 2518 /* Only touch one of the rings (rx is already ok) */ 2519 if (g->td_type == TD_TYPE_RECEIVER) 2520 nmd_flags |= NETMAP_NO_TX_POLL; 2521 2522 /* register interface. Override ifname and ringid etc. 
*/ 2523 t->nmd = nm_open(t->g->ifname, NULL, nmd_flags | 2524 NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd); 2525 if (t->nmd == NULL) { 2526 D("Unable to open %s: %s", 2527 t->g->ifname, strerror(errno)); 2528 continue; 2529 } 2530 } else { 2531 t->nmd = g->nmd; 2532 } 2533 t->fd = t->nmd->fd; 2534 t->frags = g->frags; 2535 } else { 2536 targs[i].fd = g->main_fd; 2537 } 2538 t->used = 1; 2539 t->me = i; 2540 if (g->affinity >= 0) { 2541 t->affinity = (g->affinity + i) % g->cpus; 2542 } else { 2543 t->affinity = -1; 2544 } 2545 /* default, init packets */ 2546 initialize_packet(t); 2547 } 2548 /* Wait for PHY reset. */ 2549 D("Wait %d secs for phy reset", g->wait_link); 2550 sleep(g->wait_link); 2551 D("Ready..."); 2552 2553 for (i = 0; i < g->nthreads; i++) { 2554 t = &targs[i]; 2555 if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) { 2556 D("Unable to create thread %d: %s", i, strerror(errno)); 2557 t->used = 0; 2558 } 2559 } 2560} 2561 2562static void 2563main_thread(struct glob_arg *g) 2564{ 2565 int i; 2566 2567 struct my_ctrs prev, cur; 2568 double delta_t; 2569 struct timeval tic, toc; 2570 2571 prev.pkts = prev.bytes = prev.events = 0; 2572 gettimeofday(&prev.t, NULL); 2573 for (;;) { 2574 char b1[40], b2[40], b3[40], b4[100]; 2575 uint64_t pps, usec; 2576 struct my_ctrs x; 2577 double abs; 2578 int done = 0; 2579 2580 usec = wait_for_next_report(&prev.t, &cur.t, 2581 g->report_interval); 2582 2583 cur.pkts = cur.bytes = cur.events = 0; 2584 cur.min_space = 0; 2585 if (usec < 10000) /* too short to be meaningful */ 2586 continue; 2587 /* accumulate counts for all threads */ 2588 for (i = 0; i < g->nthreads; i++) { 2589 cur.pkts += targs[i].ctr.pkts; 2590 cur.bytes += targs[i].ctr.bytes; 2591 cur.events += targs[i].ctr.events; 2592 cur.min_space += targs[i].ctr.min_space; 2593 targs[i].ctr.min_space = 99999; 2594 if (targs[i].used == 0) 2595 done++; 2596 } 2597 x.pkts = cur.pkts - prev.pkts; 2598 x.bytes = cur.bytes - prev.bytes; 2599 x.events = cur.events 
- prev.events; 2600 pps = (x.pkts*1000000 + usec/2) / usec; 2601 abs = (x.events > 0) ? (x.pkts / (double) x.events) : 0; 2602 2603 if (!(g->options & OPT_PPS_STATS)) { 2604 strcpy(b4, ""); 2605 } else { 2606 /* Compute some pps stats using a sliding window. */ 2607 double ppsavg = 0.0, ppsdev = 0.0; 2608 int nsamples = 0; 2609 2610 g->win[g->win_idx] = pps; 2611 g->win_idx = (g->win_idx + 1) % STATS_WIN; 2612 2613 for (i = 0; i < STATS_WIN; i++) { 2614 ppsavg += g->win[i]; 2615 if (g->win[i]) { 2616 nsamples ++; 2617 } 2618 } 2619 ppsavg /= nsamples; 2620 2621 for (i = 0; i < STATS_WIN; i++) { 2622 if (g->win[i] == 0) { 2623 continue; 2624 } 2625 ppsdev += (g->win[i] - ppsavg) * (g->win[i] - ppsavg); 2626 } 2627 ppsdev /= nsamples; 2628 ppsdev = sqrt(ppsdev); 2629 2630 snprintf(b4, sizeof(b4), "[avg/std %s/%s pps]", 2631 norm(b1, ppsavg, normalize), norm(b2, ppsdev, normalize)); 2632 } 2633 2634 D("%spps %s(%spkts %sbps in %llu usec) %.2f avg_batch %d min_space", 2635 norm(b1, pps, normalize), b4, 2636 norm(b2, (double)x.pkts, normalize), 2637 norm(b3, (double)x.bytes*8+(double)x.pkts*g->framing, normalize), 2638 (unsigned long long)usec, 2639 abs, (int)cur.min_space); 2640 prev = cur; 2641 2642 if (done == g->nthreads) 2643 break; 2644 } 2645 2646 timerclear(&tic); 2647 timerclear(&toc); 2648 cur.pkts = cur.bytes = cur.events = 0; 2649 /* final round */ 2650 for (i = 0; i < g->nthreads; i++) { 2651 struct timespec t_tic, t_toc; 2652 /* 2653 * Join active threads, unregister interfaces and close 2654 * file descriptors. 2655 */ 2656 if (targs[i].used) 2657 pthread_join(targs[i].thread, NULL); /* blocking */ 2658 if (g->dev_type == DEV_NETMAP) { 2659 nm_close(targs[i].nmd); 2660 targs[i].nmd = NULL; 2661 } else { 2662 close(targs[i].fd); 2663 } 2664 2665 if (targs[i].completed == 0) 2666 D("ouch, thread %d exited with error", i); 2667 2668 /* 2669 * Collect threads output and extract information about 2670 * how long it took to send all the packets. 
2671 */ 2672 cur.pkts += targs[i].ctr.pkts; 2673 cur.bytes += targs[i].ctr.bytes; 2674 cur.events += targs[i].ctr.events; 2675 /* collect the largest start (tic) and end (toc) times, 2676 * XXX maybe we should do the earliest tic, or do a weighted 2677 * average ? 2678 */ 2679 t_tic = timeval2spec(&tic); 2680 t_toc = timeval2spec(&toc); 2681 if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic)) 2682 tic = timespec2val(&targs[i].tic); 2683 if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc)) 2684 toc = timespec2val(&targs[i].toc); 2685 } 2686 2687 /* print output. */ 2688 timersub(&toc, &tic, &toc); 2689 delta_t = toc.tv_sec + 1e-6* toc.tv_usec; 2690 if (g->td_type == TD_TYPE_SENDER) 2691 tx_output(g, &cur, delta_t, "Sent"); 2692 else if (g->td_type == TD_TYPE_RECEIVER) 2693 tx_output(g, &cur, delta_t, "Received"); 2694} 2695 2696struct td_desc { 2697 int ty; 2698 char *key; 2699 void *f; 2700 int default_burst; 2701}; 2702 2703static struct td_desc func[] = { 2704 { TD_TYPE_RECEIVER, "rx", receiver_body, 512}, /* default */ 2705 { TD_TYPE_SENDER, "tx", sender_body, 512 }, 2706 { TD_TYPE_OTHER, "ping", ping_body, 1 }, 2707 { TD_TYPE_OTHER, "pong", pong_body, 1 }, 2708 { TD_TYPE_SENDER, "txseq", txseq_body, 512 }, 2709 { TD_TYPE_RECEIVER, "rxseq", rxseq_body, 512 }, 2710 { 0, NULL, NULL, 0 } 2711}; 2712 2713static int 2714tap_alloc(char *dev) 2715{ 2716 struct ifreq ifr; 2717 int fd, err; 2718 char *clonedev = TAP_CLONEDEV; 2719 2720 (void)err; 2721 (void)dev; 2722 /* Arguments taken by the function: 2723 * 2724 * char *dev: the name of an interface (or '\0'). MUST have enough 2725 * space to hold the interface name if '\0' is passed 2726 * int flags: interface flags (eg, IFF_TUN etc.) 
2727 */ 2728 2729#ifdef __FreeBSD__ 2730 if (dev[3]) { /* tapSomething */ 2731 static char buf[128]; 2732 snprintf(buf, sizeof(buf), "/dev/%s", dev); 2733 clonedev = buf; 2734 } 2735#endif 2736 /* open the device */ 2737 if( (fd = open(clonedev, O_RDWR)) < 0 ) { 2738 return fd; 2739 } 2740 D("%s open successful", clonedev); 2741 2742 /* preparation of the struct ifr, of type "struct ifreq" */ 2743 memset(&ifr, 0, sizeof(ifr)); 2744 2745#ifdef linux 2746 ifr.ifr_flags = IFF_TAP | IFF_NO_PI; 2747 2748 if (*dev) { 2749 /* if a device name was specified, put it in the structure; otherwise, 2750 * the kernel will try to allocate the "next" device of the 2751 * specified type */ 2752 size_t len = strlen(dev); 2753 if (len > IFNAMSIZ) { 2754 D("%s too long", dev); 2755 return -1; 2756 } 2757 memcpy(ifr.ifr_name, dev, len); 2758 } 2759 2760 /* try to create the device */ 2761 if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) { 2762 D("failed to to a TUNSETIFF: %s", strerror(errno)); 2763 close(fd); 2764 return err; 2765 } 2766 2767 /* if the operation was successful, write back the name of the 2768 * interface to the variable "dev", so the caller can know 2769 * it. 
Note that the caller MUST reserve space in *dev (see calling 2770 * code below) */ 2771 strcpy(dev, ifr.ifr_name); 2772 D("new name is %s", dev); 2773#endif /* linux */ 2774 2775 /* this is the special file descriptor that the caller will use to talk 2776 * with the virtual interface */ 2777 return fd; 2778} 2779 2780int 2781main(int arc, char **argv) 2782{ 2783 int i; 2784 struct sigaction sa; 2785 sigset_t ss; 2786 2787 struct glob_arg g; 2788 2789 int ch; 2790 int devqueues = 1; /* how many device queues */ 2791 int wait_link_arg = 0; 2792 2793 int pkt_size_done = 0; 2794 2795 struct td_desc *fn = func; 2796 2797 bzero(&g, sizeof(g)); 2798 2799 g.main_fd = -1; 2800 g.td_body = fn->f; 2801 g.td_type = fn->ty; 2802 g.report_interval = 1000; /* report interval */ 2803 g.affinity = -1; 2804 /* ip addresses can also be a range x.x.x.x-x.x.x.y */ 2805 g.af = AF_INET; /* default */ 2806 g.src_ip.name = "10.0.0.1"; 2807 g.dst_ip.name = "10.1.0.1"; 2808 g.dst_mac.name = "ff:ff:ff:ff:ff:ff"; 2809 g.src_mac.name = NULL; 2810 g.pkt_size = 60; 2811 g.pkt_min_size = 0; 2812 g.nthreads = 1; 2813 g.cpus = 1; /* default */ 2814 g.forever = 1; 2815 g.tx_rate = 0; 2816 g.frags = 1; 2817 g.frag_size = (u_int)-1; /* use the netmap buffer size by default */ 2818 g.nmr_config = ""; 2819 g.virt_header = 0; 2820 g.wait_link = 2; /* wait 2 seconds for physical ports */ 2821 2822 while ((ch = getopt(arc, argv, "46a:f:F:Nn:i:Il:d:s:D:S:b:c:o:p:" 2823 "T:w:WvR:XC:H:rP:zZAhBM:")) != -1) { 2824 2825 switch(ch) { 2826 default: 2827 D("bad option %c %s", ch, optarg); 2828 usage(-1); 2829 break; 2830 2831 case 'h': 2832 usage(0); 2833 break; 2834 2835 case '4': 2836 g.af = AF_INET; 2837 break; 2838 2839 case '6': 2840 g.af = AF_INET6; 2841 break; 2842 2843 case 'N': 2844 normalize = 0; 2845 break; 2846 2847 case 'n': 2848 g.npackets = strtoull(optarg, NULL, 10); 2849 break; 2850 2851 case 'F': 2852 i = atoi(optarg); 2853 if (i < 1 || i > 63) { 2854 D("invalid frags %d [1..63], ignore", i); 2855 
break; 2856 } 2857 g.frags = i; 2858 break; 2859 2860 case 'M': 2861 g.frag_size = atoi(optarg); 2862 break; 2863 2864 case 'f': 2865 for (fn = func; fn->key; fn++) { 2866 if (!strcmp(fn->key, optarg)) 2867 break; 2868 } 2869 if (fn->key) { 2870 g.td_body = fn->f; 2871 g.td_type = fn->ty; 2872 } else { 2873 D("unrecognised function %s", optarg); 2874 } 2875 break; 2876 2877 case 'o': /* data generation options */ 2878 g.options |= atoi(optarg); 2879 break; 2880 2881 case 'a': /* force affinity */ 2882 g.affinity = atoi(optarg); 2883 break; 2884 2885 case 'i': /* interface */ 2886 /* a prefix of tap: netmap: or pcap: forces the mode. 2887 * otherwise we guess 2888 */ 2889 D("interface is %s", optarg); 2890 if (strlen(optarg) > MAX_IFNAMELEN - 8) { 2891 D("ifname too long %s", optarg); 2892 break; 2893 } 2894 strcpy(g.ifname, optarg); 2895 if (!strcmp(optarg, "null")) { 2896 g.dev_type = DEV_NETMAP; 2897 g.dummy_send = 1; 2898 } else if (!strncmp(optarg, "tap:", 4)) { 2899 g.dev_type = DEV_TAP; 2900 strcpy(g.ifname, optarg + 4); 2901 } else if (!strncmp(optarg, "pcap:", 5)) { 2902 g.dev_type = DEV_PCAP; 2903 strcpy(g.ifname, optarg + 5); 2904 } else if (!strncmp(optarg, "netmap:", 7) || 2905 !strncmp(optarg, "vale", 4)) { 2906 g.dev_type = DEV_NETMAP; 2907 } else if (!strncmp(optarg, "tap", 3)) { 2908 g.dev_type = DEV_TAP; 2909 } else { /* prepend netmap: */ 2910 g.dev_type = DEV_NETMAP; 2911 sprintf(g.ifname, "netmap:%s", optarg); 2912 } 2913 break; 2914 2915 case 'I': 2916 g.options |= OPT_INDIRECT; /* use indirect buffers */ 2917 break; 2918 2919 case 'l': /* pkt_size */ 2920 if (pkt_size_done) { 2921 g.pkt_min_size = atoi(optarg); 2922 } else { 2923 g.pkt_size = atoi(optarg); 2924 pkt_size_done = 1; 2925 } 2926 break; 2927 2928 case 'd': 2929 g.dst_ip.name = optarg; 2930 break; 2931 2932 case 's': 2933 g.src_ip.name = optarg; 2934 break; 2935 2936 case 'T': /* report interval */ 2937 g.report_interval = atoi(optarg); 2938 break; 2939 2940 case 'w': 2941 
g.wait_link = atoi(optarg); 2942 wait_link_arg = 1; 2943 break; 2944 2945 case 'W': 2946 g.forever = 0; /* exit RX with no traffic */ 2947 break; 2948 2949 case 'b': /* burst */ 2950 g.burst = atoi(optarg); 2951 break; 2952 case 'c': 2953 g.cpus = atoi(optarg); 2954 break; 2955 case 'p': 2956 g.nthreads = atoi(optarg); 2957 break; 2958 2959 case 'D': /* destination mac */ 2960 g.dst_mac.name = optarg; 2961 break; 2962 2963 case 'S': /* source mac */ 2964 g.src_mac.name = optarg; 2965 break; 2966 case 'v': 2967 verbose++; 2968 break; 2969 case 'R': 2970 g.tx_rate = atoi(optarg); 2971 break; 2972 case 'X': 2973 g.options |= OPT_DUMP; 2974 break; 2975 case 'C': 2976 g.nmr_config = strdup(optarg); 2977 break; 2978 case 'H': 2979 g.virt_header = atoi(optarg); 2980 break; 2981 case 'P': 2982 g.packet_file = strdup(optarg); 2983 break; 2984 case 'r': 2985 g.options |= OPT_RUBBISH; 2986 break; 2987 case 'z': 2988 g.options |= OPT_RANDOM_SRC; 2989 break; 2990 case 'Z': 2991 g.options |= OPT_RANDOM_DST; 2992 break; 2993 case 'A': 2994 g.options |= OPT_PPS_STATS; 2995 break; 2996 case 'B': 2997 /* raw packets have4 bytes crc + 20 bytes framing */ 2998 // XXX maybe add an option to pass the IFG 2999 g.framing = 24 * 8; 3000 break; 3001 } 3002 } 3003 3004 if (strlen(g.ifname) <=0 ) { 3005 D("missing ifname"); 3006 usage(-1); 3007 } 3008 3009 if (g.burst == 0) { 3010 g.burst = fn->default_burst; 3011 D("using default burst size: %d", g.burst); 3012 } 3013 3014 g.system_cpus = i = system_ncpus(); 3015 if (g.cpus < 0 || g.cpus > i) { 3016 D("%d cpus is too high, have only %d cpus", g.cpus, i); 3017 usage(-1); 3018 } 3019 D("running on %d cpus (have %d)", g.cpus, i); 3020 if (g.cpus == 0) 3021 g.cpus = i; 3022 3023 if (!wait_link_arg && !strncmp(g.ifname, "vale", 4)) { 3024 g.wait_link = 0; 3025 } 3026 3027 if (g.pkt_size < 16 || g.pkt_size > MAX_PKTSIZE) { 3028 D("bad pktsize %d [16..%d]\n", g.pkt_size, MAX_PKTSIZE); 3029 usage(-1); 3030 } 3031 3032 if (g.pkt_min_size > 0 && 
(g.pkt_min_size < 16 || g.pkt_min_size > g.pkt_size)) { 3033 D("bad pktminsize %d [16..%d]\n", g.pkt_min_size, g.pkt_size); 3034 usage(-1); 3035 } 3036 3037 if (g.src_mac.name == NULL) { 3038 static char mybuf[20] = "00:00:00:00:00:00"; 3039 /* retrieve source mac address. */ 3040 if (source_hwaddr(g.ifname, mybuf) == -1) { 3041 D("Unable to retrieve source mac"); 3042 // continue, fail later 3043 } 3044 g.src_mac.name = mybuf; 3045 } 3046 /* extract address ranges */ 3047 if (extract_mac_range(&g.src_mac) || extract_mac_range(&g.dst_mac)) 3048 usage(-1); 3049 g.options |= extract_ip_range(&g.src_ip, g.af); 3050 g.options |= extract_ip_range(&g.dst_ip, g.af); 3051 3052 if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1 3053 && g.virt_header != VIRT_HDR_2) { 3054 D("bad virtio-net-header length"); 3055 usage(-1); 3056 } 3057 3058 if (g.dev_type == DEV_TAP) { 3059 D("want to use tap %s", g.ifname); 3060 g.main_fd = tap_alloc(g.ifname); 3061 if (g.main_fd < 0) { 3062 D("cannot open tap %s", g.ifname); 3063 usage(-1); 3064 } 3065#ifndef NO_PCAP 3066 } else if (g.dev_type == DEV_PCAP) { 3067 char pcap_errbuf[PCAP_ERRBUF_SIZE]; 3068 3069 pcap_errbuf[0] = '\0'; // init the buffer 3070 g.p = pcap_open_live(g.ifname, 256 /* XXX */, 1, 100, pcap_errbuf); 3071 if (g.p == NULL) { 3072 D("cannot open pcap on %s", g.ifname); 3073 usage(-1); 3074 } 3075 g.main_fd = pcap_fileno(g.p); 3076 D("using pcap on %s fileno %d", g.ifname, g.main_fd); 3077#endif /* !NO_PCAP */ 3078 } else if (g.dummy_send) { /* but DEV_NETMAP */ 3079 D("using a dummy send routine"); 3080 } else { 3081 struct nm_desc base_nmd; 3082 char errmsg[MAXERRMSG]; 3083 u_int flags; 3084 3085 bzero(&base_nmd, sizeof(base_nmd)); 3086 3087 parse_nmr_config(g.nmr_config, &base_nmd.req); 3088 3089 base_nmd.req.nr_flags |= NR_ACCEPT_VNET_HDR; 3090 3091 if (nm_parse(g.ifname, &base_nmd, errmsg) < 0) { 3092 D("Invalid name '%s': %s", g.ifname, errmsg); 3093 goto out; 3094 } 3095 3096 /* 3097 * Open the netmap device using 
nm_open(). 3098 * 3099 * protocol stack and may cause a reset of the card, 3100 * which in turn may take some time for the PHY to 3101 * reconfigure. We do the open here to have time to reset. 3102 */ 3103 flags = NM_OPEN_IFNAME | NM_OPEN_ARG1 | NM_OPEN_ARG2 | 3104 NM_OPEN_ARG3 | NM_OPEN_RING_CFG; 3105 if (g.nthreads > 1) { 3106 base_nmd.req.nr_flags &= ~NR_REG_MASK; 3107 base_nmd.req.nr_flags |= NR_REG_ONE_NIC; 3108 base_nmd.req.nr_ringid = 0; 3109 } 3110 g.nmd = nm_open(g.ifname, NULL, flags, &base_nmd); 3111 if (g.nmd == NULL) { 3112 D("Unable to open %s: %s", g.ifname, strerror(errno)); 3113 goto out; 3114 } 3115 g.main_fd = g.nmd->fd; 3116 D("mapped %luKB at %p", (unsigned long)(g.nmd->req.nr_memsize>>10), 3117 g.nmd->mem); 3118 3119 if (g.virt_header) { 3120 /* Set the virtio-net header length, since the user asked 3121 * for it explicitely. */ 3122 set_vnet_hdr_len(&g); 3123 } else { 3124 /* Check whether the netmap port we opened requires us to send 3125 * and receive frames with virtio-net header. */ 3126 get_vnet_hdr_len(&g); 3127 } 3128 3129 /* get num of queues in tx or rx */ 3130 if (g.td_type == TD_TYPE_SENDER) 3131 devqueues = g.nmd->req.nr_tx_rings; 3132 else 3133 devqueues = g.nmd->req.nr_rx_rings; 3134 3135 /* validate provided nthreads. 
*/ 3136 if (g.nthreads < 1 || g.nthreads > devqueues) { 3137 D("bad nthreads %d, have %d queues", g.nthreads, devqueues); 3138 // continue, fail later 3139 } 3140 3141 if (g.td_type == TD_TYPE_SENDER) { 3142 int mtu = get_if_mtu(&g); 3143 3144 if (mtu > 0 && g.pkt_size > mtu) { 3145 D("pkt_size (%d) must be <= mtu (%d)", 3146 g.pkt_size, mtu); 3147 return -1; 3148 } 3149 } 3150 3151 if (verbose) { 3152 struct netmap_if *nifp = g.nmd->nifp; 3153 struct nmreq *req = &g.nmd->req; 3154 3155 D("nifp at offset %d, %d tx %d rx region %d", 3156 req->nr_offset, req->nr_tx_rings, req->nr_rx_rings, 3157 req->nr_arg2); 3158 for (i = 0; i <= req->nr_tx_rings; i++) { 3159 struct netmap_ring *ring = NETMAP_TXRING(nifp, i); 3160 D(" TX%d at 0x%p slots %d", i, 3161 (void *)((char *)ring - (char *)nifp), ring->num_slots); 3162 } 3163 for (i = 0; i <= req->nr_rx_rings; i++) { 3164 struct netmap_ring *ring = NETMAP_RXRING(nifp, i); 3165 D(" RX%d at 0x%p slots %d", i, 3166 (void *)((char *)ring - (char *)nifp), ring->num_slots); 3167 } 3168 } 3169 3170 /* Print some debug information. */ 3171 fprintf(stdout, 3172 "%s %s: %d queues, %d threads and %d cpus.\n", 3173 (g.td_type == TD_TYPE_SENDER) ? "Sending on" : 3174 ((g.td_type == TD_TYPE_RECEIVER) ? "Receiving from" : 3175 "Working on"), 3176 g.ifname, 3177 devqueues, 3178 g.nthreads, 3179 g.cpus); 3180 if (g.td_type == TD_TYPE_SENDER) { 3181 fprintf(stdout, "%s -> %s (%s -> %s)\n", 3182 g.src_ip.name, g.dst_ip.name, 3183 g.src_mac.name, g.dst_mac.name); 3184 } 3185 3186out: 3187 /* Exit if something went wrong. */ 3188 if (g.main_fd < 0) { 3189 D("aborting"); 3190 usage(-1); 3191 } 3192 } 3193 3194 3195 if (g.options) { 3196 D("--- SPECIAL OPTIONS:%s%s%s%s%s%s\n", 3197 g.options & OPT_PREFETCH ? " prefetch" : "", 3198 g.options & OPT_ACCESS ? " access" : "", 3199 g.options & OPT_MEMCPY ? " memcpy" : "", 3200 g.options & OPT_INDIRECT ? " indirect" : "", 3201 g.options & OPT_COPY ? " copy" : "", 3202 g.options & OPT_RUBBISH ? 
" rubbish " : ""); 3203 } 3204 3205 g.tx_period.tv_sec = g.tx_period.tv_nsec = 0; 3206 if (g.tx_rate > 0) { 3207 /* try to have at least something every second, 3208 * reducing the burst size to some 0.01s worth of data 3209 * (but no less than one full set of fragments) 3210 */ 3211 uint64_t x; 3212 int lim = (g.tx_rate)/300; 3213 if (g.burst > lim) 3214 g.burst = lim; 3215 if (g.burst == 0) 3216 g.burst = 1; 3217 x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate; 3218 g.tx_period.tv_nsec = x; 3219 g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000; 3220 g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000; 3221 } 3222 if (g.td_type == TD_TYPE_SENDER) 3223 D("Sending %d packets every %ld.%09ld s", 3224 g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec); 3225 /* Install ^C handler. */ 3226 global_nthreads = g.nthreads; 3227 sigemptyset(&ss); 3228 sigaddset(&ss, SIGINT); 3229 /* block SIGINT now, so that all created threads will inherit the mask */ 3230 if (pthread_sigmask(SIG_BLOCK, &ss, NULL) < 0) { 3231 D("failed to block SIGINT: %s", strerror(errno)); 3232 } 3233 start_threads(&g); 3234 /* Install the handler and re-enable SIGINT for the main thread */ 3235 memset(&sa, 0, sizeof(sa)); 3236 sa.sa_handler = sigint_h; 3237 if (sigaction(SIGINT, &sa, NULL) < 0) { 3238 D("failed to install ^C handler: %s", strerror(errno)); 3239 } 3240 3241 if (pthread_sigmask(SIG_UNBLOCK, &ss, NULL) < 0) { 3242 D("failed to re-enable SIGINT: %s", strerror(errno)); 3243 } 3244 main_thread(&g); 3245 free(targs); 3246 return 0; 3247} 3248 3249/* end of file */ 3250