1/*
2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3 * Copyright (C) 2013-2015 Universita` di Pisa. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *   1. Redistributions of source code must retain the above copyright
9 *      notice, this list of conditions and the following disclaimer.
10 *   2. Redistributions in binary form must reproduce the above copyright
11 *      notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27/*
28 * $FreeBSD: stable/11/tools/tools/netmap/pkt-gen.c 344918 2019-03-08 08:27:33Z vmaffione $
29 * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $
30 *
31 * Example program to show how to build a multithreaded packet
32 * source/sink using the netmap device.
33 *
34 * In this example we create a programmable number of threads
35 * to take care of all the queues of the interface used to
36 * send or receive traffic.
37 *
38 */
39
40#define _GNU_SOURCE	/* for CPU_SET() */
41#include <stdio.h>
42#define NETMAP_WITH_LIBS
43#include <net/netmap_user.h>
44
45
46#include <ctype.h>	// isprint()
47#include <unistd.h>	// sysconf()
48#include <sys/poll.h>
49#include <arpa/inet.h>	/* ntohs */
50#ifndef _WIN32
51#include <sys/sysctl.h>	/* sysctl */
52#endif
53#include <ifaddrs.h>	/* getifaddrs */
54#include <net/ethernet.h>
55#include <netinet/in.h>
56#include <netinet/ip.h>
57#include <netinet/udp.h>
58#include <netinet/ip6.h>
59#ifdef linux
60#define IPV6_VERSION	0x60
61#define IPV6_DEFHLIM	64
62#endif
63#include <assert.h>
64#include <math.h>
65
66#include <pthread.h>
67
68#ifndef NO_PCAP
69#include <pcap/pcap.h>
70#endif
71
72#include "ctrs.h"
73
74static void usage(int);
75
76#ifdef _WIN32
77#define cpuset_t        DWORD_PTR   //uint64_t
78static inline void CPU_ZERO(cpuset_t *p)
79{
80	*p = 0;
81}
82
83static inline void CPU_SET(uint32_t i, cpuset_t *p)
84{
	*p |= (uint64_t)1 << (i & 0x3f);
86}
87
88#define pthread_setaffinity_np(a, b, c) !SetThreadAffinityMask(a, *c)    //((void)a, 0)
89#define TAP_CLONEDEV	"/dev/tap"
90#define AF_LINK	18	//defined in winsocks.h
91#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
92#include <net/if_dl.h>
93
94/*
95 * Convert an ASCII representation of an ethernet address to
96 * binary form.
97 */
98struct ether_addr *
99ether_aton(const char *a)
100{
101	int i;
102	static struct ether_addr o;
103	unsigned int o0, o1, o2, o3, o4, o5;
104
105	i = sscanf(a, "%x:%x:%x:%x:%x:%x", &o0, &o1, &o2, &o3, &o4, &o5);
106
107	if (i != 6)
108		return (NULL);
109
110	o.octet[0]=o0;
111	o.octet[1]=o1;
112	o.octet[2]=o2;
113	o.octet[3]=o3;
114	o.octet[4]=o4;
115	o.octet[5]=o5;
116
117	return ((struct ether_addr *)&o);
118}
119
120/*
121 * Convert a binary representation of an ethernet address to
122 * an ASCII string.
123 */
124char *
125ether_ntoa(const struct ether_addr *n)
126{
127	int i;
128	static char a[18];
129
130	i = sprintf(a, "%02x:%02x:%02x:%02x:%02x:%02x",
131	    n->octet[0], n->octet[1], n->octet[2],
132	    n->octet[3], n->octet[4], n->octet[5]);
133	return (i < 17 ? NULL : (char *)&a);
134}
135#endif /* _WIN32 */
136
137#ifdef linux
138
139#define cpuset_t        cpu_set_t
140
141#define ifr_flagshigh  ifr_flags        /* only the low 16 bits here */
142#define IFF_PPROMISC   IFF_PROMISC      /* IFF_PPROMISC does not exist */
143#include <linux/ethtool.h>
144#include <linux/sockios.h>
145
146#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
147#include <netinet/ether.h>      /* ether_aton */
148#include <linux/if_packet.h>    /* sockaddr_ll */
149#endif  /* linux */
150
151#ifdef __FreeBSD__
152#include <sys/endian.h> /* le64toh */
153#include <machine/param.h>
154
155#include <pthread_np.h> /* pthread w/ affinity */
156#include <sys/cpuset.h> /* cpu_set */
157#include <net/if_dl.h>  /* LLADDR */
158#endif  /* __FreeBSD__ */
159
160#ifdef __APPLE__
161
162#define cpuset_t        uint64_t        // XXX
163static inline void CPU_ZERO(cpuset_t *p)
164{
165	*p = 0;
166}
167
168static inline void CPU_SET(uint32_t i, cpuset_t *p)
169{
	*p |= (uint64_t)1 << (i & 0x3f);
171}
172
173#define pthread_setaffinity_np(a, b, c) ((void)a, 0)
174
175#define ifr_flagshigh  ifr_flags        // XXX
176#define IFF_PPROMISC   IFF_PROMISC
177#include <net/if_dl.h>  /* LLADDR */
178#define clock_gettime(a,b)      \
179	do {struct timespec t0 = {0,0}; *(b) = t0; } while (0)
180#endif  /* __APPLE__ */
181
182const char *default_payload="netmap pkt-gen DIRECT payload\n"
183	"http://info.iet.unipi.it/~luigi/netmap/ ";
184
185const char *indirect_payload="netmap pkt-gen indirect payload\n"
186	"http://info.iet.unipi.it/~luigi/netmap/ ";
187
188int verbose = 0;
189int normalize = 1;
190
191#define VIRT_HDR_1	10	/* length of a base vnet-hdr */
#define VIRT_HDR_2	12	/* length of the extended vnet-hdr */
193#define VIRT_HDR_MAX	VIRT_HDR_2
194struct virt_header {
195	uint8_t fields[VIRT_HDR_MAX];
196};
197
198#define MAX_BODYSIZE	65536
199
200struct pkt {
201	struct virt_header vh;
202	struct ether_header eh;
203	union {
204		struct {
205			struct ip ip;
206			struct udphdr udp;
207			uint8_t body[MAX_BODYSIZE];	/* hardwired */
208		} ipv4;
209		struct {
210			struct ip6_hdr ip;
211			struct udphdr udp;
212			uint8_t body[MAX_BODYSIZE];	/* hardwired */
213		} ipv6;
214	};
215} __attribute__((__packed__));
216
217#define	PKT(p, f, af)	\
218    ((af) == AF_INET ? (p)->ipv4.f: (p)->ipv6.f)
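/*
 * e.g. PKT(pkt, body, AF_INET) selects pkt->ipv4.body while
 * PKT(pkt, body, AF_INET6) selects pkt->ipv6.body, so callers can
 * address the field that matches the configured address family
 * without duplicating code for the two layouts.
 */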
219
220struct ip_range {
221	char *name;
222	union {
223		struct {
224			uint32_t start, end; /* same as struct in_addr */
225		} ipv4;
226		struct {
227			struct in6_addr start, end;
228			uint8_t sgroup, egroup;
229		} ipv6;
230	};
231	uint16_t port0, port1;
232};
233
234struct mac_range {
235	char *name;
236	struct ether_addr start, end;
237};
238
239/* ifname can be netmap:foo-xxxx */
240#define MAX_IFNAMELEN	64	/* our buffer for ifname */
241#define MAX_PKTSIZE	MAX_BODYSIZE	/* XXX: + IP_HDR + ETH_HDR */
242
/* compact timestamp to fit into a 60-byte packet (enough to obtain the RTT) */
244struct tstamp {
245	uint32_t sec;
246	uint32_t nsec;
247};
248
249/*
250 * global arguments for all threads
251 */
252
253struct glob_arg {
254	int af;		/* address family AF_INET/AF_INET6 */
255	struct ip_range src_ip;
256	struct ip_range dst_ip;
257	struct mac_range dst_mac;
258	struct mac_range src_mac;
259	int pkt_size;
260	int pkt_min_size;
261	int burst;
262	int forever;
263	uint64_t npackets;	/* total packets to send */
264	int frags;		/* fragments per packet */
265	u_int frag_size;	/* size of each fragment */
266	int nthreads;
267	int cpus;	/* cpus used for running */
268	int system_cpus;	/* cpus on the system */
269
270	int options;	/* testing */
271#define OPT_PREFETCH	1
272#define OPT_ACCESS	2
273#define OPT_COPY	4
274#define OPT_MEMCPY	8
275#define OPT_TS		16	/* add a timestamp */
276#define OPT_INDIRECT	32	/* use indirect buffers, tx only */
277#define OPT_DUMP	64	/* dump rx/tx traffic */
#define OPT_RUBBISH	256	/* send whatever the buffers contain */
279#define OPT_RANDOM_SRC  512
280#define OPT_RANDOM_DST  1024
281#define OPT_PPS_STATS   2048
282	int dev_type;
283#ifndef NO_PCAP
284	pcap_t *p;
285#endif
286
287	int tx_rate;
288	struct timespec tx_period;
289
290	int affinity;
291	int main_fd;
292	struct nm_desc *nmd;
293	int report_interval;		/* milliseconds between prints */
294	void *(*td_body)(void *);
295	int td_type;
296	void *mmap_addr;
297	char ifname[MAX_IFNAMELEN];
298	char *nmr_config;
299	int dummy_send;
300	int virt_header;	/* send also the virt_header */
301	char *packet_file;	/* -P option */
302#define	STATS_WIN	15
303	int win_idx;
304	int64_t win[STATS_WIN];
305	int wait_link;
306	int framing;		/* #bits of framing (for bw output) */
307};
308enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP };
309
310enum {
311	TD_TYPE_SENDER = 1,
312	TD_TYPE_RECEIVER,
313	TD_TYPE_OTHER,
314};
315
316/*
317 * Arguments for a new thread. The same structure is used by
318 * the source and the sink
319 */
320struct targ {
321	struct glob_arg *g;
322	int used;
323	int completed;
324	int cancel;
325	int fd;
326	struct nm_desc *nmd;
327	/* these ought to be volatile, but they are
328	 * only sampled and errors should not accumulate
329	 */
330	struct my_ctrs ctr;
331
332	struct timespec tic, toc;
333	int me;
334	pthread_t thread;
335	int affinity;
336
337	struct pkt pkt;
338	void *frame;
339	uint16_t seed[3];
340	u_int frags;
341	u_int frag_size;
342};
343
344static __inline uint16_t
345cksum_add(uint16_t sum, uint16_t a)
346{
347	uint16_t res;
348
349	res = sum + a;
350	return (res + (res < a));
351}
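/*
 * One's complement addition with end-around carry: if the 16-bit
 * sum wraps, the carry is folded back in (e.g. cksum_add(0xffff,
 * 0x0002) yields 0x0002). This is the building block for the
 * incremental checksum updates in update_ip()/update_ip6() below.
 */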
352
353static void
354extract_ipv4_addr(char *name, uint32_t *addr, uint16_t *port)
355{
356	struct in_addr a;
357	char *pp;
358
359	pp = strchr(name, ':');
360	if (pp != NULL) {	/* do we have ports ? */
361		*pp++ = '\0';
362		*port = (uint16_t)strtol(pp, NULL, 0);
363	}
364
365	inet_pton(AF_INET, name, &a);
366	*addr = ntohl(a.s_addr);
367}
368
369static void
370extract_ipv6_addr(char *name, struct in6_addr *addr, uint16_t *port,
371    uint8_t *group)
372{
373	char *pp;
374
375	/*
	 * We accept IPv6 addresses in the following forms:
377	 *  group@[2001:DB8::1001]:port	(w/ brackets and port)
378	 *  group@[2001:DB8::1]		(w/ brackets and w/o port)
379	 *  group@2001:DB8::1234	(w/o brackets and w/o port)
380	 */
381	pp = strchr(name, '@');
382	if (pp != NULL) {
383		*pp++ = '\0';
384		*group = (uint8_t)strtol(name, NULL, 0);
385		if (*group > 7)
386			*group = 7;
387		name = pp;
388	}
389	if (name[0] == '[')
390		name++;
391	pp = strchr(name, ']');
392	if (pp != NULL)
393		*pp++ = '\0';
394	if (pp != NULL && *pp != ':')
395		pp = NULL;
396	if (pp != NULL) {	/* do we have ports ? */
397		*pp++ = '\0';
398		*port = (uint16_t)strtol(pp, NULL, 0);
399	}
400	inet_pton(AF_INET6, name, addr);
401}
/*
 * extract the extremes from a range of IPv4 or IPv6 addresses:
 * addr_lo[-addr_hi][:port_lo[-port_hi]]
 */
406static int
407extract_ip_range(struct ip_range *r, int af)
408{
409	char *name, *ap, start[INET6_ADDRSTRLEN];
410	char end[INET6_ADDRSTRLEN];
411	struct in_addr a;
412	uint32_t tmp;
413
414	if (verbose)
415		D("extract IP range from %s", r->name);
416
417	name = strdup(r->name);
418	if (name == NULL) {
419		D("strdup failed");
420		usage(-1);
421	}
422	/* the first - splits start/end of range */
423	ap = strchr(name, '-');
424	if (ap != NULL)
425		*ap++ = '\0';
426	r->port0 = 1234;	/* default port */
427	if (af == AF_INET6) {
428		r->ipv6.sgroup = 7; /* default group */
429		extract_ipv6_addr(name, &r->ipv6.start, &r->port0,
430		    &r->ipv6.sgroup);
431	} else
432		extract_ipv4_addr(name, &r->ipv4.start, &r->port0);
433
434	r->port1 = r->port0;
435	if (af == AF_INET6) {
436		if (ap != NULL) {
437			r->ipv6.egroup = r->ipv6.sgroup;
438			extract_ipv6_addr(ap, &r->ipv6.end, &r->port1,
439			    &r->ipv6.egroup);
440		} else {
441			r->ipv6.end = r->ipv6.start;
442			r->ipv6.egroup = r->ipv6.sgroup;
443		}
444	} else {
445		if (ap != NULL) {
446			extract_ipv4_addr(ap, &r->ipv4.end, &r->port1);
447			if (r->ipv4.start > r->ipv4.end) {
448				tmp = r->ipv4.end;
449				r->ipv4.end = r->ipv4.start;
450				r->ipv4.start = tmp;
451			}
452		} else
453			r->ipv4.end = r->ipv4.start;
454	}
455
456	if (r->port0 > r->port1) {
457		tmp = r->port0;
458		r->port0 = r->port1;
459		r->port1 = tmp;
460	}
461	if (af == AF_INET) {
462		a.s_addr = htonl(r->ipv4.start);
463		inet_ntop(af, &a, start, sizeof(start));
464		a.s_addr = htonl(r->ipv4.end);
465		inet_ntop(af, &a, end, sizeof(end));
466	} else {
467		inet_ntop(af, &r->ipv6.start, start, sizeof(start));
468		inet_ntop(af, &r->ipv6.end, end, sizeof(end));
469	}
470	if (af == AF_INET)
471		D("range is %s:%d to %s:%d", start, r->port0, end, r->port1);
472	else
473		D("range is %d@[%s]:%d to %d@[%s]:%d", r->ipv6.sgroup,
474		    start, r->port0, r->ipv6.egroup, end, r->port1);
475
476	free(name);
477	if (r->port0 != r->port1 ||
478	    (af == AF_INET && r->ipv4.start != r->ipv4.end) ||
479	    (af == AF_INET6 &&
480		!IN6_ARE_ADDR_EQUAL(&r->ipv6.start, &r->ipv6.end)))
481		return (OPT_COPY);
482	return (0);
483}
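/*
 * Example: the range string "10.0.0.1:2000-10.0.0.10:2010" yields
 * start 10.0.0.1, end 10.0.0.10 and ports 2000..2010. Since more
 * than one address/port is covered, OPT_COPY is returned so that
 * the caller can force senders to keep copying (and updating) the
 * packet for every transmitted slot.
 */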
484
485static int
486extract_mac_range(struct mac_range *r)
487{
488	struct ether_addr *e;
489	if (verbose)
490	    D("extract MAC range from %s", r->name);
491
492	e = ether_aton(r->name);
493	if (e == NULL) {
494		D("invalid MAC address '%s'", r->name);
495		return 1;
496	}
497	bcopy(e, &r->start, 6);
498	bcopy(e, &r->end, 6);
499#if 0
500	bcopy(targ->src_mac, eh->ether_shost, 6);
501	p = index(targ->g->src_mac, '-');
502	if (p)
503		targ->src_mac_range = atoi(p+1);
504
505	bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6);
506	bcopy(targ->dst_mac, eh->ether_dhost, 6);
507	p = index(targ->g->dst_mac, '-');
508	if (p)
509		targ->dst_mac_range = atoi(p+1);
510#endif
511	if (verbose)
512		D("%s starts at %s", r->name, ether_ntoa(&r->start));
513	return 0;
514}
515
516static int
517get_if_mtu(const struct glob_arg *g)
518{
519	char ifname[IFNAMSIZ];
520	struct ifreq ifreq;
521	int s, ret;
522
523	if (!strncmp(g->ifname, "netmap:", 7) && !strchr(g->ifname, '{')
524			&& !strchr(g->ifname, '}')) {
525		/* Parse the interface name and ask the kernel for the
526		 * MTU value. */
527		strncpy(ifname, g->ifname+7, IFNAMSIZ-1);
528		ifname[strcspn(ifname, "-*^{}/@")] = '\0';
529
530		s = socket(AF_INET, SOCK_DGRAM, 0);
531		if (s < 0) {
532			D("socket() failed: %s", strerror(errno));
533			return s;
534		}
535
536		memset(&ifreq, 0, sizeof(ifreq));
537		strncpy(ifreq.ifr_name, ifname, IFNAMSIZ);
538
		ret = ioctl(s, SIOCGIFMTU, &ifreq);
		if (ret) {
			D("ioctl(SIOCGIFMTU) failed: %s", strerror(errno));
		}
		close(s);

		return ifreq.ifr_mtu;
545	}
546
547	/* This is a pipe or a VALE port, where the MTU is very large,
548	 * so we use some practical limit. */
549	return 65536;
550}
551
552static struct targ *targs;
553static int global_nthreads;
554
555/* control-C handler */
556static void
557sigint_h(int sig)
558{
559	int i;
560
561	(void)sig;	/* UNUSED */
562	D("received control-C on thread %p", (void *)pthread_self());
563	for (i = 0; i < global_nthreads; i++) {
564		targs[i].cancel = 1;
565	}
566}
567
568/* sysctl wrapper to return the number of active CPUs */
569static int
570system_ncpus(void)
571{
572	int ncpus;
573#if defined (__FreeBSD__)
574	int mib[2] = { CTL_HW, HW_NCPU };
	size_t len = sizeof(ncpus);
576	sysctl(mib, 2, &ncpus, &len, NULL, 0);
577#elif defined(linux)
578	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
579#elif defined(_WIN32)
580	{
581		SYSTEM_INFO sysinfo;
582		GetSystemInfo(&sysinfo);
583		ncpus = sysinfo.dwNumberOfProcessors;
584	}
585#else /* others */
586	ncpus = 1;
587#endif /* others */
588	return (ncpus);
589}
590
591#ifdef __linux__
592#define sockaddr_dl    sockaddr_ll
593#define sdl_family     sll_family
594#define AF_LINK        AF_PACKET
#define LLADDR(s)      s->sll_addr
596#include <linux/if_tun.h>
597#define TAP_CLONEDEV	"/dev/net/tun"
598#endif /* __linux__ */
599
600#ifdef __FreeBSD__
601#include <net/if_tun.h>
602#define TAP_CLONEDEV	"/dev/tap"
603#endif /* __FreeBSD */
604
605#ifdef __APPLE__
606// #warning TAP not supported on apple ?
607#include <net/if_utun.h>
608#define TAP_CLONEDEV	"/dev/tap"
609#endif /* __APPLE__ */
610
611
612/*
613 * parse the vale configuration in conf and put it in nmr.
 * Return NM_OPEN_RING_CFG if any value was set, 0 otherwise.
615 * The configuration may consist of 1 to 4 numbers separated
616 * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings.
617 * Missing numbers or zeroes stand for default values.
618 * As an additional convenience, if exactly one number
619 * is specified, then this is assigned to both #tx-slots and #rx-slots.
620 * If there is no 4th number, then the 3rd is assigned to both #tx-rings
621 * and #rx-rings.
622 */
623int
624parse_nmr_config(const char* conf, struct nmreq *nmr)
625{
626	char *w, *tok;
627	int i, v;
628
629	if (conf == NULL || ! *conf)
630		return 0;
631	nmr->nr_tx_rings = nmr->nr_rx_rings = 0;
632	nmr->nr_tx_slots = nmr->nr_rx_slots = 0;
633	w = strdup(conf);
634	for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) {
635		v = atoi(tok);
636		switch (i) {
637		case 0:
638			nmr->nr_tx_slots = nmr->nr_rx_slots = v;
639			break;
640		case 1:
641			nmr->nr_rx_slots = v;
642			break;
643		case 2:
644			nmr->nr_tx_rings = nmr->nr_rx_rings = v;
645			break;
646		case 3:
647			nmr->nr_rx_rings = v;
648			break;
649		default:
650			D("ignored config: %s", tok);
651			break;
652		}
653	}
654	D("txr %d txd %d rxr %d rxd %d",
655			nmr->nr_tx_rings, nmr->nr_tx_slots,
656			nmr->nr_rx_rings, nmr->nr_rx_slots);
657	free(w);
658	return (nmr->nr_tx_rings || nmr->nr_tx_slots ||
659		nmr->nr_rx_rings || nmr->nr_rx_slots) ?
660		NM_OPEN_RING_CFG : 0;
661}
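/*
 * Examples: "4096" sets 4096 slots on both tx and rx rings;
 * "4096,1024" sets 4096 tx slots and 1024 rx slots; "0,0,2" keeps
 * the default slot counts and asks for 2 tx and 2 rx rings.
 * Any non-zero value makes the function return NM_OPEN_RING_CFG.
 */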
662
663
/*
 * locate the src mac address for our interface, put it
 * into the user-supplied buffer. return 0 if ok, 1 if the
 * interface is not found, -1 if getifaddrs() fails.
 */
668static int
669source_hwaddr(const char *ifname, char *buf)
670{
671	struct ifaddrs *ifaphead, *ifap;
672
673	if (getifaddrs(&ifaphead) != 0) {
674		D("getifaddrs %s failed", ifname);
675		return (-1);
676	}
677
678	for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) {
679		struct sockaddr_dl *sdl =
680			(struct sockaddr_dl *)ifap->ifa_addr;
681		uint8_t *mac;
682
683		if (!sdl || sdl->sdl_family != AF_LINK)
684			continue;
685		if (strncmp(ifap->ifa_name, ifname, IFNAMSIZ) != 0)
686			continue;
687		mac = (uint8_t *)LLADDR(sdl);
688		sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
689			mac[0], mac[1], mac[2],
690			mac[3], mac[4], mac[5]);
691		if (verbose)
692			D("source hwaddr %s", buf);
693		break;
694	}
695	freeifaddrs(ifaphead);
696	return ifap ? 0 : 1;
697}
698
699
700/* set the thread affinity. */
701static int
702setaffinity(pthread_t me, int i)
703{
704	cpuset_t cpumask;
705
706	if (i == -1)
707		return 0;
708
	/* Set thread affinity. */
710	CPU_ZERO(&cpumask);
711	CPU_SET(i, &cpumask);
712
713	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
714		D("Unable to set affinity: %s", strerror(errno));
715		return 1;
716	}
717	return 0;
718}
719
720
721/* Compute the checksum of the given ip header. */
722static uint32_t
723checksum(const void *data, uint16_t len, uint32_t sum)
724{
725	const uint8_t *addr = data;
726	uint32_t i;
727
728	/* Checksum all the pairs of bytes first... */
729	for (i = 0; i < (len & ~1U); i += 2) {
730		sum += (u_int16_t)ntohs(*((u_int16_t *)(addr + i)));
731		if (sum > 0xFFFF)
732			sum -= 0xFFFF;
733	}
734	/*
735	 * If there's a single byte left over, checksum it, too.
736	 * Network byte order is big-endian, so the remaining byte is
737	 * the high byte.
738	 */
739	if (i < len) {
740		sum += addr[i] << 8;
741		if (sum > 0xFFFF)
742			sum -= 0xFFFF;
743	}
744	return sum;
745}
746
747static uint16_t
748wrapsum(uint32_t sum)
749{
750	sum = ~sum & 0xFFFF;
751	return (htons(sum));
752}
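/*
 * checksum() accumulates a folded 16-bit one's complement sum over
 * a buffer and wrapsum() complements it, returning it in network
 * byte order, so a plain IP header checksum is computed as
 *	ip.ip_sum = wrapsum(checksum(&ip, sizeof(ip), 0));
 * as done in initialize_packet() below.
 */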
753
/* Dump the payload of a packet for debugging: print the slot
 * metadata (buffer index, flags, length) followed by a hex/ASCII
 * dump of the contents.
 */
757static void
758dump_payload(const char *_p, int len, struct netmap_ring *ring, int cur)
759{
760	char buf[128];
761	int i, j, i0;
762	const unsigned char *p = (const unsigned char *)_p;
763
	/* print ring and slot information for this buffer */
765
766	printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n",
767		ring, cur, ring->slot[cur].buf_idx,
768		ring->slot[cur].flags, len);
769	/* hexdump routine */
770	for (i = 0; i < len; ) {
771		memset(buf, ' ', sizeof(buf));
772		sprintf(buf, "%5d: ", i);
773		i0 = i;
774		for (j=0; j < 16 && i < len; i++, j++)
775			sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i]));
776		i = i0;
777		for (j=0; j < 16 && i < len; i++, j++)
778			sprintf(buf+7+j + 48, "%c",
779				isprint(p[i]) ? p[i] : '.');
780		printf("%s\n", buf);
781	}
782}
783
784/*
785 * Fill a packet with some payload.
786 * We create a UDP packet so the payload starts at
787 *	14+20+8 = 42 bytes.
788 */
789#ifdef __linux__
790#define uh_sport source
791#define uh_dport dest
792#define uh_ulen len
793#define uh_sum check
794#endif /* linux */
795
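/*
 * update_ip()/update_ip6() step through the configured source and
 * destination port/address ranges, one increment per packet, and
 * patch the IP and UDP checksums incrementally (in the spirit of
 * RFC 1624): for every 16-bit field that changes from 'o' to 'n'
 * they fold ~o and n into a delta with cksum_add(), then apply the
 * delta to the stored checksum instead of recomputing it over the
 * whole packet.
 */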
796static void
797update_ip(struct pkt *pkt, struct targ *t)
798{
799	struct glob_arg *g = t->g;
800	struct ip ip;
801	struct udphdr udp;
802	uint32_t oaddr, naddr;
803	uint16_t oport, nport;
804	uint16_t ip_sum, udp_sum;
805
806	memcpy(&ip, &pkt->ipv4.ip, sizeof(ip));
807	memcpy(&udp, &pkt->ipv4.udp, sizeof(udp));
808	do {
809		ip_sum = udp_sum = 0;
810		naddr = oaddr = ntohl(ip.ip_src.s_addr);
811		nport = oport = ntohs(udp.uh_sport);
812		if (g->options & OPT_RANDOM_SRC) {
813			ip.ip_src.s_addr = nrand48(t->seed);
814			udp.uh_sport = nrand48(t->seed);
815			naddr = ntohl(ip.ip_src.s_addr);
816			nport = ntohs(udp.uh_sport);
817			break;
818		}
819		if (oport < g->src_ip.port1) {
820			nport = oport + 1;
821			udp.uh_sport = htons(nport);
822			break;
823		}
824		nport = g->src_ip.port0;
825		udp.uh_sport = htons(nport);
826		if (oaddr < g->src_ip.ipv4.end) {
827			naddr = oaddr + 1;
828			ip.ip_src.s_addr = htonl(naddr);
829			break;
830		}
831		naddr = g->src_ip.ipv4.start;
832		ip.ip_src.s_addr = htonl(naddr);
833	} while (0);
834	/* update checksums if needed */
835	if (oaddr != naddr) {
836		ip_sum = cksum_add(ip_sum, ~oaddr >> 16);
837		ip_sum = cksum_add(ip_sum, ~oaddr & 0xffff);
838		ip_sum = cksum_add(ip_sum, naddr >> 16);
839		ip_sum = cksum_add(ip_sum, naddr & 0xffff);
840	}
841	if (oport != nport) {
842		udp_sum = cksum_add(udp_sum, ~oport);
843		udp_sum = cksum_add(udp_sum, nport);
844	}
845	do {
846		naddr = oaddr = ntohl(ip.ip_dst.s_addr);
847		nport = oport = ntohs(udp.uh_dport);
848		if (g->options & OPT_RANDOM_DST) {
849			ip.ip_dst.s_addr = nrand48(t->seed);
850			udp.uh_dport = nrand48(t->seed);
851			naddr = ntohl(ip.ip_dst.s_addr);
852			nport = ntohs(udp.uh_dport);
853			break;
854		}
855		if (oport < g->dst_ip.port1) {
856			nport = oport + 1;
857			udp.uh_dport = htons(nport);
858			break;
859		}
860		nport = g->dst_ip.port0;
861		udp.uh_dport = htons(nport);
862		if (oaddr < g->dst_ip.ipv4.end) {
863			naddr = oaddr + 1;
864			ip.ip_dst.s_addr = htonl(naddr);
865			break;
866		}
867		naddr = g->dst_ip.ipv4.start;
868		ip.ip_dst.s_addr = htonl(naddr);
869	} while (0);
870	/* update checksums */
871	if (oaddr != naddr) {
872		ip_sum = cksum_add(ip_sum, ~oaddr >> 16);
873		ip_sum = cksum_add(ip_sum, ~oaddr & 0xffff);
874		ip_sum = cksum_add(ip_sum, naddr >> 16);
875		ip_sum = cksum_add(ip_sum, naddr & 0xffff);
876	}
877	if (oport != nport) {
878		udp_sum = cksum_add(udp_sum, ~oport);
879		udp_sum = cksum_add(udp_sum, nport);
880	}
881	if (udp_sum != 0)
882		udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(udp_sum));
883	if (ip_sum != 0) {
884		ip.ip_sum = ~cksum_add(~ip.ip_sum, htons(ip_sum));
885		udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(ip_sum));
886	}
887	memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
888	memcpy(&pkt->ipv4.udp, &udp, sizeof(udp));
889}
890
891#ifndef s6_addr16
892#define	s6_addr16	__u6_addr.__u6_addr16
893#endif
894static void
895update_ip6(struct pkt *pkt, struct targ *t)
896{
897	struct glob_arg *g = t->g;
898	struct ip6_hdr ip6;
899	struct udphdr udp;
900	uint16_t udp_sum;
901	uint16_t oaddr, naddr;
902	uint16_t oport, nport;
903	uint8_t group;
904
905	memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6));
906	memcpy(&udp, &pkt->ipv6.udp, sizeof(udp));
907	do {
908		udp_sum = 0;
909		group = g->src_ip.ipv6.sgroup;
910		naddr = oaddr = ntohs(ip6.ip6_src.s6_addr16[group]);
911		nport = oport = ntohs(udp.uh_sport);
912		if (g->options & OPT_RANDOM_SRC) {
913			ip6.ip6_src.s6_addr16[group] = nrand48(t->seed);
914			udp.uh_sport = nrand48(t->seed);
915			naddr = ntohs(ip6.ip6_src.s6_addr16[group]);
916			nport = ntohs(udp.uh_sport);
917			break;
918		}
919		if (oport < g->src_ip.port1) {
920			nport = oport + 1;
921			udp.uh_sport = htons(nport);
922			break;
923		}
924		nport = g->src_ip.port0;
925		udp.uh_sport = htons(nport);
926		if (oaddr < ntohs(g->src_ip.ipv6.end.s6_addr16[group])) {
927			naddr = oaddr + 1;
928			ip6.ip6_src.s6_addr16[group] = htons(naddr);
929			break;
930		}
931		naddr = ntohs(g->src_ip.ipv6.start.s6_addr16[group]);
932		ip6.ip6_src.s6_addr16[group] = htons(naddr);
933	} while (0);
934	/* update checksums if needed */
935	if (oaddr != naddr)
936		udp_sum = cksum_add(~oaddr, naddr);
937	if (oport != nport)
938		udp_sum = cksum_add(udp_sum,
939		    cksum_add(~oport, nport));
940	do {
941		group = g->dst_ip.ipv6.egroup;
942		naddr = oaddr = ntohs(ip6.ip6_dst.s6_addr16[group]);
943		nport = oport = ntohs(udp.uh_dport);
944		if (g->options & OPT_RANDOM_DST) {
945			ip6.ip6_dst.s6_addr16[group] = nrand48(t->seed);
946			udp.uh_dport = nrand48(t->seed);
947			naddr = ntohs(ip6.ip6_dst.s6_addr16[group]);
948			nport = ntohs(udp.uh_dport);
949			break;
950		}
951		if (oport < g->dst_ip.port1) {
952			nport = oport + 1;
953			udp.uh_dport = htons(nport);
954			break;
955		}
956		nport = g->dst_ip.port0;
957		udp.uh_dport = htons(nport);
958		if (oaddr < ntohs(g->dst_ip.ipv6.end.s6_addr16[group])) {
959			naddr = oaddr + 1;
960			ip6.ip6_dst.s6_addr16[group] = htons(naddr);
961			break;
962		}
963		naddr = ntohs(g->dst_ip.ipv6.start.s6_addr16[group]);
964		ip6.ip6_dst.s6_addr16[group] = htons(naddr);
965	} while (0);
966	/* update checksums */
967	if (oaddr != naddr)
968		udp_sum = cksum_add(udp_sum,
969		    cksum_add(~oaddr, naddr));
970	if (oport != nport)
971		udp_sum = cksum_add(udp_sum,
972		    cksum_add(~oport, nport));
973	if (udp_sum != 0)
974		udp.uh_sum = ~cksum_add(~udp.uh_sum, udp_sum);
975	memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6));
976	memcpy(&pkt->ipv6.udp, &udp, sizeof(udp));
977}
978
979static void
980update_addresses(struct pkt *pkt, struct targ *t)
981{
982
983	if (t->g->af == AF_INET)
984		update_ip(pkt, t);
985	else
986		update_ip6(pkt, t);
987}
/*
 * initialize one packet and prepare for the next one.
 * The payload copy could be done once instead of being repeated
 * for every packet.
 */
992static void
993initialize_packet(struct targ *targ)
994{
995	struct pkt *pkt = &targ->pkt;
996	struct ether_header *eh;
997	struct ip6_hdr ip6;
998	struct ip ip;
999	struct udphdr udp;
1000	void *udp_ptr;
1001	uint16_t paylen;
1002	uint32_t csum = 0;
1003	const char *payload = targ->g->options & OPT_INDIRECT ?
1004		indirect_payload : default_payload;
1005	int i, l0 = strlen(payload);
1006
1007#ifndef NO_PCAP
1008	char errbuf[PCAP_ERRBUF_SIZE];
1009	pcap_t *file;
1010	struct pcap_pkthdr *header;
1011	const unsigned char *packet;
1012
1013	/* Read a packet from a PCAP file if asked. */
1014	if (targ->g->packet_file != NULL) {
1015		if ((file = pcap_open_offline(targ->g->packet_file,
1016			    errbuf)) == NULL)
1017			D("failed to open pcap file %s",
1018			    targ->g->packet_file);
1019		if (pcap_next_ex(file, &header, &packet) < 0)
1020			D("failed to read packet from %s",
1021			    targ->g->packet_file);
1022		if ((targ->frame = malloc(header->caplen)) == NULL)
1023			D("out of memory");
1024		bcopy(packet, (unsigned char *)targ->frame, header->caplen);
1025		targ->g->pkt_size = header->caplen;
1026		pcap_close(file);
1027		return;
1028	}
1029#endif
1030
1031	paylen = targ->g->pkt_size - sizeof(*eh) -
1032	    (targ->g->af == AF_INET ? sizeof(ip): sizeof(ip6));
1033
1034	/* create a nice NUL-terminated string */
1035	for (i = 0; i < paylen; i += l0) {
1036		if (l0 > paylen - i)
1037			l0 = paylen - i; // last round
1038		bcopy(payload, PKT(pkt, body, targ->g->af) + i, l0);
1039	}
1040	PKT(pkt, body, targ->g->af)[i - 1] = '\0';
1041
1042	/* prepare the headers */
1043	eh = &pkt->eh;
1044	bcopy(&targ->g->src_mac.start, eh->ether_shost, 6);
1045	bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6);
1046
1047	if (targ->g->af == AF_INET) {
1048		eh->ether_type = htons(ETHERTYPE_IP);
1049		memcpy(&ip, &pkt->ipv4.ip, sizeof(ip));
1050		udp_ptr = &pkt->ipv4.udp;
1051		ip.ip_v = IPVERSION;
1052		ip.ip_hl = sizeof(ip) >> 2;
1053		ip.ip_id = 0;
1054		ip.ip_tos = IPTOS_LOWDELAY;
		ip.ip_len = htons(targ->g->pkt_size - sizeof(*eh));
1057		ip.ip_off = htons(IP_DF); /* Don't fragment */
1058		ip.ip_ttl = IPDEFTTL;
1059		ip.ip_p = IPPROTO_UDP;
1060		ip.ip_dst.s_addr = htonl(targ->g->dst_ip.ipv4.start);
1061		ip.ip_src.s_addr = htonl(targ->g->src_ip.ipv4.start);
1062		ip.ip_sum = wrapsum(checksum(&ip, sizeof(ip), 0));
1063		memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
1064	} else {
1065		eh->ether_type = htons(ETHERTYPE_IPV6);
		memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6));
1067		udp_ptr = &pkt->ipv6.udp;
1068		ip6.ip6_flow = 0;
1069		ip6.ip6_plen = htons(paylen);
1070		ip6.ip6_vfc = IPV6_VERSION;
1071		ip6.ip6_nxt = IPPROTO_UDP;
1072		ip6.ip6_hlim = IPV6_DEFHLIM;
1073		ip6.ip6_src = targ->g->src_ip.ipv6.start;
1074		ip6.ip6_dst = targ->g->dst_ip.ipv6.start;
1075	}
1076	memcpy(&udp, udp_ptr, sizeof(udp));
1077
1078	udp.uh_sport = htons(targ->g->src_ip.port0);
1079	udp.uh_dport = htons(targ->g->dst_ip.port0);
1080	udp.uh_ulen = htons(paylen);
1081	if (targ->g->af == AF_INET) {
1082		/* Magic: taken from sbin/dhclient/packet.c */
1083		udp.uh_sum = wrapsum(
1084		    checksum(&udp, sizeof(udp),	/* udp header */
1085		    checksum(pkt->ipv4.body,	/* udp payload */
1086		    paylen - sizeof(udp),
1087		    checksum(&pkt->ipv4.ip.ip_src, /* pseudo header */
1088			2 * sizeof(pkt->ipv4.ip.ip_src),
1089			IPPROTO_UDP + (u_int32_t)ntohs(udp.uh_ulen)))));
1090		memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
1091	} else {
1092		/* Save part of pseudo header checksum into csum */
1093		csum = IPPROTO_UDP << 24;
1094		csum = checksum(&csum, sizeof(csum), paylen);
1095		udp.uh_sum = wrapsum(
1096		    checksum(udp_ptr, sizeof(udp),	/* udp header */
1097		    checksum(pkt->ipv6.body,	/* udp payload */
1098		    paylen - sizeof(udp),
1099		    checksum(&pkt->ipv6.ip.ip6_src, /* pseudo header */
1100			2 * sizeof(pkt->ipv6.ip.ip6_src), csum))));
1101		memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6));
1102	}
1103	memcpy(udp_ptr, &udp, sizeof(udp));
1104
1105	bzero(&pkt->vh, sizeof(pkt->vh));
1106	// dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0);
1107}
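/*
 * Note on the frame layout: senders transmit starting at
 * (char *)&targ->pkt + sizeof(pkt->vh) - g->virt_header, so the
 * virtio-net header prepared above is included only when the port
 * requires one (g->virt_header != 0); otherwise transmission starts
 * directly at the ethernet header.
 */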
1108
1109static void
1110get_vnet_hdr_len(struct glob_arg *g)
1111{
1112	struct nmreq req;
1113	int err;
1114
1115	memset(&req, 0, sizeof(req));
1116	bcopy(g->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name));
1117	req.nr_version = NETMAP_API;
1118	req.nr_cmd = NETMAP_VNET_HDR_GET;
1119	err = ioctl(g->main_fd, NIOCREGIF, &req);
1120	if (err) {
1121		D("Unable to get virtio-net header length");
1122		return;
1123	}
1124
1125	g->virt_header = req.nr_arg1;
1126	if (g->virt_header) {
1127		D("Port requires virtio-net header, length = %d",
1128		  g->virt_header);
1129	}
1130}
1131
1132static void
1133set_vnet_hdr_len(struct glob_arg *g)
1134{
1135	int err, l = g->virt_header;
1136	struct nmreq req;
1137
1138	if (l == 0)
1139		return;
1140
1141	memset(&req, 0, sizeof(req));
1142	bcopy(g->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name));
1143	req.nr_version = NETMAP_API;
1144	req.nr_cmd = NETMAP_BDG_VNET_HDR;
1145	req.nr_arg1 = l;
1146	err = ioctl(g->main_fd, NIOCREGIF, &req);
1147	if (err) {
1148		D("Unable to set virtio-net header length %d", l);
1149	}
1150}
1151
1152/*
1153 * create and enqueue a batch of packets on a ring.
1154 * On the last one set NS_REPORT to tell the driver to generate
1155 * an interrupt when done.
1156 */
1157static int
1158send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
1159		int size, struct targ *t, u_int count, int options)
1160{
1161	u_int n, sent, head = ring->head;
1162	u_int frags = t->frags;
1163	u_int frag_size = t->frag_size;
1164	struct netmap_slot *slot = &ring->slot[head];
1165
1166	n = nm_ring_space(ring);
1167#if 0
1168	if (options & (OPT_COPY | OPT_PREFETCH) ) {
1169		for (sent = 0; sent < count; sent++) {
1170			struct netmap_slot *slot = &ring->slot[head];
1171			char *p = NETMAP_BUF(ring, slot->buf_idx);
1172
1173			__builtin_prefetch(p);
1174			head = nm_ring_next(ring, head);
1175		}
1176		head = ring->head;
1177	}
1178#endif
1179	for (sent = 0; sent < count && n >= frags; sent++, n--) {
1180		char *p;
1181		int buf_changed;
1182		u_int tosend = size;
1183
1184		slot = &ring->slot[head];
1185		p = NETMAP_BUF(ring, slot->buf_idx);
1186		buf_changed = slot->flags & NS_BUF_CHANGED;
1187
1188		slot->flags = 0;
1189		if (options & OPT_RUBBISH) {
1190			/* do nothing */
1191		} else if (options & OPT_INDIRECT) {
1192			slot->flags |= NS_INDIRECT;
1193			slot->ptr = (uint64_t)((uintptr_t)frame);
1194		} else if (frags > 1) {
1195			u_int i;
1196			const char *f = frame;
1197			char *fp = p;
1198			for (i = 0; i < frags - 1; i++) {
1199				memcpy(fp, f, frag_size);
1200				slot->len = frag_size;
1201				slot->flags = NS_MOREFRAG;
1202				if (options & OPT_DUMP)
1203					dump_payload(fp, frag_size, ring, head);
1204				tosend -= frag_size;
1205				f += frag_size;
1206				head = nm_ring_next(ring, head);
1207				slot = &ring->slot[head];
1208				fp = NETMAP_BUF(ring, slot->buf_idx);
1209			}
1210			n -= (frags - 1);
1211			p = fp;
1212			slot->flags = 0;
1213			memcpy(p, f, tosend);
1214			update_addresses(pkt, t);
1215		} else if ((options & (OPT_COPY | OPT_MEMCPY)) || buf_changed) {
1216			if (options & OPT_COPY)
1217				nm_pkt_copy(frame, p, size);
1218			else
1219				memcpy(p, frame, size);
1220			update_addresses(pkt, t);
1221		} else if (options & OPT_PREFETCH) {
1222			__builtin_prefetch(p);
1223		}
1224		slot->len = tosend;
1225		if (options & OPT_DUMP)
1226			dump_payload(p, tosend, ring, head);
1227		head = nm_ring_next(ring, head);
1228	}
1229	if (sent) {
1230		slot->flags |= NS_REPORT;
1231		ring->head = ring->cur = head;
1232	}
1233	if (sent < count) {
1234		/* tell netmap that we need more slots */
1235		ring->cur = ring->tail;
1236	}
1237
1238	return (sent);
1239}
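/*
 * send_packets() returns the number of packets actually queued,
 * which may be less than 'count' if the ring runs out of slots;
 * sender_body() adds the return value to the per-thread counters
 * and, when rate limiting, subtracts it from the burst budget.
 */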
1240
1241/*
1242 * Index of the highest bit set
1243 */
1244uint32_t
1245msb64(uint64_t x)
1246{
1247	uint64_t m = 1ULL << 63;
1248	int i;
1249
1250	for (i = 63; i >= 0; i--, m >>=1)
1251		if (m & x)
1252			return i;
1253	return 0;
1254}
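/*
 * msb64() is used by ping_body() to pick a power-of-two histogram
 * bucket for each measured RTT in nanoseconds: e.g. msb64(1500) is
 * 10, so a 1.5 us sample falls in the 1024..2047 ns bin.
 */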
1255
/*
 * wait until ts, busy-waiting for the last millisecond and
 * sleeping (poll with a 1 ms timeout) before that.
 * Return the wakeup time.
 */
1260static struct timespec
1261wait_time(struct timespec ts)
1262{
1263	for (;;) {
1264		struct timespec w, cur;
1265		clock_gettime(CLOCK_REALTIME_PRECISE, &cur);
1266		w = timespec_sub(ts, cur);
1267		if (w.tv_sec < 0)
1268			return cur;
1269		else if (w.tv_sec > 0 || w.tv_nsec > 1000000)
1270			poll(NULL, 0, 1);
1271	}
1272}
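/*
 * Typical rate-limiting pattern, as used by the sender threads:
 * once per burst the deadline is advanced by the transmit period
 * and the thread waits until it is due:
 *
 *	nexttime = timespec_add(nexttime, targ->g->tx_period);
 *	wait_time(nexttime);
 */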
1273
/*
 * Send a packet, and wait for a response.
 * The payload (after the UDP header, at offset 42) carries a
 * sequence number, followed at offset 46 by a struct tstamp
 * (sec/nsec) used to compute the round trip time.
 */
1279
1280static void *
1281ping_body(void *data)
1282{
1283	struct targ *targ = (struct targ *) data;
1284	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1285	struct netmap_if *nifp = targ->nmd->nifp;
1286	int i, m, rx = 0;
1287	void *frame;
1288	int size;
1289	struct timespec ts, now, last_print;
1290	struct timespec nexttime = {0, 0}; /* silence compiler */
1291	uint64_t sent = 0, n = targ->g->npackets;
1292	uint64_t count = 0, t_cur, t_min = ~0, av = 0;
1293	uint64_t g_min = ~0, g_av = 0;
1294	uint64_t buckets[64];	/* bins for delays, ns */
1295	int rate_limit = targ->g->tx_rate, tosend = 0;
1296
1297	frame = (char*)&targ->pkt + sizeof(targ->pkt.vh) - targ->g->virt_header;
1298	size = targ->g->pkt_size + targ->g->virt_header;
1299
1300
1301	if (targ->g->nthreads > 1) {
1302		D("can only ping with 1 thread");
1303		return NULL;
1304	}
1305
1306	bzero(&buckets, sizeof(buckets));
1307	clock_gettime(CLOCK_REALTIME_PRECISE, &last_print);
1308	now = last_print;
1309	if (rate_limit) {
1310		targ->tic = timespec_add(now, (struct timespec){2,0});
1311		targ->tic.tv_nsec = 0;
1312		wait_time(targ->tic);
1313		nexttime = targ->tic;
1314	}
1315	while (!targ->cancel && (n == 0 || sent < n)) {
1316		struct netmap_ring *ring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
1317		struct netmap_slot *slot;
1318		char *p;
1319		int rv;
1320		uint64_t limit, event = 0;
1321
1322		if (rate_limit && tosend <= 0) {
1323			tosend = targ->g->burst;
1324			nexttime = timespec_add(nexttime, targ->g->tx_period);
1325			wait_time(nexttime);
1326		}
1327
1328		limit = rate_limit ? tosend : targ->g->burst;
1329		if (n > 0 && n - sent < limit)
1330			limit = n - sent;
1331		for (m = 0; (unsigned)m < limit; m++) {
1332			slot = &ring->slot[ring->head];
1333			slot->len = size;
1334			p = NETMAP_BUF(ring, slot->buf_idx);
1335
1336			if (nm_ring_empty(ring)) {
1337				D("-- ouch, cannot send");
1338				break;
1339			} else {
1340				struct tstamp *tp;
1341				nm_pkt_copy(frame, p, size);
1342				clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
1343				bcopy(&sent, p+42, sizeof(sent));
1344				tp = (struct tstamp *)(p+46);
1345				tp->sec = (uint32_t)ts.tv_sec;
1346				tp->nsec = (uint32_t)ts.tv_nsec;
1347				sent++;
1348				ring->head = ring->cur = nm_ring_next(ring, ring->head);
1349			}
1350		}
1351		if (m > 0)
1352			event++;
1353		targ->ctr.pkts = sent;
1354		targ->ctr.bytes = sent*size;
1355		targ->ctr.events = event;
1356		if (rate_limit)
1357			tosend -= m;
1358#ifdef BUSYWAIT
1359		rv = ioctl(pfd.fd, NIOCTXSYNC, NULL);
1360		if (rv < 0) {
1361			D("TXSYNC error on queue %d: %s", targ->me,
1362				strerror(errno));
1363		}
1364	again:
1365		ioctl(pfd.fd, NIOCRXSYNC, NULL);
1366#else
1367		/* should use a parameter to decide how often to send */
1368		if ( (rv = poll(&pfd, 1, 3000)) <= 0) {
1369			D("poll error on queue %d: %s", targ->me,
1370				(rv ? strerror(errno) : "timeout"));
1371			continue;
1372		}
1373#endif /* BUSYWAIT */
1374		/* see what we got back */
1375		rx = 0;
1376		for (i = targ->nmd->first_rx_ring;
1377			i <= targ->nmd->last_rx_ring; i++) {
1378			ring = NETMAP_RXRING(nifp, i);
1379			while (!nm_ring_empty(ring)) {
1380				uint32_t seq;
1381				struct tstamp *tp;
1382				int pos;
1383
1384				slot = &ring->slot[ring->head];
1385				p = NETMAP_BUF(ring, slot->buf_idx);
1386
1387				clock_gettime(CLOCK_REALTIME_PRECISE, &now);
1388				bcopy(p+42, &seq, sizeof(seq));
1389				tp = (struct tstamp *)(p+46);
1390				ts.tv_sec = (time_t)tp->sec;
1391				ts.tv_nsec = (long)tp->nsec;
1392				ts.tv_sec = now.tv_sec - ts.tv_sec;
1393				ts.tv_nsec = now.tv_nsec - ts.tv_nsec;
1394				if (ts.tv_nsec < 0) {
1395					ts.tv_nsec += 1000000000;
1396					ts.tv_sec--;
1397				}
1398				if (0) D("seq %d/%llu delta %d.%09d", seq,
1399					(unsigned long long)sent,
1400					(int)ts.tv_sec, (int)ts.tv_nsec);
1401				t_cur = ts.tv_sec * 1000000000UL + ts.tv_nsec;
1402				if (t_cur < t_min)
1403					t_min = t_cur;
1404				count ++;
1405				av += t_cur;
				/* store the sample in a log2 bucket */
				pos = msb64(t_cur);
				buckets[pos]++;
1409				ring->head = ring->cur = nm_ring_next(ring, ring->head);
1410				rx++;
1411			}
1412		}
1413		//D("tx %d rx %d", sent, rx);
1414		//usleep(100000);
1415		ts.tv_sec = now.tv_sec - last_print.tv_sec;
1416		ts.tv_nsec = now.tv_nsec - last_print.tv_nsec;
1417		if (ts.tv_nsec < 0) {
1418			ts.tv_nsec += 1000000000;
1419			ts.tv_sec--;
1420		}
1421		if (ts.tv_sec >= 1) {
			D("count %d RTT: min %d av %d ns",
				(int)count, (int)t_min,
				(int)(count ? av/count : 0));
1424			int k, j, kmin, off;
1425			char buf[512];
1426
1427			for (kmin = 0; kmin < 64; kmin ++)
1428				if (buckets[kmin])
1429					break;
1430			for (k = 63; k >= kmin; k--)
1431				if (buckets[k])
1432					break;
1433			buf[0] = '\0';
1434			off = 0;
1435			for (j = kmin; j <= k; j++) {
1436				off += sprintf(buf + off, " %5d", (int)buckets[j]);
1437			}
1438			D("k: %d .. %d\n\t%s", 1<<kmin, 1<<k, buf);
1439			bzero(&buckets, sizeof(buckets));
1440			count = 0;
1441			g_av += av;
1442			av = 0;
1443			if (t_min < g_min)
1444				g_min = t_min;
1445			t_min = ~0;
1446			last_print = now;
1447		}
1448#ifdef BUSYWAIT
1449		if (rx < m && ts.tv_sec <= 3 && !targ->cancel)
1450			goto again;
1451#endif /* BUSYWAIT */
1452	}
1453
1454	if (sent > 0) {
1455		D("RTT over %llu packets: min %d av %d ns",
1456			(long long unsigned)sent, (int)g_min,
1457			(int)((double)g_av/sent));
1458	}
1459	targ->completed = 1;
1460
1461	/* reset the ``used`` flag. */
1462	targ->used = 0;
1463
1464	return NULL;
1465}
1466
1467
1468/*
1469 * reply to ping requests
1470 */
1471static void *
1472pong_body(void *data)
1473{
1474	struct targ *targ = (struct targ *) data;
1475	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1476	struct netmap_if *nifp = targ->nmd->nifp;
1477	struct netmap_ring *txring, *rxring;
1478	int i, rx = 0;
1479	uint64_t sent = 0, n = targ->g->npackets;
1480
1481	if (targ->g->nthreads > 1) {
1482		D("can only reply ping with 1 thread");
1483		return NULL;
1484	}
1485	if (n > 0)
1486		D("understood ponger %llu but don't know how to do it",
1487			(unsigned long long)n);
1488	while (!targ->cancel && (n == 0 || sent < n)) {
1489		uint32_t txhead, txavail;
1490//#define BUSYWAIT
1491#ifdef BUSYWAIT
1492		ioctl(pfd.fd, NIOCRXSYNC, NULL);
1493#else
1494		int rv;
1495		if ( (rv = poll(&pfd, 1, 1000)) <= 0) {
1496			D("poll error on queue %d: %s", targ->me,
1497				rv ? strerror(errno) : "timeout");
1498			continue;
1499		}
1500#endif
1501		txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
1502		txhead = txring->head;
1503		txavail = nm_ring_space(txring);
1504		/* see what we got back */
1505		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
1506			rxring = NETMAP_RXRING(nifp, i);
1507			while (!nm_ring_empty(rxring)) {
1508				uint16_t *spkt, *dpkt;
1509				uint32_t head = rxring->head;
1510				struct netmap_slot *slot = &rxring->slot[head];
1511				char *src, *dst;
1512				src = NETMAP_BUF(rxring, slot->buf_idx);
1513				//D("got pkt %p of size %d", src, slot->len);
1514				rxring->head = rxring->cur = nm_ring_next(rxring, head);
1515				rx++;
1516				if (txavail == 0)
1517					continue;
1518				dst = NETMAP_BUF(txring,
1519				    txring->slot[txhead].buf_idx);
1520				/* copy... */
1521				dpkt = (uint16_t *)dst;
1522				spkt = (uint16_t *)src;
1523				nm_pkt_copy(src, dst, slot->len);
1524				/* swap source and destination MAC */
1525				dpkt[0] = spkt[3];
1526				dpkt[1] = spkt[4];
1527				dpkt[2] = spkt[5];
1528				dpkt[3] = spkt[0];
1529				dpkt[4] = spkt[1];
1530				dpkt[5] = spkt[2];
1531				txring->slot[txhead].len = slot->len;
1532				txhead = nm_ring_next(txring, txhead);
1533				txavail--;
1534				sent++;
1535			}
1536		}
1537		txring->head = txring->cur = txhead;
1538		targ->ctr.pkts = sent;
1539#ifdef BUSYWAIT
1540		ioctl(pfd.fd, NIOCTXSYNC, NULL);
1541#endif
1542		//D("tx %d rx %d", sent, rx);
1543	}
1544
1545	targ->completed = 1;
1546
1547	/* reset the ``used`` flag. */
1548	targ->used = 0;
1549
1550	return NULL;
1551}
1552
1553
1554static void *
1555sender_body(void *data)
1556{
1557	struct targ *targ = (struct targ *) data;
1558	struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
1559	struct netmap_if *nifp;
1560	struct netmap_ring *txring = NULL;
1561	int i;
1562	uint64_t n = targ->g->npackets / targ->g->nthreads;
1563	uint64_t sent = 0;
1564	uint64_t event = 0;
1565	int options = targ->g->options | OPT_COPY;
1566	struct timespec nexttime = { 0, 0}; // XXX silence compiler
1567	int rate_limit = targ->g->tx_rate;
1568	struct pkt *pkt = &targ->pkt;
1569	void *frame;
1570	int size;
1571
1572	if (targ->frame == NULL) {
1573		frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header;
1574		size = targ->g->pkt_size + targ->g->virt_header;
1575	} else {
1576		frame = targ->frame;
1577		size = targ->g->pkt_size;
1578	}
1579
1580	D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
1581	if (setaffinity(targ->thread, targ->affinity))
1582		goto quit;
1583
1584	/* main loop.*/
1585	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1586	if (rate_limit) {
1587		targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
1588		targ->tic.tv_nsec = 0;
1589		wait_time(targ->tic);
1590		nexttime = targ->tic;
1591	}
1592	if (targ->g->dev_type == DEV_TAP) {
1593	    D("writing to file desc %d", targ->g->main_fd);
1594
1595	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1596		if (write(targ->g->main_fd, frame, size) != -1)
1597			sent++;
1598		update_addresses(pkt, targ);
1599		if (i > 10000) {
1600			targ->ctr.pkts = sent;
1601			targ->ctr.bytes = sent*size;
1602			targ->ctr.events = sent;
1603			i = 0;
1604		}
1605	    }
1606#ifndef NO_PCAP
1607    } else if (targ->g->dev_type == DEV_PCAP) {
1608	    pcap_t *p = targ->g->p;
1609
1610	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1611		if (pcap_inject(p, frame, size) != -1)
1612			sent++;
1613		update_addresses(pkt, targ);
1614		if (i > 10000) {
1615			targ->ctr.pkts = sent;
1616			targ->ctr.bytes = sent*size;
1617			targ->ctr.events = sent;
1618			i = 0;
1619		}
1620	    }
1621#endif /* NO_PCAP */
1622    } else {
1623	int tosend = 0;
1624	u_int bufsz, frag_size = targ->g->frag_size;
1625
1626	nifp = targ->nmd->nifp;
1627	txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
1628	bufsz = txring->nr_buf_size;
1629	if (bufsz < frag_size)
1630		frag_size = bufsz;
1631	targ->frag_size = targ->g->pkt_size / targ->frags;
1632	if (targ->frag_size > frag_size) {
1633		targ->frags = targ->g->pkt_size / frag_size;
1634		targ->frag_size = frag_size;
1635		if (targ->g->pkt_size % frag_size != 0)
1636			targ->frags++;
1637	}
1638	D("frags %u frag_size %u", targ->frags, targ->frag_size);
1639	while (!targ->cancel && (n == 0 || sent < n)) {
1640		int rv;
1641
1642		if (rate_limit && tosend <= 0) {
1643			tosend = targ->g->burst;
1644			nexttime = timespec_add(nexttime, targ->g->tx_period);
1645			wait_time(nexttime);
1646		}
1647
1648		/*
1649		 * wait for available room in the send queue(s)
1650		 */
1651#ifdef BUSYWAIT
1652		(void)rv;
1653		if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) {
1654			D("ioctl error on queue %d: %s", targ->me,
1655					strerror(errno));
1656			goto quit;
1657		}
1658#else /* !BUSYWAIT */
1659		if ( (rv = poll(&pfd, 1, 2000)) <= 0) {
1660			if (targ->cancel)
1661				break;
1662			D("poll error on queue %d: %s", targ->me,
1663				rv ? strerror(errno) : "timeout");
1664			// goto quit;
1665		}
1666		if (pfd.revents & POLLERR) {
1667			D("poll error on %d ring %d-%d", pfd.fd,
1668				targ->nmd->first_tx_ring, targ->nmd->last_tx_ring);
1669			goto quit;
1670		}
1671#endif /* !BUSYWAIT */
1672		/*
1673		 * scan our queues and send on those with room
1674		 */
1675		if (options & OPT_COPY && sent > 100000 && !(targ->g->options & OPT_COPY) ) {
1676			D("drop copy");
1677			options &= ~OPT_COPY;
1678		}
1679		for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1680			int m;
1681			uint64_t limit = rate_limit ?  tosend : targ->g->burst;
1682
1683			if (n > 0 && n == sent)
1684				break;
1685
1686			if (n > 0 && n - sent < limit)
1687				limit = n - sent;
1688			txring = NETMAP_TXRING(nifp, i);
1689			if (nm_ring_empty(txring))
1690				continue;
1691
1692			if (targ->g->pkt_min_size > 0) {
1693				size = nrand48(targ->seed) %
1694					(targ->g->pkt_size - targ->g->pkt_min_size) +
1695					targ->g->pkt_min_size;
1696			}
1697			m = send_packets(txring, pkt, frame, size, targ,
1698					 limit, options);
1699			ND("limit %lu tail %d m %d",
1700				limit, txring->tail, m);
1701			sent += m;
1702			if (m > 0) //XXX-ste: can m be 0?
1703				event++;
1704			targ->ctr.pkts = sent;
1705			targ->ctr.bytes += m*size;
1706			targ->ctr.events = event;
1707			if (rate_limit) {
1708				tosend -= m;
1709				if (tosend <= 0)
1710					break;
1711			}
1712		}
1713	}
1714	/* flush any remaining packets */
1715	if (txring != NULL) {
1716		D("flush tail %d head %d on thread %p",
1717			txring->tail, txring->head,
1718			(void *)pthread_self());
1719		ioctl(pfd.fd, NIOCTXSYNC, NULL);
1720	}
1721
	/* final part: wait for all the TX queues to be empty. */
1723	for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1724		txring = NETMAP_TXRING(nifp, i);
1725		while (!targ->cancel && nm_tx_pending(txring)) {
1726			RD(5, "pending tx tail %d head %d on ring %d",
1727				txring->tail, txring->head, i);
1728			ioctl(pfd.fd, NIOCTXSYNC, NULL);
1729			usleep(1); /* wait 1 tick */
1730		}
1731	}
1732    } /* end DEV_NETMAP */
1733
1734	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1735	targ->completed = 1;
1736	targ->ctr.pkts = sent;
1737	targ->ctr.bytes = sent*size;
1738	targ->ctr.events = event;
1739quit:
1740	/* reset the ``used`` flag. */
1741	targ->used = 0;
1742
1743	return (NULL);
1744}
1745
1746
1747#ifndef NO_PCAP
1748static void
1749receive_pcap(u_char *user, const struct pcap_pkthdr * h,
1750	const u_char * bytes)
1751{
1752	struct my_ctrs *ctr = (struct my_ctrs *)user;
1753	(void)bytes;	/* UNUSED */
1754	ctr->bytes += h->len;
1755	ctr->pkts++;
1756}
1757#endif /* !NO_PCAP */
1758
1759
1760static int
1761receive_packets(struct netmap_ring *ring, u_int limit, int dump, uint64_t *bytes)
1762{
1763	u_int head, rx, n;
1764	uint64_t b = 0;
1765	u_int complete = 0;
1766
1767	if (bytes == NULL)
1768		bytes = &b;
1769
1770	head = ring->head;
1771	n = nm_ring_space(ring);
1772	if (n < limit)
1773		limit = n;
1774	for (rx = 0; rx < limit; rx++) {
1775		struct netmap_slot *slot = &ring->slot[head];
1776		char *p = NETMAP_BUF(ring, slot->buf_idx);
1777
1778		*bytes += slot->len;
1779		if (dump)
1780			dump_payload(p, slot->len, ring, head);
1781		if (!(slot->flags & NS_MOREFRAG))
1782			complete++;
1783
1784		head = nm_ring_next(ring, head);
1785	}
1786	ring->head = ring->cur = head;
1787
1788	return (complete);
1789}
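/*
 * receive_packets() returns the number of complete packets drained
 * (slots without NS_MOREFRAG), while *bytes accounts for every slot,
 * so a multi-fragment packet is counted once but contributes its
 * full length to the byte counter.
 */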
1790
1791static void *
1792receiver_body(void *data)
1793{
1794	struct targ *targ = (struct targ *) data;
1795	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1796	struct netmap_if *nifp;
1797	struct netmap_ring *rxring;
1798	int i;
1799	struct my_ctrs cur;
1800
1801	memset(&cur, 0, sizeof(cur));
1802
1803	if (setaffinity(targ->thread, targ->affinity))
1804		goto quit;
1805
1806	D("reading from %s fd %d main_fd %d",
1807		targ->g->ifname, targ->fd, targ->g->main_fd);
1808	/* unbounded wait for the first packet. */
1809	for (;!targ->cancel;) {
1810		i = poll(&pfd, 1, 1000);
1811		if (i > 0 && !(pfd.revents & POLLERR))
1812			break;
1813		if (i < 0) {
1814			D("poll() error: %s", strerror(errno));
1815			goto quit;
1816		}
1817		if (pfd.revents & POLLERR) {
1818			D("fd error");
1819			goto quit;
1820		}
1821		RD(1, "waiting for initial packets, poll returns %d %d",
1822			i, pfd.revents);
1823	}
1824	/* main loop, exit after 1s silence */
1825	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1826    if (targ->g->dev_type == DEV_TAP) {
1827	while (!targ->cancel) {
1828		char buf[MAX_BODYSIZE];
1829		/* XXX should we poll ? */
1830		i = read(targ->g->main_fd, buf, sizeof(buf));
1831		if (i > 0) {
1832			targ->ctr.pkts++;
1833			targ->ctr.bytes += i;
1834			targ->ctr.events++;
1835		}
1836	}
1837#ifndef NO_PCAP
1838    } else if (targ->g->dev_type == DEV_PCAP) {
1839	while (!targ->cancel) {
1840		/* XXX should we poll ? */
1841		pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap,
1842			(u_char *)&targ->ctr);
1843		targ->ctr.events++;
1844	}
1845#endif /* !NO_PCAP */
1846    } else {
1847	int dump = targ->g->options & OPT_DUMP;
1848
1849	nifp = targ->nmd->nifp;
1850	while (!targ->cancel) {
		/* Once we have started to receive packets, wait at most
		 * 1 second of silence before quitting. */
1853#ifdef BUSYWAIT
1854		if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) {
1855			D("ioctl error on queue %d: %s", targ->me,
1856					strerror(errno));
1857			goto quit;
1858		}
1859#else /* !BUSYWAIT */
1860		if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
1861			clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1862			targ->toc.tv_sec -= 1; /* Subtract timeout time. */
1863			goto out;
1864		}
1865
1866		if (pfd.revents & POLLERR) {
1867			D("poll err");
1868			goto quit;
1869		}
1870#endif /* !BUSYWAIT */
1871		uint64_t cur_space = 0;
1872		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
1873			int m;
1874
1875			rxring = NETMAP_RXRING(nifp, i);
1876			/* compute free space in the ring */
1877			m = rxring->head + rxring->num_slots - rxring->tail;
1878			if (m >= (int) rxring->num_slots)
1879				m -= rxring->num_slots;
1880			cur_space += m;
1881			if (nm_ring_empty(rxring))
1882				continue;
1883
1884			m = receive_packets(rxring, targ->g->burst, dump, &cur.bytes);
1885			cur.pkts += m;
1886			if (m > 0)
1887				cur.events++;
1888		}
1889		cur.min_space = targ->ctr.min_space;
1890		if (cur_space < cur.min_space)
1891			cur.min_space = cur_space;
1892		targ->ctr = cur;
1893	}
1894    }
1895
1896	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1897
1898#if !defined(BUSYWAIT)
1899out:
1900#endif
1901	targ->completed = 1;
1902	targ->ctr = cur;
1903
1904quit:
1905	/* reset the ``used`` flag. */
1906	targ->used = 0;
1907
1908	return (NULL);
1909}
1910
1911static void *
1912txseq_body(void *data)
1913{
1914	struct targ *targ = (struct targ *) data;
1915	struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
1916	struct netmap_ring *ring;
1917	int64_t sent = 0;
1918	uint64_t event = 0;
1919	int options = targ->g->options | OPT_COPY;
1920	struct timespec nexttime = {0, 0};
1921	int rate_limit = targ->g->tx_rate;
1922	struct pkt *pkt = &targ->pkt;
1923	int frags = targ->g->frags;
1924	uint32_t sequence = 0;
1925	int budget = 0;
1926	void *frame;
1927	int size;
1928
1929	if (targ->g->nthreads > 1) {
		D("can only use txseq with 1 thread");
1931		return NULL;
1932	}
1933
1934	if (targ->g->npackets > 0) {
1935		D("Ignoring -n argument");
1936	}
1937
1938	frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header;
1939	size = targ->g->pkt_size + targ->g->virt_header;
1940
1941	D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
1942	if (setaffinity(targ->thread, targ->affinity))
1943		goto quit;
1944
1945	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1946	if (rate_limit) {
1947		targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
1948		targ->tic.tv_nsec = 0;
1949		wait_time(targ->tic);
1950		nexttime = targ->tic;
1951	}
1952
1953	/* Only use the first queue. */
1954	ring = NETMAP_TXRING(targ->nmd->nifp, targ->nmd->first_tx_ring);
1955
1956	while (!targ->cancel) {
1957		int64_t limit;
1958		unsigned int space;
1959		unsigned int head;
1960		int fcnt;
1961		uint16_t sum = 0;
1962		int rv;
1963
1964		if (!rate_limit) {
1965			budget = targ->g->burst;
1966
1967		} else if (budget <= 0) {
1968			budget = targ->g->burst;
1969			nexttime = timespec_add(nexttime, targ->g->tx_period);
1970			wait_time(nexttime);
1971		}
1972
1973		/* wait for available room in the send queue */
1974#ifdef BUSYWAIT
1975		(void)rv;
1976		if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) {
1977			D("ioctl error on queue %d: %s", targ->me,
1978					strerror(errno));
1979			goto quit;
1980		}
1981#else /* !BUSYWAIT */
1982		if ( (rv = poll(&pfd, 1, 2000)) <= 0) {
1983			if (targ->cancel)
1984				break;
1985			D("poll error on queue %d: %s", targ->me,
1986				rv ? strerror(errno) : "timeout");
1987			// goto quit;
1988		}
1989		if (pfd.revents & POLLERR) {
1990			D("poll error on %d ring %d-%d", pfd.fd,
1991				targ->nmd->first_tx_ring, targ->nmd->last_tx_ring);
1992			goto quit;
1993		}
1994#endif /* !BUSYWAIT */
1995
		/* If there is no room, poll() again. */
1997		space = nm_ring_space(ring);
1998		if (!space) {
1999			continue;
2000		}
2001
2002		limit = budget;
2003
2004		if (space < limit) {
2005			limit = space;
2006		}
2007
		/* Cut off ``limit`` to make sure it is a multiple of ``frags``. */
2009		if (frags > 1) {
2010			limit = (limit / frags) * frags;
2011		}
2012
2013		limit = sent + limit; /* Convert to absolute. */
2014
2015		for (fcnt = frags, head = ring->head;
2016				sent < limit; sent++, sequence++) {
2017			struct netmap_slot *slot = &ring->slot[head];
2018			char *p = NETMAP_BUF(ring, slot->buf_idx);
2019			uint16_t *w = (uint16_t *)PKT(pkt, body, targ->g->af), t;
2020
2021			memcpy(&sum, targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, sizeof(sum));
2022
2023			slot->flags = 0;
2024			t = *w;
2025			PKT(pkt, body, targ->g->af)[0] = sequence >> 24;
2026			PKT(pkt, body, targ->g->af)[1] = (sequence >> 16) & 0xff;
2027			sum = ~cksum_add(~sum, cksum_add(~t, *w));
2028			t = *++w;
2029			PKT(pkt, body, targ->g->af)[2] = (sequence >> 8) & 0xff;
2030			PKT(pkt, body, targ->g->af)[3] = sequence & 0xff;
2031			sum = ~cksum_add(~sum, cksum_add(~t, *w));
2032			memcpy(targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, &sum, sizeof(sum));
2033			nm_pkt_copy(frame, p, size);
2034			if (fcnt == frags) {
2035				update_addresses(pkt, targ);
2036			}
2037
2038			if (options & OPT_DUMP) {
2039				dump_payload(p, size, ring, head);
2040			}
2041
2042			slot->len = size;
2043
2044			if (--fcnt > 0) {
2045				slot->flags |= NS_MOREFRAG;
2046			} else {
2047				fcnt = frags;
2048			}
2049
2050			if (sent == limit - 1) {
2051				/* Make sure we don't push an incomplete
2052				 * packet. */
2053				assert(!(slot->flags & NS_MOREFRAG));
2054				slot->flags |= NS_REPORT;
2055			}
2056
2057			head = nm_ring_next(ring, head);
2058			if (rate_limit) {
2059				budget--;
2060			}
2061		}
2062
2063		ring->cur = ring->head = head;
2064
2065		event ++;
2066		targ->ctr.pkts = sent;
2067		targ->ctr.bytes = sent * size;
2068		targ->ctr.events = event;
2069	}
2070
2071	/* flush any remaining packets */
2072	D("flush tail %d head %d on thread %p",
2073		ring->tail, ring->head,
2074		(void *)pthread_self());
2075	ioctl(pfd.fd, NIOCTXSYNC, NULL);
2076
	/* final part: wait for the TX queues to become empty. */
2078	while (!targ->cancel && nm_tx_pending(ring)) {
2079		RD(5, "pending tx tail %d head %d on ring %d",
2080				ring->tail, ring->head, targ->nmd->first_tx_ring);
2081		ioctl(pfd.fd, NIOCTXSYNC, NULL);
2082		usleep(1); /* wait 1 tick */
2083	}
2084
2085	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
2086	targ->completed = 1;
2087	targ->ctr.pkts = sent;
2088	targ->ctr.bytes = sent * size;
2089	targ->ctr.events = event;
2090quit:
2091	/* reset the ``used`` flag. */
2092	targ->used = 0;
2093
2094	return (NULL);
2095}
2096
2097
2098static char *
2099multi_slot_to_string(struct netmap_ring *ring, unsigned int head,
2100		     unsigned int nfrags, char *strbuf, size_t strbuflen)
2101{
2102	unsigned int f;
2103	char *ret = strbuf;
2104
2105	for (f = 0; f < nfrags; f++) {
2106		struct netmap_slot *slot = &ring->slot[head];
2107		int m = snprintf(strbuf, strbuflen, "|%u,%x|", slot->len,
2108				 slot->flags);
2109		if (m >= (int)strbuflen) {
2110			break;
2111		}
2112		strbuf += m;
2113		strbuflen -= m;
2114
2115		head = nm_ring_next(ring, head);
2116	}
2117
2118	return ret;
2119}
2120
2121static void *
2122rxseq_body(void *data)
2123{
2124	struct targ *targ = (struct targ *) data;
2125	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
2126	int dump = targ->g->options & OPT_DUMP;
2127	struct netmap_ring *ring;
2128	unsigned int frags_exp = 1;
2129	struct my_ctrs cur;
2130	unsigned int frags = 0;
2131	int first_packet = 1;
2132	int first_slot = 1;
2133	int i, j, af, nrings;
2134	uint32_t seq, *seq_exp = NULL;
2135
2136	memset(&cur, 0, sizeof(cur));
2137
2138	if (setaffinity(targ->thread, targ->affinity))
2139		goto quit;
2140
2141	nrings = targ->nmd->last_rx_ring - targ->nmd->first_rx_ring + 1;
2142	seq_exp = calloc(nrings, sizeof(uint32_t));
2143	if (seq_exp == NULL) {
2144		D("failed to allocate seq array");
2145		goto quit;
2146	}
2147
2148	D("reading from %s fd %d main_fd %d",
2149		targ->g->ifname, targ->fd, targ->g->main_fd);
2150	/* unbounded wait for the first packet. */
2151	for (;!targ->cancel;) {
2152		i = poll(&pfd, 1, 1000);
2153		if (i > 0 && !(pfd.revents & POLLERR))
2154			break;
2155		RD(1, "waiting for initial packets, poll returns %d %d",
2156			i, pfd.revents);
2157	}
2158
2159	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
2160
2161
2162	while (!targ->cancel) {
2163		unsigned int head;
2164		int limit;
2165
2166#ifdef BUSYWAIT
2167		if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) {
2168			D("ioctl error on queue %d: %s", targ->me,
2169					strerror(errno));
2170			goto quit;
2171		}
2172#else /* !BUSYWAIT */
2173		if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
2174			clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
2175			targ->toc.tv_sec -= 1; /* Subtract timeout time. */
2176			goto out;
2177		}
2178
2179		if (pfd.revents & POLLERR) {
2180			D("poll err");
2181			goto quit;
2182		}
2183#endif /* !BUSYWAIT */
2184
2185		for (j = targ->nmd->first_rx_ring; j <= targ->nmd->last_rx_ring; j++) {
2186			ring = NETMAP_RXRING(targ->nmd->nifp, j);
2187			if (nm_ring_empty(ring))
2188				continue;
2189
2190			limit = nm_ring_space(ring);
2191			if (limit > targ->g->burst)
2192				limit = targ->g->burst;
2193
2194#if 0
2195			/* Enable this if
2196			 *     1) we remove the early-return optimization from
2197			 *        the netmap poll implementation, or
2198			 *     2) pipes get NS_MOREFRAG support.
2199			 * With the current netmap implementation, an experiment like
2200			 *    pkt-gen -i vale:1{1 -f txseq -F 9
2201			 *    pkt-gen -i vale:1}1 -f rxseq
2202			 * would get stuck as soon as we find nm_ring_space(ring) < 9,
2203			 * since here limit is rounded to 0 and
2204			 * pipe rxsync is not called anymore by the poll() of this loop.
2205			 */
2206			if (frags_exp > 1) {
2207				int o = limit;
2208				/* Cut off to the closest smaller multiple. */
2209				limit = (limit / frags_exp) * frags_exp;
2210				RD(2, "LIMIT %d --> %d", o, limit);
2211			}
2212#endif
2213
2214			for (head = ring->head, i = 0; i < limit; i++) {
2215				struct netmap_slot *slot = &ring->slot[head];
2216				char *p = NETMAP_BUF(ring, slot->buf_idx);
2217				int len = slot->len;
2218				struct pkt *pkt;
2219
2220				if (dump) {
2221					dump_payload(p, slot->len, ring, head);
2222				}
2223
2224				frags++;
2225				if (!(slot->flags & NS_MOREFRAG)) {
2226					if (first_packet) {
2227						first_packet = 0;
2228					} else if (frags != frags_exp) {
2229						char prbuf[512];
						RD(1, "Received a packet with %u frags, "
								"expected %u, '%s'", frags, frags_exp,
2232								multi_slot_to_string(ring, head-frags+1,
2233							       	frags,
2234									prbuf, sizeof(prbuf)));
2235					}
2236					first_packet = 0;
2237					frags_exp = frags;
2238					frags = 0;
2239				}
2240
2241				p -= sizeof(pkt->vh) - targ->g->virt_header;
2242				len += sizeof(pkt->vh) - targ->g->virt_header;
2243				pkt = (struct pkt *)p;
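				/*
				 * Back up over the unused part of the
				 * virtio-net header so that ``pkt`` overlays
				 * the buffer with pkt->eh on the Ethernet
				 * header actually received.
				 */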
2244				if (ntohs(pkt->eh.ether_type) == ETHERTYPE_IP)
2245					af = AF_INET;
2246				else
2247					af = AF_INET6;
2248
2249				if ((char *)pkt + len < ((char *)PKT(pkt, body, af)) +
2250						sizeof(seq)) {
2251					RD(1, "%s: packet too small (len=%u)", __func__,
2252							slot->len);
2253				} else {
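					/*
					 * txseq_body stores a 32-bit sequence
					 * number, big-endian, in the first 4
					 * bytes of the UDP payload; rebuild
					 * it here.
					 */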
2254					seq = (PKT(pkt, body, af)[0] << 24) |
2255						(PKT(pkt, body, af)[1] << 16) |
2256						(PKT(pkt, body, af)[2] << 8) |
2257						PKT(pkt, body, af)[3];
2258					if (first_slot) {
2259						/* Grab the first one, whatever it
2260						   is. */
2261						seq_exp[j] = seq;
2262						first_slot = 0;
2263					} else if (seq != seq_exp[j]) {
2264						uint32_t delta = seq - seq_exp[j];
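						/*
						 * Unsigned wrap-around math:
						 * delta < 2^31 means seq is
						 * ahead of the expected value
						 * (a gap), otherwise it is an
						 * old/reordered packet.
						 */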
2265
2266						if (delta < (0xFFFFFFFF >> 1)) {
2267							RD(2, "Sequence GAP: exp %u found %u",
2268									seq_exp[j], seq);
2269						} else {
2270							RD(2, "Sequence OUT OF ORDER: "
2271									"exp %u found %u", seq_exp[j], seq);
2272						}
2273						seq_exp[j] = seq;
2274					}
2275					seq_exp[j]++;
2276				}
2277
2278				cur.bytes += slot->len;
2279				head = nm_ring_next(ring, head);
2280				cur.pkts++;
2281			}
2282
2283			ring->cur = ring->head = head;
2284
2285			cur.events++;
2286			targ->ctr = cur;
2287		}
2288	}
2289	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
2290
2291#ifndef BUSYWAIT
2292out:
2293#endif /* !BUSYWAIT */
2294	targ->completed = 1;
2295	targ->ctr = cur;
2296
2297quit:
2298	if (seq_exp != NULL)
2299		free(seq_exp);
2300	/* reset the ``used`` flag. */
2301	targ->used = 0;
2302
2303	return (NULL);
2304}
2305
2306
2307static void
2308tx_output(struct glob_arg *g, struct my_ctrs *cur, double delta, const char *msg)
2309{
2310	double bw, raw_bw, pps, abs;
2311	char b1[40], b2[80], b3[80];
2312	int size;
2313
2314	if (cur->pkts == 0) {
2315		printf("%s nothing.\n", msg);
2316		return;
2317	}
2318
2319	size = (int)(cur->bytes / cur->pkts);
2320
2321	printf("%s %llu packets %llu bytes %llu events %d bytes each in %.2f seconds.\n",
2322		msg,
2323		(unsigned long long)cur->pkts,
2324		(unsigned long long)cur->bytes,
2325		(unsigned long long)cur->events, size, delta);
2326	if (delta == 0)
2327		delta = 1e-6;
2328	if (size < 60)		/* correct for min packet size */
2329		size = 60;
2330	pps = cur->pkts / delta;
2331	bw = (8.0 * cur->bytes) / delta;
2332	raw_bw = (8.0 * cur->bytes + cur->pkts * g->framing) / delta;
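	/*
	 * g->framing is per-packet overhead in bits (24 bytes * 8 when -B
	 * is given), so raw_bw approximates the on-the-wire bit rate.
	 */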
2333	abs = cur->pkts / (double)(cur->events);
2334
2335	printf("Speed: %spps Bandwidth: %sbps (raw %sbps). Average batch: %.2f pkts\n",
2336		norm(b1, pps, normalize), norm(b2, bw, normalize), norm(b3, raw_bw, normalize), abs);
2337}
2338
2339static void
2340usage(int errcode)
2341{
/* This usage text is generated from the pkt-gen man page:
 *   $ man pkt-gen > x
 * and pasted here, adding the string terminators and newlines with simple
 * regular expressions. */
2346	const char *cmd = "pkt-gen";
2347	fprintf(stderr,
2348		"Usage:\n"
2349		"%s arguments\n"
2350"     -h      Show program usage and exit.\n"
2351"\n"
2352"     -i interface\n"
2353"             Name of the network interface that pkt-gen operates on.  It can be a system network interface\n"
2354"             (e.g., em0), the name of a vale(4) port (e.g., valeSSS:PPP), the name of a netmap pipe or\n"
2355"             monitor, or any valid netmap port name accepted by the nm_open library function, as docu-\n"
2356"             mented in netmap(4) (NIOCREGIF section).\n"
2357"\n"
2358"     -f function\n"
2359"             The function to be executed by pkt-gen.  Specify tx for transmission, rx for reception, ping\n"
2360"             for client-side ping-pong operation, and pong for server-side ping-pong operation.\n"
2361"\n"
2362"     -n count\n"
2363"             Number of iterations of the pkt-gen function, with 0 meaning infinite).  In case of tx or rx,\n"
2364"             count is the number of packets to receive or transmit.  In case of ping or pong, count is the\n"
2365"             number of ping-pong transactions.\n"
2366"\n"
2367"     -l pkt_size\n"
2368"             Packet size in bytes excluding CRC.  If passed a second time, use random sizes larger or\n"
2369"             equal than the second one and lower than the first one.\n"
2370"\n"
2371"     -b burst_size\n"
2372"             Transmit or receive up to burst_size packets at a time.\n"
2373"\n"
2374"     -4      Use IPv4 addresses.\n"
2375"\n"
2376"     -6      Use IPv6 addresses.\n"
2377"\n"
2378"     -d dst_ip[:port[-dst_ip:port]]\n"
2379"             Destination IPv4/IPv6 address and port, single or range.\n"
2380"\n"
2381"     -s src_ip[:port[-src_ip:port]]\n"
2382"             Source IPv4/IPv6 address and port, single or range.\n"
2383"\n"
2384"     -D dst_mac\n"
2385"             Destination MAC address in colon notation (e.g., aa:bb:cc:dd:ee:00).\n"
2386"\n"
2387"     -S src_mac\n"
2388"             Source MAC address in colon notation.\n"
2389"\n"
2390"     -a cpu_id\n"
2391"             Pin the first thread of pkt-gen to a particular CPU using pthread_setaffinity_np(3).  If more\n"
2392"             threads are used, they are pinned to the subsequent CPUs, one per thread.\n"
2393"\n"
2394"     -c cpus\n"
2395"             Maximum number of CPUs to use (0 means to use all the available ones).\n"
2396"\n"
2397"     -p threads\n"
2398"             Number of threads to use.  By default, only a single thread is used to handle all the netmap\n"
2399"             rings.  If threads is larger than one, each thread handles a single TX ring (in tx mode), a\n"
2400"             single RX ring (in rx mode), or a TX/RX ring couple.  The number of threads must be less or\n"
2401"             equal than the number of TX (or RX) ring available in the device specified by interface.\n"
2402"\n"
2403"     -T report_ms\n"
2404"             Number of milliseconds between reports.\n"
2405"\n"
2406"     -w wait_for_link_time\n"
2407"             Number of seconds to wait before starting the pkt-gen function, useuful to make sure that the\n"
2408"             network link is up.  A network device driver may take some time to enter netmap mode, or to\n"
2409"             create a new transmit/receive ring pair when netmap(4) requests one.\n"
2410"\n"
2411"     -R rate\n"
2412"             Packet transmission rate.  Not setting the packet transmission rate tells pkt-gen to transmit\n"
2413"             packets as quickly as possible.  On servers from 2010 on-wards netmap(4) is able to com-\n"
2414"             pletely use all of the bandwidth of a 10 or 40Gbps link, so this option should be used unless\n"
2415"             your intention is to saturate the link.\n"
2416"\n"
2417"     -X      Dump payload of each packet transmitted or received.\n"
2418"\n"
2419"     -H len  Add empty virtio-net-header with size 'len'.  Valid sizes are 0, 10 and 12.  This option is\n"
2420"             only used with Virtual Machine technologies that use virtio as a network interface.\n"
2421"\n"
2422"     -P file\n"
2423"             Load the packet to be transmitted from a pcap file rather than constructing it within\n"
2424"             pkt-gen.\n"
2425"\n"
2426"     -z      Use random IPv4/IPv6 src address/port.\n"
2427"\n"
2428"     -Z      Use random IPv4/IPv6 dst address/port.\n"
2429"\n"
2430"     -N      Do not normalize units (i.e., use bps, pps instead of Mbps, Kpps, etc.).\n"
2431"\n"
2432"     -F num_frags\n"
2433"             Send multi-slot packets, each one with num_frags fragments.  A multi-slot packet is repre-\n"
2434"             sented by two or more consecutive netmap slots with the NS_MOREFRAG flag set (except for the\n"
2435"             last slot).  This is useful to transmit or receive packets larger than the netmap buffer\n"
2436"             size.\n"
2437"\n"
2438"     -M frag_size\n"
2439"             In multi-slot mode, frag_size specifies the size of each fragment, if smaller than the packet\n"
2440"             length divided by num_frags.\n"
2441"\n"
2442"     -I      Use indirect buffers.  It is only valid for transmitting on VALE ports, and it is implemented\n"
2443"             by setting the NS_INDIRECT flag in the netmap slots.\n"
2444"\n"
2445"     -W      Exit immediately if all the RX rings are empty the first time they are examined.\n"
2446"\n"
2447"     -v      Increase the verbosity level.\n"
2448"\n"
2449"     -r      In tx mode, do not initialize packets, but send whatever the content of the uninitialized\n"
2450"             netmap buffers is (rubbish mode).\n"
2451"\n"
2452"     -A      Compute mean and standard deviation (over a sliding window) for the transmit or receive rate.\n"
2453"\n"
2454"     -B      Take Ethernet framing and CRC into account when computing the average bps.  This adds 4 bytes\n"
2455"             of CRC and 20 bytes of framing to each packet.\n"
2456"\n"
2457"     -C tx_slots[,rx_slots[,tx_rings[,rx_rings]]]\n"
2458"             Configuration in terms of number of rings and slots to be used when opening the netmap port.\n"
2459"             Such configuration has effect on software ports created on the fly, such as VALE ports and\n"
2460"             netmap pipes.  The configuration may consist of 1 to 4 numbers separated by commas: tx_slots,\n"
2461"             rx_slots, tx_rings, rx_rings.  Missing numbers or zeroes stand for default values.  As an\n"
2462"             additional convenience, if exactly one number is specified, then this is assigned to both\n"
2463"             tx_slots and rx_slots.  If there is no fourth number, then the third one is assigned to both\n"
2464"             tx_rings and rx_rings.\n"
2465"\n"
2466"     -o options		data generation options (parsed using atoi)\n"
2467"				OPT_PREFETCH	1\n"
2468"				OPT_ACCESS	2\n"
2469"				OPT_COPY	4\n"
2470"				OPT_MEMCPY	8\n"
2471"				OPT_TS		16 (add a timestamp)\n"
2472"				OPT_INDIRECT	32 (use indirect buffers)\n"
2473"				OPT_DUMP	64 (dump rx/tx traffic)\n"
2474"				OPT_RUBBISH	256\n"
2475"					(send wathever the buffers contain)\n"
2476"				OPT_RANDOM_SRC  512\n"
2477"				OPT_RANDOM_DST  1024\n"
2478"				OPT_PPS_STATS   2048\n"
2479		     "",
2480		cmd);
2481	exit(errcode);
2482}
2483
2484static void
2485start_threads(struct glob_arg *g) {
2486	int i;
2487
2488	targs = calloc(g->nthreads, sizeof(*targs));
2489	struct targ *t;
2490	/*
2491	 * Now create the desired number of threads, each one
2492	 * using a single descriptor.
2493	 */
2494	for (i = 0; i < g->nthreads; i++) {
		/* time_t may be only 32 bits wide: widen before shifting */
		uint64_t seed = (uint64_t)time(0) | ((uint64_t)time(0) << 32);
2496		t = &targs[i];
2497
2498		bzero(t, sizeof(*t));
2499		t->fd = -1; /* default, with pcap */
2500		t->g = g;
2501		memcpy(t->seed, &seed, sizeof(t->seed));
2502
2503		if (g->dev_type == DEV_NETMAP) {
2504			struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */
2505			uint64_t nmd_flags = 0;
2506			nmd.self = &nmd;
2507
2508			if (i > 0) {
2509				/* the first thread uses the fd opened by the main
2510				 * thread, the other threads re-open /dev/netmap
2511				 */
2512				if (g->nthreads > 1) {
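					/*
					 * NR_REG_ONE_NIC with nr_ringid = i
					 * binds this descriptor to hardware
					 * ring pair i only, so each thread
					 * owns one TX/RX ring.
					 */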
2513					nmd.req.nr_flags =
2514						g->nmd->req.nr_flags & ~NR_REG_MASK;
2515					nmd.req.nr_flags |= NR_REG_ONE_NIC;
2516					nmd.req.nr_ringid = i;
2517				}
2518				/* Only touch one of the rings (rx is already ok) */
2519				if (g->td_type == TD_TYPE_RECEIVER)
2520					nmd_flags |= NETMAP_NO_TX_POLL;
2521
2522				/* register interface. Override ifname and ringid etc. */
2523				t->nmd = nm_open(t->g->ifname, NULL, nmd_flags |
2524						NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd);
2525				if (t->nmd == NULL) {
2526					D("Unable to open %s: %s",
2527							t->g->ifname, strerror(errno));
2528					continue;
2529				}
2530			} else {
2531				t->nmd = g->nmd;
2532			}
2533			t->fd = t->nmd->fd;
2534			t->frags = g->frags;
2535		} else {
2536			targs[i].fd = g->main_fd;
2537		}
2538		t->used = 1;
2539		t->me = i;
2540		if (g->affinity >= 0) {
2541			t->affinity = (g->affinity + i) % g->cpus;
2542		} else {
2543			t->affinity = -1;
2544		}
2545		/* default, init packets */
2546		initialize_packet(t);
2547	}
2548	/* Wait for PHY reset. */
2549	D("Wait %d secs for phy reset", g->wait_link);
2550	sleep(g->wait_link);
2551	D("Ready...");
2552
2553	for (i = 0; i < g->nthreads; i++) {
2554		t = &targs[i];
		int rc = pthread_create(&t->thread, NULL, g->td_body, t);

		if (rc != 0) {
			/* pthread_create() returns an error number, not -1 */
			D("Unable to create thread %d: %s", i, strerror(rc));
			t->used = 0;
		}
2559	}
2560}
2561
2562static void
2563main_thread(struct glob_arg *g)
2564{
2565	int i;
2566
2567	struct my_ctrs prev, cur;
2568	double delta_t;
2569	struct timeval tic, toc;
2570
2571	prev.pkts = prev.bytes = prev.events = 0;
2572	gettimeofday(&prev.t, NULL);
2573	for (;;) {
2574		char b1[40], b2[40], b3[40], b4[100];
2575		uint64_t pps, usec;
2576		struct my_ctrs x;
2577		double abs;
2578		int done = 0;
2579
2580		usec = wait_for_next_report(&prev.t, &cur.t,
2581				g->report_interval);
2582
2583		cur.pkts = cur.bytes = cur.events = 0;
2584		cur.min_space = 0;
2585		if (usec < 10000) /* too short to be meaningful */
2586			continue;
2587		/* accumulate counts for all threads */
2588		for (i = 0; i < g->nthreads; i++) {
2589			cur.pkts += targs[i].ctr.pkts;
2590			cur.bytes += targs[i].ctr.bytes;
2591			cur.events += targs[i].ctr.events;
2592			cur.min_space += targs[i].ctr.min_space;
2593			targs[i].ctr.min_space = 99999;
2594			if (targs[i].used == 0)
2595				done++;
2596		}
2597		x.pkts = cur.pkts - prev.pkts;
2598		x.bytes = cur.bytes - prev.bytes;
2599		x.events = cur.events - prev.events;
2600		pps = (x.pkts*1000000 + usec/2) / usec;
2601		abs = (x.events > 0) ? (x.pkts / (double) x.events) : 0;
2602
2603		if (!(g->options & OPT_PPS_STATS)) {
2604			strcpy(b4, "");
2605		} else {
2606			/* Compute some pps stats using a sliding window. */
2607			double ppsavg = 0.0, ppsdev = 0.0;
2608			int nsamples = 0;
2609
2610			g->win[g->win_idx] = pps;
2611			g->win_idx = (g->win_idx + 1) % STATS_WIN;
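			/*
			 * The window is a circular buffer of the last
			 * STATS_WIN per-interval pps values; zero entries are
			 * treated as empty slots and excluded from the
			 * average and standard deviation below.
			 */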
2612
2613			for (i = 0; i < STATS_WIN; i++) {
2614				ppsavg += g->win[i];
2615				if (g->win[i]) {
2616					nsamples ++;
2617				}
2618			}
2619			ppsavg /= nsamples;
2620
2621			for (i = 0; i < STATS_WIN; i++) {
2622				if (g->win[i] == 0) {
2623					continue;
2624				}
2625				ppsdev += (g->win[i] - ppsavg) * (g->win[i] - ppsavg);
2626			}
2627			ppsdev /= nsamples;
2628			ppsdev = sqrt(ppsdev);
2629
2630			snprintf(b4, sizeof(b4), "[avg/std %s/%s pps]",
2631				 norm(b1, ppsavg, normalize), norm(b2, ppsdev, normalize));
2632		}
2633
2634		D("%spps %s(%spkts %sbps in %llu usec) %.2f avg_batch %d min_space",
2635			norm(b1, pps, normalize), b4,
2636			norm(b2, (double)x.pkts, normalize),
2637			norm(b3, (double)x.bytes*8+(double)x.pkts*g->framing, normalize),
2638			(unsigned long long)usec,
2639			abs, (int)cur.min_space);
2640		prev = cur;
2641
2642		if (done == g->nthreads)
2643			break;
2644	}
2645
2646	timerclear(&tic);
2647	timerclear(&toc);
2648	cur.pkts = cur.bytes = cur.events = 0;
2649	/* final round */
2650	for (i = 0; i < g->nthreads; i++) {
2651		struct timespec t_tic, t_toc;
2652		/*
2653		 * Join active threads, unregister interfaces and close
2654		 * file descriptors.
2655		 */
2656		if (targs[i].used)
2657			pthread_join(targs[i].thread, NULL); /* blocking */
2658		if (g->dev_type == DEV_NETMAP) {
2659			nm_close(targs[i].nmd);
2660			targs[i].nmd = NULL;
2661		} else {
2662			close(targs[i].fd);
2663		}
2664
2665		if (targs[i].completed == 0)
2666			D("ouch, thread %d exited with error", i);
2667
2668		/*
2669		 * Collect threads output and extract information about
2670		 * how long it took to send all the packets.
2671		 */
2672		cur.pkts += targs[i].ctr.pkts;
2673		cur.bytes += targs[i].ctr.bytes;
2674		cur.events += targs[i].ctr.events;
2675		/* collect the largest start (tic) and end (toc) times,
2676		 * XXX maybe we should do the earliest tic, or do a weighted
2677		 * average ?
2678		 */
2679		t_tic = timeval2spec(&tic);
2680		t_toc = timeval2spec(&toc);
2681		if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic))
2682			tic = timespec2val(&targs[i].tic);
2683		if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc))
2684			toc = timespec2val(&targs[i].toc);
2685	}
2686
2687	/* print output. */
2688	timersub(&toc, &tic, &toc);
2689	delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
2690	if (g->td_type == TD_TYPE_SENDER)
2691		tx_output(g, &cur, delta_t, "Sent");
2692	else if (g->td_type == TD_TYPE_RECEIVER)
2693		tx_output(g, &cur, delta_t, "Received");
2694}
2695
2696struct td_desc {
2697	int ty;
2698	char *key;
2699	void *f;
2700	int default_burst;
2701};
2702
2703static struct td_desc func[] = {
2704	{ TD_TYPE_RECEIVER,	"rx",		receiver_body,	512},	/* default */
2705	{ TD_TYPE_SENDER,	"tx",		sender_body,	512 },
2706	{ TD_TYPE_OTHER,	"ping",		ping_body,	1 },
2707	{ TD_TYPE_OTHER,	"pong",		pong_body,	1 },
2708	{ TD_TYPE_SENDER,	"txseq",	txseq_body,	512 },
2709	{ TD_TYPE_RECEIVER,	"rxseq",	rxseq_body,	512 },
2710	{ 0,			NULL,		NULL, 		0 }
2711};
2712
2713static int
2714tap_alloc(char *dev)
2715{
2716	struct ifreq ifr;
2717	int fd, err;
2718	char *clonedev = TAP_CLONEDEV;
2719
2720	(void)err;
2721	(void)dev;
2722	/* Arguments taken by the function:
2723	 *
2724	 * char *dev: the name of an interface (or '\0'). MUST have enough
	 *   space to hold the interface name if '\0' is passed.
	 * (This variant takes no flags argument: on Linux it always uses
	 *  IFF_TAP | IFF_NO_PI.)
	 */
2728
2729#ifdef __FreeBSD__
2730	if (dev[3]) { /* tapSomething */
2731		static char buf[128];
2732		snprintf(buf, sizeof(buf), "/dev/%s", dev);
2733		clonedev = buf;
2734	}
2735#endif
2736	/* open the device */
2737	if( (fd = open(clonedev, O_RDWR)) < 0 ) {
2738		return fd;
2739	}
2740	D("%s open successful", clonedev);
2741
2742	/* preparation of the struct ifr, of type "struct ifreq" */
2743	memset(&ifr, 0, sizeof(ifr));
2744
2745#ifdef linux
2746	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
2747
2748	if (*dev) {
2749		/* if a device name was specified, put it in the structure; otherwise,
2750		* the kernel will try to allocate the "next" device of the
2751		* specified type */
2752		size_t len = strlen(dev);
		if (len >= IFNAMSIZ) {
2754			D("%s too long", dev);
2755			return -1;
2756		}
2757		memcpy(ifr.ifr_name, dev, len);
2758	}
2759
2760	/* try to create the device */
2761	if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) {
		D("failed to do a TUNSETIFF: %s", strerror(errno));
2763		close(fd);
2764		return err;
2765	}
2766
2767	/* if the operation was successful, write back the name of the
2768	* interface to the variable "dev", so the caller can know
2769	* it. Note that the caller MUST reserve space in *dev (see calling
2770	* code below) */
2771	strcpy(dev, ifr.ifr_name);
2772	D("new name is %s", dev);
2773#endif /* linux */
2774
2775	/* this is the special file descriptor that the caller will use to talk
2776	 * with the virtual interface */
2777	return fd;
2778}
2779
2780int
2781main(int arc, char **argv)
2782{
2783	int i;
2784	struct sigaction sa;
2785	sigset_t ss;
2786
2787	struct glob_arg g;
2788
2789	int ch;
2790	int devqueues = 1;	/* how many device queues */
2791	int wait_link_arg = 0;
2792
2793	int pkt_size_done = 0;
2794
2795	struct td_desc *fn = func;
2796
2797	bzero(&g, sizeof(g));
2798
2799	g.main_fd = -1;
2800	g.td_body = fn->f;
2801	g.td_type = fn->ty;
2802	g.report_interval = 1000;	/* report interval */
2803	g.affinity = -1;
2804	/* ip addresses can also be a range x.x.x.x-x.x.x.y */
2805	g.af = AF_INET;		/* default */
2806	g.src_ip.name = "10.0.0.1";
2807	g.dst_ip.name = "10.1.0.1";
2808	g.dst_mac.name = "ff:ff:ff:ff:ff:ff";
2809	g.src_mac.name = NULL;
2810	g.pkt_size = 60;
2811	g.pkt_min_size = 0;
2812	g.nthreads = 1;
2813	g.cpus = 1;		/* default */
2814	g.forever = 1;
2815	g.tx_rate = 0;
2816	g.frags = 1;
2817	g.frag_size = (u_int)-1;	/* use the netmap buffer size by default */
2818	g.nmr_config = "";
2819	g.virt_header = 0;
2820	g.wait_link = 2;	/* wait 2 seconds for physical ports */
2821
2822	while ((ch = getopt(arc, argv, "46a:f:F:Nn:i:Il:d:s:D:S:b:c:o:p:"
2823	    "T:w:WvR:XC:H:rP:zZAhBM:")) != -1) {
2824
2825		switch(ch) {
2826		default:
2827			D("bad option %c %s", ch, optarg);
2828			usage(-1);
2829			break;
2830
2831		case 'h':
2832			usage(0);
2833			break;
2834
2835		case '4':
2836			g.af = AF_INET;
2837			break;
2838
2839		case '6':
2840			g.af = AF_INET6;
2841			break;
2842
2843		case 'N':
2844			normalize = 0;
2845			break;
2846
2847		case 'n':
2848			g.npackets = strtoull(optarg, NULL, 10);
2849			break;
2850
2851		case 'F':
2852			i = atoi(optarg);
2853			if (i < 1 || i > 63) {
2854				D("invalid frags %d [1..63], ignore", i);
2855				break;
2856			}
2857			g.frags = i;
2858			break;
2859
2860		case 'M':
2861			g.frag_size = atoi(optarg);
2862			break;
2863
2864		case 'f':
2865			for (fn = func; fn->key; fn++) {
2866				if (!strcmp(fn->key, optarg))
2867					break;
2868			}
2869			if (fn->key) {
2870				g.td_body = fn->f;
2871				g.td_type = fn->ty;
2872			} else {
2873				D("unrecognised function %s", optarg);
2874			}
2875			break;
2876
2877		case 'o':	/* data generation options */
2878			g.options |= atoi(optarg);
2879			break;
2880
2881		case 'a':       /* force affinity */
2882			g.affinity = atoi(optarg);
2883			break;
2884
2885		case 'i':	/* interface */
			/* a prefix of tap:, netmap: or pcap: forces the mode;
			 * otherwise we guess.
2888			 */
2889			D("interface is %s", optarg);
2890			if (strlen(optarg) > MAX_IFNAMELEN - 8) {
2891				D("ifname too long %s", optarg);
2892				break;
2893			}
2894			strcpy(g.ifname, optarg);
2895			if (!strcmp(optarg, "null")) {
2896				g.dev_type = DEV_NETMAP;
2897				g.dummy_send = 1;
2898			} else if (!strncmp(optarg, "tap:", 4)) {
2899				g.dev_type = DEV_TAP;
2900				strcpy(g.ifname, optarg + 4);
2901			} else if (!strncmp(optarg, "pcap:", 5)) {
2902				g.dev_type = DEV_PCAP;
2903				strcpy(g.ifname, optarg + 5);
2904			} else if (!strncmp(optarg, "netmap:", 7) ||
2905				   !strncmp(optarg, "vale", 4)) {
2906				g.dev_type = DEV_NETMAP;
2907			} else if (!strncmp(optarg, "tap", 3)) {
2908				g.dev_type = DEV_TAP;
2909			} else { /* prepend netmap: */
2910				g.dev_type = DEV_NETMAP;
2911				sprintf(g.ifname, "netmap:%s", optarg);
2912			}
2913			break;
2914
2915		case 'I':
2916			g.options |= OPT_INDIRECT;	/* use indirect buffers */
2917			break;
2918
2919		case 'l':	/* pkt_size */
2920			if (pkt_size_done) {
2921				g.pkt_min_size = atoi(optarg);
2922			} else {
2923				g.pkt_size = atoi(optarg);
2924				pkt_size_done = 1;
2925			}
2926			break;
2927
2928		case 'd':
2929			g.dst_ip.name = optarg;
2930			break;
2931
2932		case 's':
2933			g.src_ip.name = optarg;
2934			break;
2935
2936		case 'T':	/* report interval */
2937			g.report_interval = atoi(optarg);
2938			break;
2939
2940		case 'w':
2941			g.wait_link = atoi(optarg);
2942			wait_link_arg = 1;
2943			break;
2944
2945		case 'W':
2946			g.forever = 0; /* exit RX with no traffic */
2947			break;
2948
2949		case 'b':	/* burst */
2950			g.burst = atoi(optarg);
2951			break;
2952		case 'c':
2953			g.cpus = atoi(optarg);
2954			break;
2955		case 'p':
2956			g.nthreads = atoi(optarg);
2957			break;
2958
2959		case 'D': /* destination mac */
2960			g.dst_mac.name = optarg;
2961			break;
2962
2963		case 'S': /* source mac */
2964			g.src_mac.name = optarg;
2965			break;
2966		case 'v':
2967			verbose++;
2968			break;
2969		case 'R':
2970			g.tx_rate = atoi(optarg);
2971			break;
2972		case 'X':
2973			g.options |= OPT_DUMP;
2974			break;
2975		case 'C':
2976			g.nmr_config = strdup(optarg);
2977			break;
2978		case 'H':
2979			g.virt_header = atoi(optarg);
2980			break;
2981		case 'P':
2982			g.packet_file = strdup(optarg);
2983			break;
2984		case 'r':
2985			g.options |= OPT_RUBBISH;
2986			break;
2987		case 'z':
2988			g.options |= OPT_RANDOM_SRC;
2989			break;
2990		case 'Z':
2991			g.options |= OPT_RANDOM_DST;
2992			break;
2993		case 'A':
2994			g.options |= OPT_PPS_STATS;
2995			break;
2996		case 'B':
			/* raw packets have 4 bytes of CRC + 20 bytes of framing */
2998			// XXX maybe add an option to pass the IFG
2999			g.framing = 24 * 8;
3000			break;
3001		}
3002	}
3003
	if (strlen(g.ifname) == 0) {
3005		D("missing ifname");
3006		usage(-1);
3007	}
3008
3009	if (g.burst == 0) {
3010		g.burst = fn->default_burst;
3011		D("using default burst size: %d", g.burst);
3012	}
3013
3014	g.system_cpus = i = system_ncpus();
3015	if (g.cpus < 0 || g.cpus > i) {
3016		D("%d cpus is too high, have only %d cpus", g.cpus, i);
3017		usage(-1);
3018	}
3019	D("running on %d cpus (have %d)", g.cpus, i);
3020	if (g.cpus == 0)
3021		g.cpus = i;
3022
3023	if (!wait_link_arg && !strncmp(g.ifname, "vale", 4)) {
3024		g.wait_link = 0;
3025	}
3026
3027	if (g.pkt_size < 16 || g.pkt_size > MAX_PKTSIZE) {
3028		D("bad pktsize %d [16..%d]\n", g.pkt_size, MAX_PKTSIZE);
3029		usage(-1);
3030	}
3031
3032	if (g.pkt_min_size > 0 && (g.pkt_min_size < 16 || g.pkt_min_size > g.pkt_size)) {
3033		D("bad pktminsize %d [16..%d]\n", g.pkt_min_size, g.pkt_size);
3034		usage(-1);
3035	}
3036
3037	if (g.src_mac.name == NULL) {
3038		static char mybuf[20] = "00:00:00:00:00:00";
3039		/* retrieve source mac address. */
3040		if (source_hwaddr(g.ifname, mybuf) == -1) {
3041			D("Unable to retrieve source mac");
3042			// continue, fail later
3043		}
3044		g.src_mac.name = mybuf;
3045	}
3046	/* extract address ranges */
3047	if (extract_mac_range(&g.src_mac) || extract_mac_range(&g.dst_mac))
3048		usage(-1);
3049	g.options |= extract_ip_range(&g.src_ip, g.af);
3050	g.options |= extract_ip_range(&g.dst_ip, g.af);
3051
3052	if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1
3053			&& g.virt_header != VIRT_HDR_2) {
3054		D("bad virtio-net-header length");
3055		usage(-1);
3056	}
3057
3058    if (g.dev_type == DEV_TAP) {
3059	D("want to use tap %s", g.ifname);
3060	g.main_fd = tap_alloc(g.ifname);
3061	if (g.main_fd < 0) {
3062		D("cannot open tap %s", g.ifname);
3063		usage(-1);
3064	}
3065#ifndef NO_PCAP
3066    } else if (g.dev_type == DEV_PCAP) {
3067	char pcap_errbuf[PCAP_ERRBUF_SIZE];
3068
3069	pcap_errbuf[0] = '\0'; // init the buffer
3070	g.p = pcap_open_live(g.ifname, 256 /* XXX */, 1, 100, pcap_errbuf);
3071	if (g.p == NULL) {
3072		D("cannot open pcap on %s", g.ifname);
3073		usage(-1);
3074	}
3075	g.main_fd = pcap_fileno(g.p);
3076	D("using pcap on %s fileno %d", g.ifname, g.main_fd);
3077#endif /* !NO_PCAP */
3078    } else if (g.dummy_send) { /* but DEV_NETMAP */
3079	D("using a dummy send routine");
3080    } else {
3081	struct nm_desc base_nmd;
3082	char errmsg[MAXERRMSG];
3083	u_int flags;
3084
3085	bzero(&base_nmd, sizeof(base_nmd));
3086
3087	parse_nmr_config(g.nmr_config, &base_nmd.req);
3088
3089	base_nmd.req.nr_flags |= NR_ACCEPT_VNET_HDR;
3090
3091	if (nm_parse(g.ifname, &base_nmd, errmsg) < 0) {
3092		D("Invalid name '%s': %s", g.ifname, errmsg);
3093		goto out;
3094	}
3095
3096	/*
3097	 * Open the netmap device using nm_open().
3098	 *
	 * Registering the interface detaches it from the host
	 * protocol stack and may cause a reset of the card,
3100	 * which in turn may take some time for the PHY to
3101	 * reconfigure. We do the open here to have time to reset.
3102	 */
3103	flags = NM_OPEN_IFNAME | NM_OPEN_ARG1 | NM_OPEN_ARG2 |
3104		NM_OPEN_ARG3 | NM_OPEN_RING_CFG;
3105	if (g.nthreads > 1) {
3106		base_nmd.req.nr_flags &= ~NR_REG_MASK;
3107		base_nmd.req.nr_flags |= NR_REG_ONE_NIC;
3108		base_nmd.req.nr_ringid = 0;
3109	}
3110	g.nmd = nm_open(g.ifname, NULL, flags, &base_nmd);
3111	if (g.nmd == NULL) {
3112		D("Unable to open %s: %s", g.ifname, strerror(errno));
3113		goto out;
3114	}
3115	g.main_fd = g.nmd->fd;
3116	D("mapped %luKB at %p", (unsigned long)(g.nmd->req.nr_memsize>>10),
3117				g.nmd->mem);
3118
3119	if (g.virt_header) {
3120		/* Set the virtio-net header length, since the user asked
		 * for it explicitly. */
3122		set_vnet_hdr_len(&g);
3123	} else {
3124		/* Check whether the netmap port we opened requires us to send
3125		 * and receive frames with virtio-net header. */
3126		get_vnet_hdr_len(&g);
3127	}
3128
3129	/* get num of queues in tx or rx */
3130	if (g.td_type == TD_TYPE_SENDER)
3131		devqueues = g.nmd->req.nr_tx_rings;
3132	else
3133		devqueues = g.nmd->req.nr_rx_rings;
3134
3135	/* validate provided nthreads. */
3136	if (g.nthreads < 1 || g.nthreads > devqueues) {
3137		D("bad nthreads %d, have %d queues", g.nthreads, devqueues);
3138		// continue, fail later
3139	}
3140
3141	if (g.td_type == TD_TYPE_SENDER) {
3142		int mtu = get_if_mtu(&g);
3143
3144		if (mtu > 0 && g.pkt_size > mtu) {
3145			D("pkt_size (%d) must be <= mtu (%d)",
3146				g.pkt_size, mtu);
3147			return -1;
3148		}
3149	}
3150
3151	if (verbose) {
3152		struct netmap_if *nifp = g.nmd->nifp;
3153		struct nmreq *req = &g.nmd->req;
3154
3155		D("nifp at offset %d, %d tx %d rx region %d",
3156		    req->nr_offset, req->nr_tx_rings, req->nr_rx_rings,
3157		    req->nr_arg2);
3158		for (i = 0; i <= req->nr_tx_rings; i++) {
3159			struct netmap_ring *ring = NETMAP_TXRING(nifp, i);
3160			D("   TX%d at 0x%p slots %d", i,
3161			    (void *)((char *)ring - (char *)nifp), ring->num_slots);
3162		}
3163		for (i = 0; i <= req->nr_rx_rings; i++) {
3164			struct netmap_ring *ring = NETMAP_RXRING(nifp, i);
3165			D("   RX%d at 0x%p slots %d", i,
3166			    (void *)((char *)ring - (char *)nifp), ring->num_slots);
3167		}
3168	}
3169
3170	/* Print some debug information. */
3171	fprintf(stdout,
3172		"%s %s: %d queues, %d threads and %d cpus.\n",
3173		(g.td_type == TD_TYPE_SENDER) ? "Sending on" :
3174			((g.td_type == TD_TYPE_RECEIVER) ? "Receiving from" :
3175			"Working on"),
3176		g.ifname,
3177		devqueues,
3178		g.nthreads,
3179		g.cpus);
3180	if (g.td_type == TD_TYPE_SENDER) {
3181		fprintf(stdout, "%s -> %s (%s -> %s)\n",
3182			g.src_ip.name, g.dst_ip.name,
3183			g.src_mac.name, g.dst_mac.name);
3184	}
3185
3186out:
3187	/* Exit if something went wrong. */
3188	if (g.main_fd < 0) {
3189		D("aborting");
3190		usage(-1);
3191	}
3192    }
3193
3194
3195	if (g.options) {
3196		D("--- SPECIAL OPTIONS:%s%s%s%s%s%s\n",
3197			g.options & OPT_PREFETCH ? " prefetch" : "",
3198			g.options & OPT_ACCESS ? " access" : "",
3199			g.options & OPT_MEMCPY ? " memcpy" : "",
3200			g.options & OPT_INDIRECT ? " indirect" : "",
3201			g.options & OPT_COPY ? " copy" : "",
3202			g.options & OPT_RUBBISH ? " rubbish " : "");
3203	}
3204
3205	g.tx_period.tv_sec = g.tx_period.tv_nsec = 0;
3206	if (g.tx_rate > 0) {
		/* try to have at least something every second,
		 * reducing the burst size to a few milliseconds worth
		 * of data (but no less than one packet).
		 */
3211		uint64_t x;
3212		int lim = (g.tx_rate)/300;
3213		if (g.burst > lim)
3214			g.burst = lim;
3215		if (g.burst == 0)
3216			g.burst = 1;
3217		x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate;
3218		g.tx_period.tv_nsec = x;
3219		g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000;
3220		g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000;
3221	}
3222	if (g.td_type == TD_TYPE_SENDER)
	    D("Sending %d packets every %ld.%09ld s",
3224			g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec);
3225	/* Install ^C handler. */
3226	global_nthreads = g.nthreads;
3227	sigemptyset(&ss);
3228	sigaddset(&ss, SIGINT);
3229	/* block SIGINT now, so that all created threads will inherit the mask */
3230	if (pthread_sigmask(SIG_BLOCK, &ss, NULL) < 0) {
3231		D("failed to block SIGINT: %s", strerror(errno));
3232	}
3233	start_threads(&g);
3234	/* Install the handler and re-enable SIGINT for the main thread */
3235	memset(&sa, 0, sizeof(sa));
3236	sa.sa_handler = sigint_h;
3237	if (sigaction(SIGINT, &sa, NULL) < 0) {
3238		D("failed to install ^C handler: %s", strerror(errno));
3239	}
3240
3241	if (pthread_sigmask(SIG_UNBLOCK, &ss, NULL) < 0) {
3242		D("failed to re-enable SIGINT: %s", strerror(errno));
3243	}
3244	main_thread(&g);
3245	free(targs);
3246	return 0;
3247}
3248
3249/* end of file */
3250