pkt-gen.c revision 302408
1/*
2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *   1. Redistributions of source code must retain the above copyright
9 *      notice, this list of conditions and the following disclaimer.
10 *   2. Redistributions in binary form must reproduce the above copyright
11 *      notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27/*
28 * $FreeBSD: stable/11/tools/tools/netmap/pkt-gen.c 281746 2015-04-19 17:07:51Z adrian $
29 * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $
30 *
31 * Example program to show how to build a multithreaded packet
32 * source/sink using the netmap device.
33 *
34 * In this example we create a programmable number of threads
35 * to take care of all the queues of the interface used to
36 * send or receive traffic.
37 *
38 */
39
40// #define TRASH_VHOST_HDR
41
42#define _GNU_SOURCE	/* for CPU_SET() */
43#include <stdio.h>
44#define NETMAP_WITH_LIBS
45#include <net/netmap_user.h>
46
47
48#include <ctype.h>	// isprint()
49#include <unistd.h>	// sysconf()
50#include <sys/poll.h>
51#include <arpa/inet.h>	/* ntohs */
52#include <sys/sysctl.h>	/* sysctl */
53#include <ifaddrs.h>	/* getifaddrs */
54#include <net/ethernet.h>
55#include <netinet/in.h>
56#include <netinet/ip.h>
57#include <netinet/udp.h>
58
59#include <pthread.h>
60
61#ifndef NO_PCAP
62#include <pcap/pcap.h>
63#endif
64
65#ifdef linux
66
67#define cpuset_t        cpu_set_t
68
69#define ifr_flagshigh  ifr_flags        /* only the low 16 bits here */
70#define IFF_PPROMISC   IFF_PROMISC      /* IFF_PPROMISC does not exist */
71#include <linux/ethtool.h>
72#include <linux/sockios.h>
73
74#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
75#include <netinet/ether.h>      /* ether_aton */
76#include <linux/if_packet.h>    /* sockaddr_ll */
77#endif  /* linux */
78
79#ifdef __FreeBSD__
80#include <sys/endian.h> /* le64toh */
81#include <machine/param.h>
82
83#include <pthread_np.h> /* pthread w/ affinity */
84#include <sys/cpuset.h> /* cpu_set */
85#include <net/if_dl.h>  /* LLADDR */
86#endif  /* __FreeBSD__ */
87
88#ifdef __APPLE__
89
/*
 * Minimal CPU-set shim: the set is a single 64-bit mask,
 * one bit per CPU index.
 */
#define cpuset_t        uint64_t        // XXX

/* Remove every CPU from the set. */
static inline void CPU_ZERO(cpuset_t *p)
{
        *p = 0;
}

/*
 * Add CPU i to the set. Only the low 6 bits of i are used, so
 * indexes wrap modulo 64. The constant is widened to cpuset_t
 * before shifting: shifting a plain int by 32 or more is undefined
 * and could never reach the upper half of the mask.
 */
static inline void CPU_SET(uint32_t i, cpuset_t *p)
{
        *p |= (cpuset_t)1 << (i & 0x3f);
}
100
101#define pthread_setaffinity_np(a, b, c) ((void)a, 0)
102
103#define ifr_flagshigh  ifr_flags        // XXX
104#define IFF_PPROMISC   IFF_PROMISC
105#include <net/if_dl.h>  /* LLADDR */
106#define clock_gettime(a,b)      \
107        do {struct timespec t0 = {0,0}; *(b) = t0; } while (0)
108#endif  /* __APPLE__ */
109
110const char *default_payload="netmap pkt-gen DIRECT payload\n"
111	"http://info.iet.unipi.it/~luigi/netmap/ ";
112
113const char *indirect_payload="netmap pkt-gen indirect payload\n"
114	"http://info.iet.unipi.it/~luigi/netmap/ ";
115
116int verbose = 0;
117
118#define SKIP_PAYLOAD 1 /* do not check payload. XXX unused */
119
120
#define VIRT_HDR_1	10	/* length of a base vnet-hdr */
#define VIRT_HDR_2	12	/* length of the extended vnet-hdr */
#define VIRT_HDR_MAX	VIRT_HDR_2

/* Raw storage sized for the longest vnet-hdr defined above. */
struct virt_header {
	uint8_t fields[VIRT_HDR_MAX];
};
127
128#define MAX_BODYSIZE	16384
129
130struct pkt {
131	struct virt_header vh;
132	struct ether_header eh;
133	struct ip ip;
134	struct udphdr udp;
135	uint8_t body[MAX_BODYSIZE];	// XXX hardwired
136} __attribute__((__packed__));
137
/*
 * A range of IPv4 addresses and ports. name is the textual
 * specification; extract_ip_range() fills in the numeric fields,
 * storing addresses in host byte order.
 */
struct ip_range {
	char *name;
	uint32_t start;		/* same as struct in_addr */
	uint32_t end;
	uint16_t port0;
	uint16_t port1;
};
143
/*
 * A range of Ethernet addresses. name is the textual specification;
 * extract_mac_range() fills in start and end.
 */
struct mac_range {
	char *name;
	struct ether_addr start;
	struct ether_addr end;
};
148
149/* ifname can be netmap:foo-xxxx */
150#define MAX_IFNAMELEN	64	/* our buffer for ifname */
151//#define MAX_PKTSIZE	1536
152#define MAX_PKTSIZE	MAX_BODYSIZE	/* XXX: + IP_HDR + ETH_HDR */
153
/*
 * Compact timestamp, small enough to fit into a 60 byte packet
 * (enough to obtain the RTT): seconds and nanoseconds, 32 bits each.
 */
struct tstamp {
	uint32_t sec;	/* seconds */
	uint32_t nsec;	/* nanoseconds */
};
159
160/*
161 * global arguments for all threads
162 */
163
164struct glob_arg {
165	struct ip_range src_ip;
166	struct ip_range dst_ip;
167	struct mac_range dst_mac;
168	struct mac_range src_mac;
169	int pkt_size;
170	int burst;
171	int forever;
172	int npackets;	/* total packets to send */
173	int frags;	/* fragments per packet */
174	int nthreads;
175	int cpus;
176	int options;	/* testing */
177#define OPT_PREFETCH	1
178#define OPT_ACCESS	2
179#define OPT_COPY	4
180#define OPT_MEMCPY	8
181#define OPT_TS		16	/* add a timestamp */
182#define OPT_INDIRECT	32	/* use indirect buffers, tx only */
183#define OPT_DUMP	64	/* dump rx/tx traffic */
184#define OPT_MONITOR_TX  128
185#define OPT_MONITOR_RX  256
186#define OPT_RANDOM_SRC  512
187#define OPT_RANDOM_DST  1024
188	int dev_type;
189#ifndef NO_PCAP
190	pcap_t *p;
191#endif
192
193	int tx_rate;
194	struct timespec tx_period;
195
196	int affinity;
197	int main_fd;
198	struct nm_desc *nmd;
199	int report_interval;		/* milliseconds between prints */
200	void *(*td_body)(void *);
201	void *mmap_addr;
202	char ifname[MAX_IFNAMELEN];
203	char *nmr_config;
204	int dummy_send;
205	int virt_header;	/* send also the virt_header */
206	int extra_bufs;		/* goes in nr_arg3 */
207	char *packet_file;	/* -P option */
208};
/* Kind of device in use; the sender switches on glob_arg.dev_type. */
enum dev_type {
	DEV_NONE,
	DEV_NETMAP,
	DEV_PCAP,
	DEV_TAP
};
210
211
212/*
213 * Arguments for a new thread. The same structure is used by
214 * the source and the sink
215 */
216struct targ {
217	struct glob_arg *g;
218	int used;
219	int completed;
220	int cancel;
221	int fd;
222	struct nm_desc *nmd;
223	volatile uint64_t count;
224	struct timespec tic, toc;
225	int me;
226	pthread_t thread;
227	int affinity;
228
229	struct pkt pkt;
230	void *frame;
231};
232
233
234/*
235 * extract the extremes from a range of ipv4 addresses.
236 * addr_lo[-addr_hi][:port_lo[-port_hi]]
237 */
238static void
239extract_ip_range(struct ip_range *r)
240{
241	char *ap, *pp;
242	struct in_addr a;
243
244	if (verbose)
245		D("extract IP range from %s", r->name);
246	r->port0 = r->port1 = 0;
247	r->start = r->end = 0;
248
249	/* the first - splits start/end of range */
250	ap = index(r->name, '-');	/* do we have ports ? */
251	if (ap) {
252		*ap++ = '\0';
253	}
254	/* grab the initial values (mandatory) */
255	pp = index(r->name, ':');
256	if (pp) {
257		*pp++ = '\0';
258		r->port0 = r->port1 = strtol(pp, NULL, 0);
259	};
260	inet_aton(r->name, &a);
261	r->start = r->end = ntohl(a.s_addr);
262	if (ap) {
263		pp = index(ap, ':');
264		if (pp) {
265			*pp++ = '\0';
266			if (*pp)
267				r->port1 = strtol(pp, NULL, 0);
268		}
269		if (*ap) {
270			inet_aton(ap, &a);
271			r->end = ntohl(a.s_addr);
272		}
273	}
274	if (r->port0 > r->port1) {
275		uint16_t tmp = r->port0;
276		r->port0 = r->port1;
277		r->port1 = tmp;
278	}
279	if (r->start > r->end) {
280		uint32_t tmp = r->start;
281		r->start = r->end;
282		r->end = tmp;
283	}
284	{
285		struct in_addr a;
286		char buf1[16]; // one ip address
287
288		a.s_addr = htonl(r->end);
289		strncpy(buf1, inet_ntoa(a), sizeof(buf1));
290		a.s_addr = htonl(r->start);
291		if (1)
292		    D("range is %s:%d to %s:%d",
293			inet_ntoa(a), r->port0, buf1, r->port1);
294	}
295}
296
297static void
298extract_mac_range(struct mac_range *r)
299{
300	if (verbose)
301	    D("extract MAC range from %s", r->name);
302	bcopy(ether_aton(r->name), &r->start, 6);
303	bcopy(ether_aton(r->name), &r->end, 6);
304#if 0
305	bcopy(targ->src_mac, eh->ether_shost, 6);
306	p = index(targ->g->src_mac, '-');
307	if (p)
308		targ->src_mac_range = atoi(p+1);
309
310	bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6);
311	bcopy(targ->dst_mac, eh->ether_dhost, 6);
312	p = index(targ->g->dst_mac, '-');
313	if (p)
314		targ->dst_mac_range = atoi(p+1);
315#endif
316	if (verbose)
317		D("%s starts at %s", r->name, ether_ntoa(&r->start));
318}
319
320static struct targ *targs;
321static int global_nthreads;
322
323/* control-C handler */
324static void
325sigint_h(int sig)
326{
327	int i;
328
329	(void)sig;	/* UNUSED */
330	D("received control-C on thread %p", pthread_self());
331	for (i = 0; i < global_nthreads; i++) {
332		targs[i].cancel = 1;
333	}
334	signal(SIGINT, SIG_DFL);
335}
336
/*
 * Return the number of CPUs: hw.ncpu through sysctl() on FreeBSD,
 * the number of online processors through sysconf() on Linux.
 * Returns 1 on other systems, or when the query fails or yields a
 * non-positive value.
 */
static int
system_ncpus(void)
{
	int ncpus = 1;
#if defined (__FreeBSD__)
	int mib[2] = { CTL_HW, HW_NCPU };
	int val = 0;
	/* size of the output buffer, not of the mib array */
	size_t len = sizeof(val);

	if (sysctl(mib, 2, &val, &len, NULL, 0) == 0 && val > 0)
		ncpus = val;
#elif defined(linux) || defined(__linux__)
	long val = sysconf(_SC_NPROCESSORS_ONLN);

	if (val > 0)
		ncpus = (int)val;
#endif /* others keep the default */
	return (ncpus);
}
353
354#ifdef __linux__
355#define sockaddr_dl    sockaddr_ll
356#define sdl_family     sll_family
357#define AF_LINK        AF_PACKET
/* expands to an expression: no trailing ';', argument parenthesized */
#define LLADDR(s)      ((s)->sll_addr)
359#include <linux/if_tun.h>
360#define TAP_CLONEDEV	"/dev/net/tun"
361#endif /* __linux__ */
362
363#ifdef __FreeBSD__
364#include <net/if_tun.h>
365#define TAP_CLONEDEV	"/dev/tap"
366#endif /* __FreeBSD */
367
368#ifdef __APPLE__
369// #warning TAP not supported on apple ?
370#include <net/if_utun.h>
371#define TAP_CLONEDEV	"/dev/tap"
372#endif /* __APPLE__ */
373
374
375/*
376 * parse the vale configuration in conf and put it in nmr.
377 * Return the flag set if necessary.
378 * The configuration may consist of 0 to 4 numbers separated
379 * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings.
380 * Missing numbers or zeroes stand for default values.
381 * As an additional convenience, if exactly one number
382 * is specified, then this is assigned to both #tx-slots and #rx-slots.
383 * If there is no 4th number, then the 3rd is assigned to both #tx-rings
384 * and #rx-rings.
385 */
386int
387parse_nmr_config(const char* conf, struct nmreq *nmr)
388{
389	char *w, *tok;
390	int i, v;
391
392	nmr->nr_tx_rings = nmr->nr_rx_rings = 0;
393	nmr->nr_tx_slots = nmr->nr_rx_slots = 0;
394	if (conf == NULL || ! *conf)
395		return 0;
396	w = strdup(conf);
397	for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) {
398		v = atoi(tok);
399		switch (i) {
400		case 0:
401			nmr->nr_tx_slots = nmr->nr_rx_slots = v;
402			break;
403		case 1:
404			nmr->nr_rx_slots = v;
405			break;
406		case 2:
407			nmr->nr_tx_rings = nmr->nr_rx_rings = v;
408			break;
409		case 3:
410			nmr->nr_rx_rings = v;
411			break;
412		default:
413			D("ignored config: %s", tok);
414			break;
415		}
416	}
417	D("txr %d txd %d rxr %d rxd %d",
418			nmr->nr_tx_rings, nmr->nr_tx_slots,
419			nmr->nr_rx_rings, nmr->nr_rx_slots);
420	free(w);
421	return (nmr->nr_tx_rings || nmr->nr_tx_slots ||
422                        nmr->nr_rx_rings || nmr->nr_rx_slots) ?
423		NM_OPEN_RING_CFG : 0;
424}
425
426
427/*
428 * locate the src mac address for our interface, put it
429 * into the user-supplied buffer. return 0 if ok, -1 on error.
430 */
431static int
432source_hwaddr(const char *ifname, char *buf)
433{
434	struct ifaddrs *ifaphead, *ifap;
435	int l = sizeof(ifap->ifa_name);
436
437	if (getifaddrs(&ifaphead) != 0) {
438		D("getifaddrs %s failed", ifname);
439		return (-1);
440	}
441
442	for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) {
443		struct sockaddr_dl *sdl =
444			(struct sockaddr_dl *)ifap->ifa_addr;
445		uint8_t *mac;
446
447		if (!sdl || sdl->sdl_family != AF_LINK)
448			continue;
449		if (strncmp(ifap->ifa_name, ifname, l) != 0)
450			continue;
451		mac = (uint8_t *)LLADDR(sdl);
452		sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
453			mac[0], mac[1], mac[2],
454			mac[3], mac[4], mac[5]);
455		if (verbose)
456			D("source hwaddr %s", buf);
457		break;
458	}
459	freeifaddrs(ifaphead);
460	return ifap ? 0 : 1;
461}
462
463
464/* set the thread affinity. */
465static int
466setaffinity(pthread_t me, int i)
467{
468	cpuset_t cpumask;
469
470	if (i == -1)
471		return 0;
472
473	/* Set thread affinity affinity.*/
474	CPU_ZERO(&cpumask);
475	CPU_SET(i, &cpumask);
476
477	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
478		D("Unable to set affinity: %s", strerror(errno));
479		return 1;
480	}
481	return 0;
482}
483
/*
 * Accumulate the 16-bit one's complement sum of len bytes at data
 * on top of the partial sum passed in, and return the new partial
 * sum. The bytes are read as big-endian (network order) 16-bit
 * words; a trailing odd byte counts as the high byte of a final
 * word. The result is not complemented: pass it to wrapsum() to get
 * the value to store in a header, or feed it to another checksum()
 * call to chain several regions.
 *
 * Words are assembled byte by byte, so data may have any alignment.
 */
static uint16_t
checksum(const void *data, uint16_t len, uint32_t sum)
{
	const uint8_t *addr = data;
	uint32_t i;

	/* Checksum all the pairs of bytes first... */
	for (i = 0; i < (len & ~1U); i += 2) {
		sum += ((uint32_t)addr[i] << 8) | addr[i + 1];
		if (sum > 0xFFFF)
			sum -= 0xFFFF;	/* end-around carry */
	}
	/*
	 * If there's a single byte left over, checksum it, too.
	 * Network byte order is big-endian, so the remaining byte is
	 * the high byte.
	 */
	if (i < len) {
		sum += (uint32_t)addr[i] << 8;
		if (sum > 0xFFFF)
			sum -= 0xFFFF;
	}
	return sum;
}
509
/*
 * Finish a checksum: take the one's complement of the low 16 bits
 * of sum and return it in network byte order.
 */
static u_int16_t
wrapsum(u_int32_t sum)
{
	u_int16_t folded = (u_int16_t)(~sum & 0xFFFF);

	return (htons(folded));
}
516
517/* Check the payload of the packet for errors (use it for debug).
518 * Look for consecutive ascii representations of the size of the packet.
519 */
520static void
521dump_payload(char *p, int len, struct netmap_ring *ring, int cur)
522{
523	char buf[128];
524	int i, j, i0;
525
526	/* get the length in ASCII of the length of the packet. */
527
528	printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n",
529		ring, cur, ring->slot[cur].buf_idx,
530		ring->slot[cur].flags, len);
531	/* hexdump routine */
532	for (i = 0; i < len; ) {
533		memset(buf, sizeof(buf), ' ');
534		sprintf(buf, "%5d: ", i);
535		i0 = i;
536		for (j=0; j < 16 && i < len; i++, j++)
537			sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i]));
538		i = i0;
539		for (j=0; j < 16 && i < len; i++, j++)
540			sprintf(buf+7+j + 48, "%c",
541				isprint(p[i]) ? p[i] : '.');
542		printf("%s\n", buf);
543	}
544}
545
546/*
547 * Fill a packet with some payload.
548 * We create a UDP packet so the payload starts at
549 *	14+20+8 = 42 bytes.
550 */
551#ifdef __linux__
552#define uh_sport source
553#define uh_dport dest
554#define uh_ulen len
555#define uh_sum check
556#endif /* linux */
557
558/*
559 * increment the addressed in the packet,
560 * starting from the least significant field.
561 *	DST_IP DST_PORT SRC_IP SRC_PORT
562 */
563static void
564update_addresses(struct pkt *pkt, struct glob_arg *g)
565{
566	uint32_t a;
567	uint16_t p;
568	struct ip *ip = &pkt->ip;
569	struct udphdr *udp = &pkt->udp;
570
571    do {
572    	/* XXX for now it doesn't handle non-random src, random dst */
573	if (g->options & OPT_RANDOM_SRC) {
574		udp->uh_sport = random();
575		ip->ip_src.s_addr = random();
576	} else {
577		p = ntohs(udp->uh_sport);
578		if (p < g->src_ip.port1) { /* just inc, no wrap */
579			udp->uh_sport = htons(p + 1);
580			break;
581		}
582		udp->uh_sport = htons(g->src_ip.port0);
583
584		a = ntohl(ip->ip_src.s_addr);
585		if (a < g->src_ip.end) { /* just inc, no wrap */
586			ip->ip_src.s_addr = htonl(a + 1);
587			break;
588		}
589		ip->ip_src.s_addr = htonl(g->src_ip.start);
590
591		udp->uh_sport = htons(g->src_ip.port0);
592	}
593
594	if (g->options & OPT_RANDOM_DST) {
595		udp->uh_dport = random();
596		ip->ip_dst.s_addr = random();
597	} else {
598		p = ntohs(udp->uh_dport);
599		if (p < g->dst_ip.port1) { /* just inc, no wrap */
600			udp->uh_dport = htons(p + 1);
601			break;
602		}
603		udp->uh_dport = htons(g->dst_ip.port0);
604
605		a = ntohl(ip->ip_dst.s_addr);
606		if (a < g->dst_ip.end) { /* just inc, no wrap */
607			ip->ip_dst.s_addr = htonl(a + 1);
608			break;
609		}
610	}
611	ip->ip_dst.s_addr = htonl(g->dst_ip.start);
612    } while (0);
613    // update checksum
614}
615
616/*
617 * initialize one packet and prepare for the next one.
618 * The copy could be done better instead of repeating it each time.
619 */
620static void
621initialize_packet(struct targ *targ)
622{
623	struct pkt *pkt = &targ->pkt;
624	struct ether_header *eh;
625	struct ip *ip;
626	struct udphdr *udp;
627	uint16_t paylen = targ->g->pkt_size - sizeof(*eh) - sizeof(struct ip);
628	const char *payload = targ->g->options & OPT_INDIRECT ?
629		indirect_payload : default_payload;
630	int i, l0 = strlen(payload);
631
632	char errbuf[PCAP_ERRBUF_SIZE];
633	pcap_t *file;
634	struct pcap_pkthdr *header;
635	const unsigned char *packet;
636
637	/* Read a packet from a PCAP file if asked. */
638	if (targ->g->packet_file != NULL) {
639		if ((file = pcap_open_offline(targ->g->packet_file,
640			    errbuf)) == NULL)
641			D("failed to open pcap file %s",
642			    targ->g->packet_file);
643		if (pcap_next_ex(file, &header, &packet) < 0)
644			D("failed to read packet from %s",
645			    targ->g->packet_file);
646		if ((targ->frame = malloc(header->caplen)) == NULL)
647			D("out of memory");
648		bcopy(packet, (unsigned char *)targ->frame, header->caplen);
649		targ->g->pkt_size = header->caplen;
650		pcap_close(file);
651		return;
652	}
653
654	/* create a nice NUL-terminated string */
655	for (i = 0; i < paylen; i += l0) {
656		if (l0 > paylen - i)
657			l0 = paylen - i; // last round
658		bcopy(payload, pkt->body + i, l0);
659	}
660	pkt->body[i-1] = '\0';
661	ip = &pkt->ip;
662
663	/* prepare the headers */
664        ip->ip_v = IPVERSION;
665        ip->ip_hl = 5;
666        ip->ip_id = 0;
667        ip->ip_tos = IPTOS_LOWDELAY;
668	ip->ip_len = ntohs(targ->g->pkt_size - sizeof(*eh));
669        ip->ip_id = 0;
670        ip->ip_off = htons(IP_DF); /* Don't fragment */
671        ip->ip_ttl = IPDEFTTL;
672	ip->ip_p = IPPROTO_UDP;
673	ip->ip_dst.s_addr = htonl(targ->g->dst_ip.start);
674	ip->ip_src.s_addr = htonl(targ->g->src_ip.start);
675	ip->ip_sum = wrapsum(checksum(ip, sizeof(*ip), 0));
676
677
678	udp = &pkt->udp;
679        udp->uh_sport = htons(targ->g->src_ip.port0);
680        udp->uh_dport = htons(targ->g->dst_ip.port0);
681	udp->uh_ulen = htons(paylen);
682	/* Magic: taken from sbin/dhclient/packet.c */
683	udp->uh_sum = wrapsum(checksum(udp, sizeof(*udp),
684                    checksum(pkt->body,
685                        paylen - sizeof(*udp),
686                        checksum(&ip->ip_src, 2 * sizeof(ip->ip_src),
687                            IPPROTO_UDP + (u_int32_t)ntohs(udp->uh_ulen)
688                        )
689                    )
690                ));
691
692	eh = &pkt->eh;
693	bcopy(&targ->g->src_mac.start, eh->ether_shost, 6);
694	bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6);
695	eh->ether_type = htons(ETHERTYPE_IP);
696
697	bzero(&pkt->vh, sizeof(pkt->vh));
698#ifdef TRASH_VHOST_HDR
699	/* set bogus content */
700	pkt->vh.fields[0] = 0xff;
701	pkt->vh.fields[1] = 0xff;
702	pkt->vh.fields[2] = 0xff;
703	pkt->vh.fields[3] = 0xff;
704	pkt->vh.fields[4] = 0xff;
705	pkt->vh.fields[5] = 0xff;
706#endif /* TRASH_VHOST_HDR */
707	// dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0);
708}
709
710static void
711set_vnet_hdr_len(struct targ *t)
712{
713	int err, l = t->g->virt_header;
714	struct nmreq req;
715
716	if (l == 0)
717		return;
718
719	memset(&req, 0, sizeof(req));
720	bcopy(t->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name));
721	req.nr_version = NETMAP_API;
722	req.nr_cmd = NETMAP_BDG_VNET_HDR;
723	req.nr_arg1 = l;
724	err = ioctl(t->fd, NIOCREGIF, &req);
725	if (err) {
726		D("Unable to set vnet header length %d", l);
727	}
728}
729
730
731/*
732 * create and enqueue a batch of packets on a ring.
733 * On the last one set NS_REPORT to tell the driver to generate
734 * an interrupt when done.
735 */
736static int
737send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
738		int size, struct glob_arg *g, u_int count, int options,
739		u_int nfrags)
740{
741	u_int n, sent, cur = ring->cur;
742	u_int fcnt;
743
744	n = nm_ring_space(ring);
745	if (n < count)
746		count = n;
747	if (count < nfrags) {
748		D("truncating packet, no room for frags %d %d",
749				count, nfrags);
750	}
751#if 0
752	if (options & (OPT_COPY | OPT_PREFETCH) ) {
753		for (sent = 0; sent < count; sent++) {
754			struct netmap_slot *slot = &ring->slot[cur];
755			char *p = NETMAP_BUF(ring, slot->buf_idx);
756
757			__builtin_prefetch(p);
758			cur = nm_ring_next(ring, cur);
759		}
760		cur = ring->cur;
761	}
762#endif
763	for (fcnt = nfrags, sent = 0; sent < count; sent++) {
764		struct netmap_slot *slot = &ring->slot[cur];
765		char *p = NETMAP_BUF(ring, slot->buf_idx);
766
767		slot->flags = 0;
768		if (options & OPT_INDIRECT) {
769			slot->flags |= NS_INDIRECT;
770			slot->ptr = (uint64_t)frame;
771		} else if (options & OPT_COPY) {
772			nm_pkt_copy(frame, p, size);
773			if (fcnt == nfrags)
774				update_addresses(pkt, g);
775		} else if (options & OPT_MEMCPY) {
776			memcpy(p, frame, size);
777			if (fcnt == nfrags)
778				update_addresses(pkt, g);
779		} else if (options & OPT_PREFETCH) {
780			__builtin_prefetch(p);
781		}
782		if (options & OPT_DUMP)
783			dump_payload(p, size, ring, cur);
784		slot->len = size;
785		if (--fcnt > 0)
786			slot->flags |= NS_MOREFRAG;
787		else
788			fcnt = nfrags;
789		if (sent == count - 1) {
790			slot->flags &= ~NS_MOREFRAG;
791			slot->flags |= NS_REPORT;
792		}
793		cur = nm_ring_next(ring, cur);
794	}
795	ring->head = ring->cur = cur;
796
797	return (sent);
798}
799
800/*
801 * Send a packet, and wait for a response.
802 * The payload (after UDP header, ofs 42) has a 4-byte sequence
803 * followed by a struct timeval (or bintime?)
804 */
805#define	PAY_OFS	42	/* where in the pkt... */
806
807static void *
808pinger_body(void *data)
809{
810	struct targ *targ = (struct targ *) data;
811	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
812	struct netmap_if *nifp = targ->nmd->nifp;
813	int i, rx = 0, n = targ->g->npackets;
814	void *frame;
815	int size;
816	uint32_t sent = 0;
817	struct timespec ts, now, last_print;
818	uint32_t count = 0, min = 1000000000, av = 0;
819
820	frame = &targ->pkt;
821	frame += sizeof(targ->pkt.vh) - targ->g->virt_header;
822	size = targ->g->pkt_size + targ->g->virt_header;
823
824	if (targ->g->nthreads > 1) {
825		D("can only ping with 1 thread");
826		return NULL;
827	}
828
829	clock_gettime(CLOCK_REALTIME_PRECISE, &last_print);
830	now = last_print;
831	while (n == 0 || (int)sent < n) {
832		struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
833		struct netmap_slot *slot;
834		char *p;
835	    for (i = 0; i < 1; i++) { /* XXX why the loop for 1 pkt ? */
836		slot = &ring->slot[ring->cur];
837		slot->len = size;
838		p = NETMAP_BUF(ring, slot->buf_idx);
839
840		if (nm_ring_empty(ring)) {
841			D("-- ouch, cannot send");
842		} else {
843			struct tstamp *tp;
844			nm_pkt_copy(frame, p, size);
845			clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
846			bcopy(&sent, p+42, sizeof(sent));
847			tp = (struct tstamp *)(p+46);
848			tp->sec = (uint32_t)ts.tv_sec;
849			tp->nsec = (uint32_t)ts.tv_nsec;
850			sent++;
851			ring->head = ring->cur = nm_ring_next(ring, ring->cur);
852		}
853	    }
854		/* should use a parameter to decide how often to send */
855		if (poll(&pfd, 1, 3000) <= 0) {
856			D("poll error/timeout on queue %d: %s", targ->me,
857				strerror(errno));
858			continue;
859		}
860		/* see what we got back */
861		for (i = targ->nmd->first_tx_ring;
862			i <= targ->nmd->last_tx_ring; i++) {
863			ring = NETMAP_RXRING(nifp, i);
864			while (!nm_ring_empty(ring)) {
865				uint32_t seq;
866				struct tstamp *tp;
867				slot = &ring->slot[ring->cur];
868				p = NETMAP_BUF(ring, slot->buf_idx);
869
870				clock_gettime(CLOCK_REALTIME_PRECISE, &now);
871				bcopy(p+42, &seq, sizeof(seq));
872				tp = (struct tstamp *)(p+46);
873				ts.tv_sec = (time_t)tp->sec;
874				ts.tv_nsec = (long)tp->nsec;
875				ts.tv_sec = now.tv_sec - ts.tv_sec;
876				ts.tv_nsec = now.tv_nsec - ts.tv_nsec;
877				if (ts.tv_nsec < 0) {
878					ts.tv_nsec += 1000000000;
879					ts.tv_sec--;
880				}
881				if (1) D("seq %d/%d delta %d.%09d", seq, sent,
882					(int)ts.tv_sec, (int)ts.tv_nsec);
883				if (ts.tv_nsec < (int)min)
884					min = ts.tv_nsec;
885				count ++;
886				av += ts.tv_nsec;
887				ring->head = ring->cur = nm_ring_next(ring, ring->cur);
888				rx++;
889			}
890		}
891		//D("tx %d rx %d", sent, rx);
892		//usleep(100000);
893		ts.tv_sec = now.tv_sec - last_print.tv_sec;
894		ts.tv_nsec = now.tv_nsec - last_print.tv_nsec;
895		if (ts.tv_nsec < 0) {
896			ts.tv_nsec += 1000000000;
897			ts.tv_sec--;
898		}
899		if (ts.tv_sec >= 1) {
900			D("count %d min %d av %d",
901				count, min, av/count);
902			count = 0;
903			av = 0;
904			min = 100000000;
905			last_print = now;
906		}
907	}
908	return NULL;
909}
910
911
912/*
913 * reply to ping requests
914 */
915static void *
916ponger_body(void *data)
917{
918	struct targ *targ = (struct targ *) data;
919	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
920	struct netmap_if *nifp = targ->nmd->nifp;
921	struct netmap_ring *txring, *rxring;
922	int i, rx = 0, sent = 0, n = targ->g->npackets;
923
924	if (targ->g->nthreads > 1) {
925		D("can only reply ping with 1 thread");
926		return NULL;
927	}
928	D("understood ponger %d but don't know how to do it", n);
929	while (n == 0 || sent < n) {
930		uint32_t txcur, txavail;
931//#define BUSYWAIT
932#ifdef BUSYWAIT
933		ioctl(pfd.fd, NIOCRXSYNC, NULL);
934#else
935		if (poll(&pfd, 1, 1000) <= 0) {
936			D("poll error/timeout on queue %d: %s", targ->me,
937				strerror(errno));
938			continue;
939		}
940#endif
941		txring = NETMAP_TXRING(nifp, 0);
942		txcur = txring->cur;
943		txavail = nm_ring_space(txring);
944		/* see what we got back */
945		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
946			rxring = NETMAP_RXRING(nifp, i);
947			while (!nm_ring_empty(rxring)) {
948				uint16_t *spkt, *dpkt;
949				uint32_t cur = rxring->cur;
950				struct netmap_slot *slot = &rxring->slot[cur];
951				char *src, *dst;
952				src = NETMAP_BUF(rxring, slot->buf_idx);
953				//D("got pkt %p of size %d", src, slot->len);
954				rxring->head = rxring->cur = nm_ring_next(rxring, cur);
955				rx++;
956				if (txavail == 0)
957					continue;
958				dst = NETMAP_BUF(txring,
959				    txring->slot[txcur].buf_idx);
960				/* copy... */
961				dpkt = (uint16_t *)dst;
962				spkt = (uint16_t *)src;
963				nm_pkt_copy(src, dst, slot->len);
964				dpkt[0] = spkt[3];
965				dpkt[1] = spkt[4];
966				dpkt[2] = spkt[5];
967				dpkt[3] = spkt[0];
968				dpkt[4] = spkt[1];
969				dpkt[5] = spkt[2];
970				txring->slot[txcur].len = slot->len;
971				/* XXX swap src dst mac */
972				txcur = nm_ring_next(txring, txcur);
973				txavail--;
974				sent++;
975			}
976		}
977		txring->head = txring->cur = txcur;
978		targ->count = sent;
979#ifdef BUSYWAIT
980		ioctl(pfd.fd, NIOCTXSYNC, NULL);
981#endif
982		//D("tx %d rx %d", sent, rx);
983	}
984	return NULL;
985}
986
/* Return 1 if time *a is later than or equal to *b, 0 otherwise. */
static __inline int
timespec_ge(const struct timespec *a, const struct timespec *b)
{
	/* seconds decide unless they are equal */
	if (a->tv_sec != b->tv_sec)
		return (a->tv_sec > b->tv_sec ? 1 : 0);
	return (a->tv_nsec >= b->tv_nsec ? 1 : 0);
}
999
/* Convert a struct timeval (microseconds) to a struct timespec. */
static __inline struct timespec
timeval2spec(const struct timeval *a)
{
	struct timespec ts = { 0 };

	ts.tv_sec = a->tv_sec;
	ts.tv_nsec = a->tv_usec * 1000;
	return ts;
}
1009
/*
 * Convert a struct timespec to a struct timeval; the nanoseconds
 * are divided by 1000 with integer division.
 */
static __inline struct timeval
timespec2val(const struct timespec *a)
{
	struct timeval tv = { 0 };

	tv.tv_sec = a->tv_sec;
	tv.tv_usec = a->tv_nsec / 1000;
	return tv;
}
1019
1020
/*
 * Return a + b. A nanosecond total of one second or more is carried
 * into the seconds (one carry at most).
 */
static __inline struct timespec
timespec_add(struct timespec a, struct timespec b)
{
	struct timespec sum = { 0 };

	sum.tv_sec = a.tv_sec + b.tv_sec;
	sum.tv_nsec = a.tv_nsec + b.tv_nsec;
	if (sum.tv_nsec >= 1000000000) {
		sum.tv_nsec -= 1000000000;
		sum.tv_sec += 1;
	}
	return sum;
}
1031
/*
 * Return a - b. A negative nanosecond difference borrows one second,
 * so tv_nsec of the result is not negative for normalized inputs and
 * a result with tv_sec < 0 means a is earlier than b.
 */
static __inline struct timespec
timespec_sub(struct timespec a, struct timespec b)
{
	struct timespec diff = { 0 };

	diff.tv_sec = a.tv_sec - b.tv_sec;
	diff.tv_nsec = a.tv_nsec - b.tv_nsec;
	if (diff.tv_nsec < 0) {
		diff.tv_nsec += 1000000000;
		diff.tv_sec -= 1;
	}
	return diff;
}
1042
1043
1044/*
1045 * wait until ts, either busy or sleeping if more than 1ms.
1046 * Return wakeup time.
1047 */
1048static struct timespec
1049wait_time(struct timespec ts)
1050{
1051	for (;;) {
1052		struct timespec w, cur;
1053		clock_gettime(CLOCK_REALTIME_PRECISE, &cur);
1054		w = timespec_sub(ts, cur);
1055		if (w.tv_sec < 0)
1056			return cur;
1057		else if (w.tv_sec > 0 || w.tv_nsec > 1000000)
1058			poll(NULL, 0, 1);
1059	}
1060}
1061
1062static void *
1063sender_body(void *data)
1064{
1065	struct targ *targ = (struct targ *) data;
1066	struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
1067	struct netmap_if *nifp;
1068	struct netmap_ring *txring;
1069	int i, n = targ->g->npackets / targ->g->nthreads;
1070	int64_t sent = 0;
1071	int options = targ->g->options | OPT_COPY;
1072	struct timespec nexttime = { 0, 0}; // XXX silence compiler
1073	int rate_limit = targ->g->tx_rate;
1074	struct pkt *pkt = &targ->pkt;
1075	void *frame;
1076	int size;
1077
1078	if (targ->frame == NULL) {
1079		frame = pkt;
1080		frame += sizeof(pkt->vh) - targ->g->virt_header;
1081		size = targ->g->pkt_size + targ->g->virt_header;
1082	} else {
1083		frame = targ->frame;
1084		size = targ->g->pkt_size;
1085	}
1086
1087	D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
1088	if (setaffinity(targ->thread, targ->affinity))
1089		goto quit;
1090
1091	/* main loop.*/
1092	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1093	if (rate_limit) {
1094		targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
1095		targ->tic.tv_nsec = 0;
1096		wait_time(targ->tic);
1097		nexttime = targ->tic;
1098	}
1099        if (targ->g->dev_type == DEV_TAP) {
1100	    D("writing to file desc %d", targ->g->main_fd);
1101
1102	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1103		if (write(targ->g->main_fd, frame, size) != -1)
1104			sent++;
1105		update_addresses(pkt, targ->g);
1106		if (i > 10000) {
1107			targ->count = sent;
1108			i = 0;
1109		}
1110	    }
1111#ifndef NO_PCAP
1112    } else if (targ->g->dev_type == DEV_PCAP) {
1113	    pcap_t *p = targ->g->p;
1114
1115	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1116		if (pcap_inject(p, frame, size) != -1)
1117			sent++;
1118		update_addresses(pkt, targ->g);
1119		if (i > 10000) {
1120			targ->count = sent;
1121			i = 0;
1122		}
1123	    }
1124#endif /* NO_PCAP */
1125    } else {
1126	int tosend = 0;
1127	int frags = targ->g->frags;
1128
1129        nifp = targ->nmd->nifp;
1130	while (!targ->cancel && (n == 0 || sent < n)) {
1131
1132		if (rate_limit && tosend <= 0) {
1133			tosend = targ->g->burst;
1134			nexttime = timespec_add(nexttime, targ->g->tx_period);
1135			wait_time(nexttime);
1136		}
1137
1138		/*
1139		 * wait for available room in the send queue(s)
1140		 */
1141		if (poll(&pfd, 1, 2000) <= 0) {
1142			if (targ->cancel)
1143				break;
1144			D("poll error/timeout on queue %d: %s", targ->me,
1145				strerror(errno));
1146			// goto quit;
1147		}
1148		if (pfd.revents & POLLERR) {
1149			D("poll error");
1150			goto quit;
1151		}
1152		/*
1153		 * scan our queues and send on those with room
1154		 */
1155		if (options & OPT_COPY && sent > 100000 && !(targ->g->options & OPT_COPY) ) {
1156			D("drop copy");
1157			options &= ~OPT_COPY;
1158		}
1159		for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1160			int m, limit = rate_limit ?  tosend : targ->g->burst;
1161			if (n > 0 && n - sent < limit)
1162				limit = n - sent;
1163			txring = NETMAP_TXRING(nifp, i);
1164			if (nm_ring_empty(txring))
1165				continue;
1166			if (frags > 1)
1167				limit = ((limit + frags - 1) / frags) * frags;
1168
1169			m = send_packets(txring, pkt, frame, size, targ->g,
1170					 limit, options, frags);
1171			ND("limit %d tail %d frags %d m %d",
1172				limit, txring->tail, frags, m);
1173			sent += m;
1174			targ->count = sent;
1175			if (rate_limit) {
1176				tosend -= m;
1177				if (tosend <= 0)
1178					break;
1179			}
1180		}
1181	}
1182	/* flush any remaining packets */
1183	D("flush tail %d head %d on thread %p",
1184		txring->tail, txring->head,
1185		pthread_self());
1186	ioctl(pfd.fd, NIOCTXSYNC, NULL);
1187
1188	/* final part: wait all the TX queues to be empty. */
1189	for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1190		txring = NETMAP_TXRING(nifp, i);
1191		while (nm_tx_pending(txring)) {
1192			RD(5, "pending tx tail %d head %d on ring %d",
1193				txring->tail, txring->head, i);
1194			ioctl(pfd.fd, NIOCTXSYNC, NULL);
1195			usleep(1); /* wait 1 tick */
1196		}
1197	}
1198    } /* end DEV_NETMAP */
1199
1200	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1201	targ->completed = 1;
1202	targ->count = sent;
1203
1204quit:
1205	/* reset the ``used`` flag. */
1206	targ->used = 0;
1207
1208	return (NULL);
1209}
1210
1211
1212#ifndef NO_PCAP
1213static void
1214receive_pcap(u_char *user, const struct pcap_pkthdr * h,
1215	const u_char * bytes)
1216{
1217	int *count = (int *)user;
1218	(void)h;	/* UNUSED */
1219	(void)bytes;	/* UNUSED */
1220	(*count)++;
1221}
1222#endif /* !NO_PCAP */
1223
1224static int
1225receive_packets(struct netmap_ring *ring, u_int limit, int dump)
1226{
1227	u_int cur, rx, n;
1228
1229	cur = ring->cur;
1230	n = nm_ring_space(ring);
1231	if (n < limit)
1232		limit = n;
1233	for (rx = 0; rx < limit; rx++) {
1234		struct netmap_slot *slot = &ring->slot[cur];
1235		char *p = NETMAP_BUF(ring, slot->buf_idx);
1236
1237		if (dump)
1238			dump_payload(p, slot->len, ring, cur);
1239
1240		cur = nm_ring_next(ring, cur);
1241	}
1242	ring->head = ring->cur = cur;
1243
1244	return (rx);
1245}
1246
1247static void *
1248receiver_body(void *data)
1249{
1250	struct targ *targ = (struct targ *) data;
1251	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1252	struct netmap_if *nifp;
1253	struct netmap_ring *rxring;
1254	int i;
1255	uint64_t received = 0;
1256
1257	if (setaffinity(targ->thread, targ->affinity))
1258		goto quit;
1259
1260	D("reading from %s fd %d main_fd %d",
1261		targ->g->ifname, targ->fd, targ->g->main_fd);
1262	/* unbounded wait for the first packet. */
1263	for (;!targ->cancel;) {
1264		i = poll(&pfd, 1, 1000);
1265		if (i > 0 && !(pfd.revents & POLLERR))
1266			break;
1267		RD(1, "waiting for initial packets, poll returns %d %d",
1268			i, pfd.revents);
1269	}
1270	/* main loop, exit after 1s silence */
1271	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1272    if (targ->g->dev_type == DEV_TAP) {
1273	while (!targ->cancel) {
1274		char buf[MAX_BODYSIZE];
1275		/* XXX should we poll ? */
1276		if (read(targ->g->main_fd, buf, sizeof(buf)) > 0)
1277			targ->count++;
1278	}
1279#ifndef NO_PCAP
1280    } else if (targ->g->dev_type == DEV_PCAP) {
1281	while (!targ->cancel) {
1282		/* XXX should we poll ? */
1283		pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap,
1284			(u_char *)&targ->count);
1285	}
1286#endif /* !NO_PCAP */
1287    } else {
1288	int dump = targ->g->options & OPT_DUMP;
1289
1290        nifp = targ->nmd->nifp;
1291	while (!targ->cancel) {
1292		/* Once we started to receive packets, wait at most 1 seconds
1293		   before quitting. */
1294		if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
1295			clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1296			targ->toc.tv_sec -= 1; /* Subtract timeout time. */
1297			goto out;
1298		}
1299
1300		if (pfd.revents & POLLERR) {
1301			D("poll err");
1302			goto quit;
1303		}
1304
1305		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
1306			int m;
1307
1308			rxring = NETMAP_RXRING(nifp, i);
1309			if (nm_ring_empty(rxring))
1310				continue;
1311
1312			m = receive_packets(rxring, targ->g->burst, dump);
1313			received += m;
1314		}
1315		targ->count = received;
1316	}
1317    }
1318
1319	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1320
1321out:
1322	targ->completed = 1;
1323	targ->count = received;
1324
1325quit:
1326	/* reset the ``used`` flag. */
1327	targ->used = 0;
1328
1329	return (NULL);
1330}
1331
/*
 * Very crude pretty-printer: scale 'val' down by factors of 1000 until
 * it is below 1000 (or the largest unit, T, is reached) and write it
 * into 'buf' as "<value with two decimals> <unit>", e.g. 1500 becomes
 * "1.50 K". Values below 1000 get an empty unit ("999.00 ").
 *
 * No bounds checking is done: the caller has to make sure that 'buf'
 * is large enough. Returns 'buf'.
 */
static const char *
norm(char *buf, double val)
{
	static const char *const suffix[] = { "", "K", "M", "G", "T" };
	const size_t last = sizeof(suffix) / sizeof(suffix[0]) - 1;
	size_t u = 0;

	while (val >= 1000 && u < last) {
		val /= 1000;
		u++;
	}
	sprintf(buf, "%.2f %s", val, suffix[u]);
	return buf;
}
1346
/*
 * Print the transmit summary on stdout.
 *
 * sent:  total number of packets sent
 * size:  packet size in bytes, as configured
 * delta: elapsed time in seconds
 *
 * The first line reports the raw arguments. For the rate figures a zero
 * 'delta' is replaced by 1e-6 s and sizes below 60 bytes are rounded up
 * to 60 (minimum packet size). The "raw" bandwidth adds 24 bytes per
 * packet: 4 bytes of CRC plus 20 bytes of framing.
 */
static void
tx_output(uint64_t sent, int size, double delta)
{
	char pps_str[40], bw_str[80], raw_str[80];
	double elapsed, pps, bw, raw_bw;
	int wire_size;

	printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n",
	       (unsigned long long)sent, size, delta);

	elapsed = (delta == 0) ? 1e-6 : delta;
	wire_size = (size < 60) ? 60 : size;

	pps = sent / elapsed;
	bw = (8.0 * wire_size * sent) / elapsed;
	raw_bw = (8.0 * (wire_size + 24) * sent) / elapsed;

	printf("Speed: %spps Bandwidth: %sbps (raw %sbps)\n",
		norm(pps_str, pps), norm(bw_str, bw), norm(raw_str, raw_bw) );
}
1367
1368
/*
 * Print the receive summary on stdout.
 *
 * received: total number of packets received
 * delta:    elapsed time in seconds
 *
 * The packet rate is computed over 'delta', with a zero interval
 * replaced by 1e-6 s so the division is always defined.
 */
static void
rx_output(uint64_t received, double delta)
{
	char pps_str[40];
	double elapsed;

	printf("Received %llu packets, in %.2f seconds.\n",
		(unsigned long long) received, delta);

	elapsed = (delta == 0) ? 1e-6 : delta;
	printf("Speed: %spps\n", norm(pps_str, received / elapsed));
}
1383
/*
 * Print the command line summary on stderr and terminate the program.
 *
 * The option list mirrors the getopt() string used by main(): the
 * entries for -t and -r (not accepted by getopt) and the stale
 * "-P use libpcap" line (-P takes a pcap file; libpcap mode is chosen
 * with the pcap: interface prefix) were dropped, and the accepted but
 * previously undocumented options -F -I -o -W -v -C -e -m were added.
 *
 * Does not return. NOTE(review): the exit status is 0 even though most
 * callers invoke usage() after an error; kept as is because scripts may
 * depend on it.
 */
static void
usage(void)
{
	const char *cmd = "pkt-gen";
	fprintf(stderr,
		"Usage:\n"
		"%s arguments\n"
		"\t-i interface		interface name; a tap: pcap: or netmap:\n"
		"\t			prefix forces the mode\n"
		"\t-f function		tx rx ping pong\n"
		"\t-n count		number of iterations (can be 0)\n"
		"\t-l pkt_size		in bytes excluding CRC\n"
		"\t-F num_frags		fragments per packet [1..63]\n"
		"\t-d dst_ip[:port[-dst_ip:port]]   single or range\n"
		"\t-s src_ip[:port[-src_ip:port]]   single or range\n"
		"\t-D dst-mac\n"
		"\t-S src-mac\n"
		"\t-a cpu_id		use setaffinity\n"
		"\t-b burst size		testing, mostly\n"
		"\t-c cores		cores to use\n"
		"\t-p threads		processes/threads to use\n"
		"\t-T report_ms		milliseconds between reports\n"
		"\t-w wait_for_link_time	in seconds\n"
		"\t-W			exit rx after 1 second without traffic\n"
		"\t-R rate		in packets per second\n"
		"\t-X			dump payload\n"
		"\t-H len		add empty virtio-net-header with size 'len'\n"
		"\t-P file		load packet from pcap file\n"
		"\t-z			use random IPv4 src address/port\n"
		"\t-Z			use random IPv4 dst address/port\n"
		"\t-I			use indirect buffers\n"
		"\t-o options		data generation options (bitmask)\n"
		"\t-C config		netmap configuration string\n"
		"\t-e extra_bufs		extra netmap buffers to request\n"
		"\t-m tx|rx		monitor mode\n"
		"\t-v			be verbose (can be repeated)\n"
		"",
		cmd);

	exit(0);
}
1419
1420static void
1421start_threads(struct glob_arg *g)
1422{
1423	int i;
1424
1425	targs = calloc(g->nthreads, sizeof(*targs));
1426	/*
1427	 * Now create the desired number of threads, each one
1428	 * using a single descriptor.
1429 	 */
1430	for (i = 0; i < g->nthreads; i++) {
1431		struct targ *t = &targs[i];
1432
1433		bzero(t, sizeof(*t));
1434		t->fd = -1; /* default, with pcap */
1435		t->g = g;
1436
1437	    if (g->dev_type == DEV_NETMAP) {
1438		struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */
1439		uint64_t nmd_flags = 0;
1440		nmd.self = &nmd;
1441
1442		if (g->nthreads > 1) {
1443			if (nmd.req.nr_flags != NR_REG_ALL_NIC) {
1444				D("invalid nthreads mode %d", nmd.req.nr_flags);
1445				continue;
1446			}
1447			nmd.req.nr_flags = NR_REG_ONE_NIC;
1448			nmd.req.nr_ringid = i;
1449		}
1450		/* Only touch one of the rings (rx is already ok) */
1451		if (g->td_body == receiver_body)
1452			nmd_flags |= NETMAP_NO_TX_POLL;
1453
1454		/* register interface. Override ifname and ringid etc. */
1455		if (g->options & OPT_MONITOR_TX)
1456			nmd.req.nr_flags |= NR_MONITOR_TX;
1457		if (g->options & OPT_MONITOR_RX)
1458			nmd.req.nr_flags |= NR_MONITOR_RX;
1459
1460		t->nmd = nm_open(t->g->ifname, NULL, nmd_flags |
1461			NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd);
1462		if (t->nmd == NULL) {
1463			D("Unable to open %s: %s",
1464				t->g->ifname, strerror(errno));
1465			continue;
1466		}
1467		t->fd = t->nmd->fd;
1468		set_vnet_hdr_len(t);
1469
1470	    } else {
1471		targs[i].fd = g->main_fd;
1472	    }
1473		t->used = 1;
1474		t->me = i;
1475		if (g->affinity >= 0) {
1476			if (g->affinity < g->cpus)
1477				t->affinity = g->affinity;
1478			else
1479				t->affinity = i % g->cpus;
1480		} else {
1481			t->affinity = -1;
1482		}
1483		/* default, init packets */
1484		initialize_packet(t);
1485
1486		if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) {
1487			D("Unable to create thread %d: %s", i, strerror(errno));
1488			t->used = 0;
1489		}
1490	}
1491}
1492
1493static void
1494main_thread(struct glob_arg *g)
1495{
1496	int i;
1497
1498	uint64_t prev = 0;
1499	uint64_t count = 0;
1500	double delta_t;
1501	struct timeval tic, toc;
1502
1503	gettimeofday(&toc, NULL);
1504	for (;;) {
1505		struct timeval now, delta;
1506		uint64_t pps, usec, my_count, npkts;
1507		int done = 0;
1508
1509		delta.tv_sec = g->report_interval/1000;
1510		delta.tv_usec = (g->report_interval%1000)*1000;
1511		select(0, NULL, NULL, NULL, &delta);
1512		gettimeofday(&now, NULL);
1513		timersub(&now, &toc, &toc);
1514		my_count = 0;
1515		for (i = 0; i < g->nthreads; i++) {
1516			my_count += targs[i].count;
1517			if (targs[i].used == 0)
1518				done++;
1519		}
1520		usec = toc.tv_sec* 1000000 + toc.tv_usec;
1521		if (usec < 10000)
1522			continue;
1523		npkts = my_count - prev;
1524		pps = (npkts*1000000 + usec/2) / usec;
1525		D("%llu pps (%llu pkts in %llu usec)",
1526			(unsigned long long)pps,
1527			(unsigned long long)npkts,
1528			(unsigned long long)usec);
1529		prev = my_count;
1530		toc = now;
1531		if (done == g->nthreads)
1532			break;
1533	}
1534
1535	timerclear(&tic);
1536	timerclear(&toc);
1537	for (i = 0; i < g->nthreads; i++) {
1538		struct timespec t_tic, t_toc;
1539		/*
1540		 * Join active threads, unregister interfaces and close
1541		 * file descriptors.
1542		 */
1543		if (targs[i].used)
1544			pthread_join(targs[i].thread, NULL);
1545		close(targs[i].fd);
1546
1547		if (targs[i].completed == 0)
1548			D("ouch, thread %d exited with error", i);
1549
1550		/*
1551		 * Collect threads output and extract information about
1552		 * how long it took to send all the packets.
1553		 */
1554		count += targs[i].count;
1555		t_tic = timeval2spec(&tic);
1556		t_toc = timeval2spec(&toc);
1557		if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic))
1558			tic = timespec2val(&targs[i].tic);
1559		if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc))
1560			toc = timespec2val(&targs[i].toc);
1561	}
1562
1563	/* print output. */
1564	timersub(&toc, &tic, &toc);
1565	delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
1566	if (g->td_body == sender_body)
1567		tx_output(count, g->pkt_size, delta_t);
1568	else
1569		rx_output(count, delta_t);
1570
1571	if (g->dev_type == DEV_NETMAP) {
1572		munmap(g->nmd->mem, g->nmd->req.nr_memsize);
1573		close(g->main_fd);
1574	}
1575}
1576
1577
1578struct sf {
1579	char *key;
1580	void *f;
1581};
1582
1583static struct sf func[] = {
1584	{ "tx",	sender_body },
1585	{ "rx",	receiver_body },
1586	{ "ping",	pinger_body },
1587	{ "pong",	ponger_body },
1588	{ NULL, NULL }
1589};
1590
1591static int
1592tap_alloc(char *dev)
1593{
1594	struct ifreq ifr;
1595	int fd, err;
1596	char *clonedev = TAP_CLONEDEV;
1597
1598	(void)err;
1599	(void)dev;
1600	/* Arguments taken by the function:
1601	 *
1602	 * char *dev: the name of an interface (or '\0'). MUST have enough
1603	 *   space to hold the interface name if '\0' is passed
1604	 * int flags: interface flags (eg, IFF_TUN etc.)
1605	 */
1606
1607#ifdef __FreeBSD__
1608	if (dev[3]) { /* tapSomething */
1609		static char buf[128];
1610		snprintf(buf, sizeof(buf), "/dev/%s", dev);
1611		clonedev = buf;
1612	}
1613#endif
1614	/* open the device */
1615	if( (fd = open(clonedev, O_RDWR)) < 0 ) {
1616		return fd;
1617	}
1618	D("%s open successful", clonedev);
1619
1620	/* preparation of the struct ifr, of type "struct ifreq" */
1621	memset(&ifr, 0, sizeof(ifr));
1622
1623#ifdef linux
1624	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
1625
1626	if (*dev) {
1627		/* if a device name was specified, put it in the structure; otherwise,
1628		* the kernel will try to allocate the "next" device of the
1629		* specified type */
1630		strncpy(ifr.ifr_name, dev, IFNAMSIZ);
1631	}
1632
1633	/* try to create the device */
1634	if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) {
1635		D("failed to to a TUNSETIFF: %s", strerror(errno));
1636		close(fd);
1637		return err;
1638	}
1639
1640	/* if the operation was successful, write back the name of the
1641	* interface to the variable "dev", so the caller can know
1642	* it. Note that the caller MUST reserve space in *dev (see calling
1643	* code below) */
1644	strcpy(dev, ifr.ifr_name);
1645	D("new name is %s", dev);
1646#endif /* linux */
1647
1648        /* this is the special file descriptor that the caller will use to talk
1649         * with the virtual interface */
1650        return fd;
1651}
1652
1653int
1654main(int arc, char **argv)
1655{
1656	int i;
1657
1658	struct glob_arg g;
1659
1660	int ch;
1661	int wait_link = 2;
1662	int devqueues = 1;	/* how many device queues */
1663
1664	bzero(&g, sizeof(g));
1665
1666	g.main_fd = -1;
1667	g.td_body = receiver_body;
1668	g.report_interval = 1000;	/* report interval */
1669	g.affinity = -1;
1670	/* ip addresses can also be a range x.x.x.x-x.x.x.y */
1671	g.src_ip.name = "10.0.0.1";
1672	g.dst_ip.name = "10.1.0.1";
1673	g.dst_mac.name = "ff:ff:ff:ff:ff:ff";
1674	g.src_mac.name = NULL;
1675	g.pkt_size = 60;
1676	g.burst = 512;		// default
1677	g.nthreads = 1;
1678	g.cpus = 1;
1679	g.forever = 1;
1680	g.tx_rate = 0;
1681	g.frags = 1;
1682	g.nmr_config = "";
1683	g.virt_header = 0;
1684
1685	while ( (ch = getopt(arc, argv,
1686			"a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:m:P:zZ")) != -1) {
1687		struct sf *fn;
1688
1689		switch(ch) {
1690		default:
1691			D("bad option %c %s", ch, optarg);
1692			usage();
1693			break;
1694
1695		case 'n':
1696			g.npackets = atoi(optarg);
1697			break;
1698
1699		case 'F':
1700			i = atoi(optarg);
1701			if (i < 1 || i > 63) {
1702				D("invalid frags %d [1..63], ignore", i);
1703				break;
1704			}
1705			g.frags = i;
1706			break;
1707
1708		case 'f':
1709			for (fn = func; fn->key; fn++) {
1710				if (!strcmp(fn->key, optarg))
1711					break;
1712			}
1713			if (fn->key)
1714				g.td_body = fn->f;
1715			else
1716				D("unrecognised function %s", optarg);
1717			break;
1718
1719		case 'o':	/* data generation options */
1720			g.options = atoi(optarg);
1721			break;
1722
1723		case 'a':       /* force affinity */
1724			g.affinity = atoi(optarg);
1725			break;
1726
1727		case 'i':	/* interface */
1728			/* a prefix of tap: netmap: or pcap: forces the mode.
1729			 * otherwise we guess
1730			 */
1731			D("interface is %s", optarg);
1732			if (strlen(optarg) > MAX_IFNAMELEN - 8) {
1733				D("ifname too long %s", optarg);
1734				break;
1735			}
1736			strcpy(g.ifname, optarg);
1737			if (!strcmp(optarg, "null")) {
1738				g.dev_type = DEV_NETMAP;
1739				g.dummy_send = 1;
1740			} else if (!strncmp(optarg, "tap:", 4)) {
1741				g.dev_type = DEV_TAP;
1742				strcpy(g.ifname, optarg + 4);
1743			} else if (!strncmp(optarg, "pcap:", 5)) {
1744				g.dev_type = DEV_PCAP;
1745				strcpy(g.ifname, optarg + 5);
1746			} else if (!strncmp(optarg, "netmap:", 7) ||
1747				   !strncmp(optarg, "vale", 4)) {
1748				g.dev_type = DEV_NETMAP;
1749			} else if (!strncmp(optarg, "tap", 3)) {
1750				g.dev_type = DEV_TAP;
1751			} else { /* prepend netmap: */
1752				g.dev_type = DEV_NETMAP;
1753				sprintf(g.ifname, "netmap:%s", optarg);
1754			}
1755			break;
1756
1757		case 'I':
1758			g.options |= OPT_INDIRECT;	/* XXX use indirect buffer */
1759			break;
1760
1761		case 'l':	/* pkt_size */
1762			g.pkt_size = atoi(optarg);
1763			break;
1764
1765		case 'd':
1766			g.dst_ip.name = optarg;
1767			break;
1768
1769		case 's':
1770			g.src_ip.name = optarg;
1771			break;
1772
1773		case 'T':	/* report interval */
1774			g.report_interval = atoi(optarg);
1775			break;
1776
1777		case 'w':
1778			wait_link = atoi(optarg);
1779			break;
1780
1781		case 'W': /* XXX changed default */
1782			g.forever = 0; /* do not exit rx even with no traffic */
1783			break;
1784
1785		case 'b':	/* burst */
1786			g.burst = atoi(optarg);
1787			break;
1788		case 'c':
1789			g.cpus = atoi(optarg);
1790			break;
1791		case 'p':
1792			g.nthreads = atoi(optarg);
1793			break;
1794
1795		case 'D': /* destination mac */
1796			g.dst_mac.name = optarg;
1797			break;
1798
1799		case 'S': /* source mac */
1800			g.src_mac.name = optarg;
1801			break;
1802		case 'v':
1803			verbose++;
1804			break;
1805		case 'R':
1806			g.tx_rate = atoi(optarg);
1807			break;
1808		case 'X':
1809			g.options |= OPT_DUMP;
1810			break;
1811		case 'C':
1812			g.nmr_config = strdup(optarg);
1813			break;
1814		case 'H':
1815			g.virt_header = atoi(optarg);
1816			break;
1817		case 'e': /* extra bufs */
1818			g.extra_bufs = atoi(optarg);
1819			break;
1820		case 'm':
1821			if (strcmp(optarg, "tx") == 0) {
1822				g.options |= OPT_MONITOR_TX;
1823			} else if (strcmp(optarg, "rx") == 0) {
1824				g.options |= OPT_MONITOR_RX;
1825			} else {
1826				D("unrecognized monitor mode %s", optarg);
1827			}
1828			break;
1829		case 'P':
1830			g.packet_file = strdup(optarg);
1831			break;
1832		case 'z':
1833			g.options |= OPT_RANDOM_SRC;
1834			break;
1835		case 'Z':
1836			g.options |= OPT_RANDOM_DST;
1837			break;
1838		}
1839	}
1840
1841	if (strlen(g.ifname) <=0 ) {
1842		D("missing ifname");
1843		usage();
1844	}
1845
1846	i = system_ncpus();
1847	if (g.cpus < 0 || g.cpus > i) {
1848		D("%d cpus is too high, have only %d cpus", g.cpus, i);
1849		usage();
1850	}
1851	if (g.cpus == 0)
1852		g.cpus = i;
1853
1854	if (g.pkt_size < 16 || g.pkt_size > MAX_PKTSIZE) {
1855		D("bad pktsize %d [16..%d]\n", g.pkt_size, MAX_PKTSIZE);
1856		usage();
1857	}
1858
1859	if (g.src_mac.name == NULL) {
1860		static char mybuf[20] = "00:00:00:00:00:00";
1861		/* retrieve source mac address. */
1862		if (source_hwaddr(g.ifname, mybuf) == -1) {
1863			D("Unable to retrieve source mac");
1864			// continue, fail later
1865		}
1866		g.src_mac.name = mybuf;
1867	}
1868	/* extract address ranges */
1869	extract_ip_range(&g.src_ip);
1870	extract_ip_range(&g.dst_ip);
1871	extract_mac_range(&g.src_mac);
1872	extract_mac_range(&g.dst_mac);
1873
1874	if (g.src_ip.start != g.src_ip.end ||
1875	    g.src_ip.port0 != g.src_ip.port1 ||
1876	    g.dst_ip.start != g.dst_ip.end ||
1877	    g.dst_ip.port0 != g.dst_ip.port1)
1878		g.options |= OPT_COPY;
1879
1880	if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1
1881			&& g.virt_header != VIRT_HDR_2) {
1882		D("bad virtio-net-header length");
1883		usage();
1884	}
1885
1886    if (g.dev_type == DEV_TAP) {
1887	D("want to use tap %s", g.ifname);
1888	g.main_fd = tap_alloc(g.ifname);
1889	if (g.main_fd < 0) {
1890		D("cannot open tap %s", g.ifname);
1891		usage();
1892	}
1893#ifndef NO_PCAP
1894    } else if (g.dev_type == DEV_PCAP) {
1895	char pcap_errbuf[PCAP_ERRBUF_SIZE];
1896
1897	pcap_errbuf[0] = '\0'; // init the buffer
1898	g.p = pcap_open_live(g.ifname, 256 /* XXX */, 1, 100, pcap_errbuf);
1899	if (g.p == NULL) {
1900		D("cannot open pcap on %s", g.ifname);
1901		usage();
1902	}
1903	g.main_fd = pcap_fileno(g.p);
1904	D("using pcap on %s fileno %d", g.ifname, g.main_fd);
1905#endif /* !NO_PCAP */
1906    } else if (g.dummy_send) { /* but DEV_NETMAP */
1907	D("using a dummy send routine");
1908    } else {
1909	struct nmreq base_nmd;
1910
1911	bzero(&base_nmd, sizeof(base_nmd));
1912
1913	parse_nmr_config(g.nmr_config, &base_nmd);
1914	if (g.extra_bufs) {
1915		base_nmd.nr_arg3 = g.extra_bufs;
1916	}
1917
1918	/*
1919	 * Open the netmap device using nm_open().
1920	 *
1921	 * protocol stack and may cause a reset of the card,
1922	 * which in turn may take some time for the PHY to
1923	 * reconfigure. We do the open here to have time to reset.
1924	 */
1925	g.nmd = nm_open(g.ifname, &base_nmd, 0, NULL);
1926	if (g.nmd == NULL) {
1927		D("Unable to open %s: %s", g.ifname, strerror(errno));
1928		goto out;
1929	}
1930	g.main_fd = g.nmd->fd;
1931	D("mapped %dKB at %p", g.nmd->req.nr_memsize>>10, g.nmd->mem);
1932
1933	/* get num of queues in tx or rx */
1934	if (g.td_body == sender_body)
1935		devqueues = g.nmd->req.nr_tx_rings;
1936	else
1937		devqueues = g.nmd->req.nr_rx_rings;
1938
1939	/* validate provided nthreads. */
1940	if (g.nthreads < 1 || g.nthreads > devqueues) {
1941		D("bad nthreads %d, have %d queues", g.nthreads, devqueues);
1942		// continue, fail later
1943	}
1944
1945	if (verbose) {
1946		struct netmap_if *nifp = g.nmd->nifp;
1947		struct nmreq *req = &g.nmd->req;
1948
1949		D("nifp at offset %d, %d tx %d rx region %d",
1950		    req->nr_offset, req->nr_tx_rings, req->nr_rx_rings,
1951		    req->nr_arg2);
1952		for (i = 0; i <= req->nr_tx_rings; i++) {
1953			struct netmap_ring *ring = NETMAP_TXRING(nifp, i);
1954			D("   TX%d at 0x%lx slots %d", i,
1955			    (char *)ring - (char *)nifp, ring->num_slots);
1956		}
1957		for (i = 0; i <= req->nr_rx_rings; i++) {
1958			struct netmap_ring *ring = NETMAP_RXRING(nifp, i);
1959			D("   RX%d at 0x%lx slots %d", i,
1960			    (char *)ring - (char *)nifp, ring->num_slots);
1961		}
1962	}
1963
1964	/* Print some debug information. */
1965	fprintf(stdout,
1966		"%s %s: %d queues, %d threads and %d cpus.\n",
1967		(g.td_body == sender_body) ? "Sending on" : "Receiving from",
1968		g.ifname,
1969		devqueues,
1970		g.nthreads,
1971		g.cpus);
1972	if (g.td_body == sender_body) {
1973		fprintf(stdout, "%s -> %s (%s -> %s)\n",
1974			g.src_ip.name, g.dst_ip.name,
1975			g.src_mac.name, g.dst_mac.name);
1976	}
1977
1978out:
1979	/* Exit if something went wrong. */
1980	if (g.main_fd < 0) {
1981		D("aborting");
1982		usage();
1983	}
1984    }
1985
1986
1987	if (g.options) {
1988		D("--- SPECIAL OPTIONS:%s%s%s%s%s\n",
1989			g.options & OPT_PREFETCH ? " prefetch" : "",
1990			g.options & OPT_ACCESS ? " access" : "",
1991			g.options & OPT_MEMCPY ? " memcpy" : "",
1992			g.options & OPT_INDIRECT ? " indirect" : "",
1993			g.options & OPT_COPY ? " copy" : "");
1994	}
1995
1996	g.tx_period.tv_sec = g.tx_period.tv_nsec = 0;
1997	if (g.tx_rate > 0) {
1998		/* try to have at least something every second,
1999		 * reducing the burst size to some 0.01s worth of data
2000		 * (but no less than one full set of fragments)
2001	 	 */
2002		uint64_t x;
2003		int lim = (g.tx_rate)/300;
2004		if (g.burst > lim)
2005			g.burst = lim;
2006		if (g.burst < g.frags)
2007			g.burst = g.frags;
2008		x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate;
2009		g.tx_period.tv_nsec = x;
2010		g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000;
2011		g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000;
2012	}
2013	if (g.td_body == sender_body)
2014	    D("Sending %d packets every  %ld.%09ld s",
2015			g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec);
2016	/* Wait for PHY reset. */
2017	D("Wait %d secs for phy reset", wait_link);
2018	sleep(wait_link);
2019	D("Ready...");
2020
2021	/* Install ^C handler. */
2022	global_nthreads = g.nthreads;
2023	signal(SIGINT, sigint_h);
2024
2025	start_threads(&g);
2026	main_thread(&g);
2027	return 0;
2028}
2029
2030/* end of file */
2031