pkt-gen.c revision 260700
1/*
2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *   1. Redistributions of source code must retain the above copyright
9 *      notice, this list of conditions and the following disclaimer.
10 *   2. Redistributions in binary form must reproduce the above copyright
11 *      notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27/*
28 * $FreeBSD: head/tools/tools/netmap/pkt-gen.c 260700 2014-01-16 00:20:42Z luigi $
29 * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $
30 *
31 * Example program to show how to build a multithreaded packet
32 * source/sink using the netmap device.
33 *
34 * In this example we create a programmable number of threads
35 * to take care of all the queues of the interface used to
36 * send or receive traffic.
37 *
38 */
39
40#define MY_PCAP
41#include "nm_util.h"
42// #include <net/netmap_user.h>
43
44#include <ctype.h>	// isprint()
45
46#ifndef NO_PCAP
47#include <pcap/pcap.h>
48#endif
/*
 * Text copied (repeatedly) into the packet body by initialize_packet()
 * when OPT_INDIRECT is not set.
 */
const char *default_payload="netmap pkt-gen DIRECT payload\n"
	"http://info.iet.unipi.it/~luigi/netmap/ ";

/* Same as above, selected by initialize_packet() when OPT_INDIRECT is set. */
const char *indirect_payload="netmap pkt-gen indirect payload\n"
	"http://info.iet.unipi.it/~luigi/netmap/ ";

int time_second;	// support for RD() debugging macro

/* When non-zero, the parsing helpers in this file emit extra D() messages. */
int verbose = 0;
58
59#define SKIP_PAYLOAD 1 /* do not check payload. */
60
61
#define VIRT_HDR_1	10	/* length of a base vnet-hdr */
#define VIRT_HDR_2	12	/* length of the extended vnet-hdr */
#define VIRT_HDR_MAX	VIRT_HDR_2
/*
 * Raw storage for a virt header, sized for the largest variant.
 * initialize_packet() zeroes it; no individual field is interpreted
 * in the code visible here.
 */
struct virt_header {
	uint8_t fields[VIRT_HDR_MAX];
};
68
/*
 * Template of the frame that is transmitted: virt header, Ethernet, IPv4
 * and UDP headers followed by the payload.  The struct is packed so the
 * members are laid out back to back; the senders skip the part of 'vh'
 * that is not in use by starting the frame at
 * sizeof(vh) - glob_arg.virt_header bytes into the struct.
 */
struct pkt {
	struct virt_header vh;
	struct ether_header eh;
	struct ip ip;
	struct udphdr udp;
	uint8_t body[2048];	// XXX hardwired
} __attribute__((__packed__));
76
/*
 * A range of IPv4 addresses and UDP ports.
 * 'name' is the textual form parsed (and modified in place) by
 * extract_ip_range(), which fills in the numeric members.
 */
struct ip_range {
	char *name;
	/* first/last address, host byte order (converted with ntohl()) */
	uint32_t start, end; /* same as struct in_addr */
	/* first/last port; extract_ip_range() ensures port0 <= port1 */
	uint16_t port0, port1;
};
82
/*
 * A range of Ethernet addresses.  'name' is the textual form parsed by
 * extract_mac_range(), which currently sets 'start' and 'end' to the
 * same address.
 */
struct mac_range {
	char *name;
	struct ether_addr start, end;
};
87
88/*
89 * global arguments for all threads
90 */
91
struct glob_arg {
	struct ip_range src_ip;
	struct ip_range dst_ip;
	struct mac_range dst_mac;
	struct mac_range src_mac;
	/* frame length in bytes, not counting the virt header */
	int pkt_size;
	/* upper bound on packets handled per send/receive call */
	int burst;
	/* when set, the netmap receiver keeps polling after a timeout */
	int forever;
	int npackets;	/* total packets to send */
	int frags;	/* fragments per packet */
	/* sender threads each send npackets / nthreads packets */
	int nthreads;
	int cpus;
	int options;	/* testing */
#define OPT_PREFETCH	1
#define OPT_ACCESS	2
#define OPT_COPY	4
#define OPT_MEMCPY	8
#define OPT_TS		16	/* add a timestamp */
#define OPT_INDIRECT	32	/* use indirect buffers, tx only */
#define OPT_DUMP	64	/* dump rx/tx traffic */
	/* one of enum dev_type; selects the I/O path in the thread bodies */
	int dev_type;
#ifndef NO_PCAP
	/* handle used with pcap_inject()/pcap_dispatch() in DEV_PCAP mode */
	pcap_t *p;
#endif

	/* non-zero enables rate limiting in sender_body() */
	int tx_rate;
	/* time added between bursts when rate limiting */
	struct timespec tx_period;

	int affinity;
	/* descriptor used with write()/read() in DEV_TAP mode */
	int main_fd;
	int report_interval;		/* milliseconds between prints */
	/* thread entry point -- presumably one of the *_body() functions */
	void *(*td_body)(void *);
	void *mmap_addr;
	int mmap_size;
	char *ifname;
	char *nmr_config;
	int dummy_send;
	/* number of virt header bytes placed in front of each frame */
	int virt_header;	/* send also the virt_header */
	int host_ring;
};
/*
 * Values for glob_arg.dev_type.  The sender and receiver bodies use
 * write()/read() for DEV_TAP, the pcap calls for DEV_PCAP, and the
 * netmap rings for anything else.
 */
enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP };
133
134
135/*
136 * Arguments for a new thread. The same structure is used by
137 * the source and the sink
138 */
struct targ {
	struct glob_arg *g;	/* shared, global arguments */
	int used;		/* cleared by the thread body on exit */
	int completed;		/* set to 1 when the body ends normally */
	int cancel;		/* set by sigint_h() to ask the thread to stop */
	int fd;			/* descriptor passed to poll()/ioctl() */
	struct nmreq nmr;
	struct netmap_if *nifp;
	/* rings qfirst .. qlast-1 are the ones scanned by this thread */
	uint16_t	qfirst, qlast; /* range of queues to scan */
	volatile uint64_t count;	/* packets processed so far */
	struct timespec tic, toc;	/* start and end time of the run */
	int me;			/* printed as the queue number in messages */
	pthread_t thread;
	int affinity;		/* core passed to setaffinity(), -1 for none */

	struct pkt pkt;		/* per-thread packet template */
};
156
157
158/*
159 * extract the extremes from a range of ipv4 addresses.
160 * addr_lo[-addr_hi][:port_lo[-port_hi]]
161 */
162static void
163extract_ip_range(struct ip_range *r)
164{
165	char *ap, *pp;
166	struct in_addr a;
167
168	if (verbose)
169		D("extract IP range from %s", r->name);
170	r->port0 = r->port1 = 0;
171	r->start = r->end = 0;
172
173	/* the first - splits start/end of range */
174	ap = index(r->name, '-');	/* do we have ports ? */
175	if (ap) {
176		*ap++ = '\0';
177	}
178	/* grab the initial values (mandatory) */
179	pp = index(r->name, ':');
180	if (pp) {
181		*pp++ = '\0';
182		r->port0 = r->port1 = strtol(pp, NULL, 0);
183	};
184	inet_aton(r->name, &a);
185	r->start = r->end = ntohl(a.s_addr);
186	if (ap) {
187		pp = index(ap, ':');
188		if (pp) {
189			*pp++ = '\0';
190			if (*pp)
191				r->port1 = strtol(pp, NULL, 0);
192		}
193		if (*ap) {
194			inet_aton(ap, &a);
195			r->end = ntohl(a.s_addr);
196		}
197	}
198	if (r->port0 > r->port1) {
199		uint16_t tmp = r->port0;
200		r->port0 = r->port1;
201		r->port1 = tmp;
202	}
203	if (r->start > r->end) {
204		uint32_t tmp = r->start;
205		r->start = r->end;
206		r->end = tmp;
207	}
208	{
209		struct in_addr a;
210		char buf1[16]; // one ip address
211
212		a.s_addr = htonl(r->end);
213		strncpy(buf1, inet_ntoa(a), sizeof(buf1));
214		a.s_addr = htonl(r->start);
215		if (1)
216		    D("range is %s:%d to %s:%d",
217			inet_ntoa(a), r->port0, buf1, r->port1);
218	}
219}
220
221static void
222extract_mac_range(struct mac_range *r)
223{
224	if (verbose)
225	    D("extract MAC range from %s", r->name);
226	bcopy(ether_aton(r->name), &r->start, 6);
227	bcopy(ether_aton(r->name), &r->end, 6);
228#if 0
229	bcopy(targ->src_mac, eh->ether_shost, 6);
230	p = index(targ->g->src_mac, '-');
231	if (p)
232		targ->src_mac_range = atoi(p+1);
233
234	bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6);
235	bcopy(targ->dst_mac, eh->ether_dhost, 6);
236	p = index(targ->g->dst_mac, '-');
237	if (p)
238		targ->dst_mac_range = atoi(p+1);
239#endif
240	if (verbose)
241		D("%s starts at %s", r->name, ether_ntoa(&r->start));
242}
243
244static struct targ *targs;
245static int global_nthreads;
246
247/* control-C handler */
248static void
249sigint_h(int sig)
250{
251	int i;
252
253	(void)sig;	/* UNUSED */
254	for (i = 0; i < global_nthreads; i++) {
255		targs[i].cancel = 1;
256	}
257	signal(SIGINT, SIG_DFL);
258}
259
/*
 * Return the number of active CPUs.
 * On FreeBSD this reads the hw.ncpu sysctl; on any other system, or if
 * the sysctl fails or reports a value below 1, the result is 1.
 */
static int
system_ncpus(void)
{
#ifdef __FreeBSD__
	int mib[2] = { CTL_HW, HW_NCPU };
	int ncpus = 1;
	/* size of the OUTPUT buffer, not of the mib array */
	size_t len = sizeof(ncpus);

	if (sysctl(mib, 2, &ncpus, &len, NULL, 0) != 0 || ncpus < 1)
		return 1;

	return (ncpus);
#else
	return 1;
#endif /* !__FreeBSD__ */
}
278
#ifdef __linux__
/*
 * Map the BSD link-layer sockaddr names used by source_hwaddr() onto
 * their Linux AF_PACKET equivalents.
 */
#define sockaddr_dl    sockaddr_ll
#define sdl_family     sll_family
#define AF_LINK        AF_PACKET
/* expression macro: fully parenthesized and without a trailing ';' */
#define LLADDR(s)      ((s)->sll_addr)
#include <linux/if_tun.h>
#define TAP_CLONEDEV	"/dev/net/tun"
#endif /* __linux__ */
287
288#ifdef __FreeBSD__
289#include <net/if_tun.h>
290#define TAP_CLONEDEV	"/dev/tap"
291#endif /* __FreeBSD */
292
293#ifdef __APPLE__
294// #warning TAP not supported on apple ?
295#include <net/if_utun.h>
296#define TAP_CLONEDEV	"/dev/tap"
297#endif /* __APPLE__ */
298
299
300/*
301 * parse the vale configuration in conf and put it in nmr.
302 * The configuration may consist of 0 to 4 numbers separated
303 * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings.
304 * Missing numbers or zeroes stand for default values.
305 * As an additional convenience, if exactly one number
306 * is specified, then this is assigned to both #tx-slots and #rx-slots.
307 * If there is no 4th number, then the 3rd is assigned to both #tx-rings
308 * and #rx-rings.
309 */
310void parse_nmr_config(const char* conf, struct nmreq *nmr)
311{
312	char *w, *tok;
313	int i, v;
314
315	nmr->nr_tx_rings = nmr->nr_rx_rings = 0;
316	nmr->nr_tx_slots = nmr->nr_rx_slots = 0;
317	if (conf == NULL || ! *conf)
318		return;
319	w = strdup(conf);
320	for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) {
321		v = atoi(tok);
322		switch (i) {
323		case 0:
324			nmr->nr_tx_slots = nmr->nr_rx_slots = v;
325			break;
326		case 1:
327			nmr->nr_rx_slots = v;
328			break;
329		case 2:
330			nmr->nr_tx_rings = nmr->nr_rx_rings = v;
331			break;
332		case 3:
333			nmr->nr_rx_rings = v;
334			break;
335		default:
336			D("ignored config: %s", tok);
337			break;
338		}
339	}
340	D("txr %d txd %d rxr %d rxd %d",
341			nmr->nr_tx_rings, nmr->nr_tx_slots,
342			nmr->nr_rx_rings, nmr->nr_rx_slots);
343	free(w);
344}
345
346
347/*
348 * locate the src mac address for our interface, put it
349 * into the user-supplied buffer. return 0 if ok, -1 on error.
350 */
351static int
352source_hwaddr(const char *ifname, char *buf)
353{
354	struct ifaddrs *ifaphead, *ifap;
355	int l = sizeof(ifap->ifa_name);
356
357	if (getifaddrs(&ifaphead) != 0) {
358		D("getifaddrs %s failed", ifname);
359		return (-1);
360	}
361
362	for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) {
363		struct sockaddr_dl *sdl =
364			(struct sockaddr_dl *)ifap->ifa_addr;
365		uint8_t *mac;
366
367		if (!sdl || sdl->sdl_family != AF_LINK)
368			continue;
369		if (strncmp(ifap->ifa_name, ifname, l) != 0)
370			continue;
371		mac = (uint8_t *)LLADDR(sdl);
372		sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
373			mac[0], mac[1], mac[2],
374			mac[3], mac[4], mac[5]);
375		if (verbose)
376			D("source hwaddr %s", buf);
377		break;
378	}
379	freeifaddrs(ifaphead);
380	return ifap ? 0 : 1;
381}
382
383
384/* set the thread affinity. */
385static int
386setaffinity(pthread_t me, int i)
387{
388#if 1 // def __FreeBSD__
389	cpuset_t cpumask;
390
391	if (i == -1)
392		return 0;
393
394	/* Set thread affinity affinity.*/
395	CPU_ZERO(&cpumask);
396	CPU_SET(i, &cpumask);
397
398	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
399		D("Unable to set affinity: %s", strerror(errno));
400		return 1;
401	}
402#else
403	(void)me; /* suppress 'unused' warnings */
404	(void)i;
405#endif /* __FreeBSD__ */
406	return 0;
407}
408
/*
 * Accumulate the 16-bit one's complement sum of len bytes at data on
 * top of the running value in sum (pass 0 to start a new checksum).
 * The data is read as big-endian 16-bit words; a trailing odd byte is
 * used as the high byte of a final word.  The result is NOT
 * complemented: pass it to wrapsum() to obtain the header field.
 *
 * Bytes are read one at a time, so data does not need to be aligned.
 */
static uint16_t
checksum(const void *data, uint16_t len, uint32_t sum)
{
	const uint8_t *addr = data;
	uint32_t i;

	/* Checksum all the pairs of bytes first... */
	for (i = 0; i < (len & ~1U); i += 2) {
		sum += ((uint32_t)addr[i] << 8) | addr[i + 1];
		/* end-around carry */
		if (sum > 0xFFFF)
			sum -= 0xFFFF;
	}
	/*
	 * If there's a single byte left over, checksum it, too.
	 * Network byte order is big-endian, so the remaining byte is
	 * the high byte.
	 */
	if (i < len) {
		sum += (uint32_t)addr[i] << 8;
		if (sum > 0xFFFF)
			sum -= 0xFFFF;
	}
	return sum;
}
434
/*
 * Turn an accumulated sum into a checksum field value: take the low
 * 16 bits of its one's complement and convert them to network order.
 */
static u_int16_t
wrapsum(u_int32_t sum)
{
	u_int32_t folded = ~sum;

	folded &= 0xFFFF;
	return (htons(folded));
}
441
442/* Check the payload of the packet for errors (use it for debug).
443 * Look for consecutive ascii representations of the size of the packet.
444 */
445static void
446dump_payload(char *p, int len, struct netmap_ring *ring, int cur)
447{
448	char buf[128];
449	int i, j, i0;
450
451	/* get the length in ASCII of the length of the packet. */
452
453	printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n",
454		ring, cur, ring->slot[cur].buf_idx,
455		ring->slot[cur].flags, len);
456	/* hexdump routine */
457	for (i = 0; i < len; ) {
458		memset(buf, sizeof(buf), ' ');
459		sprintf(buf, "%5d: ", i);
460		i0 = i;
461		for (j=0; j < 16 && i < len; i++, j++)
462			sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i]));
463		i = i0;
464		for (j=0; j < 16 && i < len; i++, j++)
465			sprintf(buf+7+j + 48, "%c",
466				isprint(p[i]) ? p[i] : '.');
467		printf("%s\n", buf);
468	}
469}
470
471/*
472 * Fill a packet with some payload.
473 * We create a UDP packet so the payload starts at
474 *	14+20+8 = 42 bytes.
475 */
476#ifdef __linux__
477#define uh_sport source
478#define uh_dport dest
479#define uh_ulen len
480#define uh_sum check
481#endif /* linux */
482
483/*
484 * increment the addressed in the packet,
485 * starting from the least significant field.
486 *	DST_IP DST_PORT SRC_IP SRC_PORT
487 */
488static void
489update_addresses(struct pkt *pkt, struct glob_arg *g)
490{
491	uint32_t a;
492	uint16_t p;
493	struct ip *ip = &pkt->ip;
494	struct udphdr *udp = &pkt->udp;
495
496    do {
497	p = ntohs(udp->uh_sport);
498	if (p < g->src_ip.port1) { /* just inc, no wrap */
499		udp->uh_sport = htons(p + 1);
500		break;
501	}
502	udp->uh_sport = htons(g->src_ip.port0);
503
504	a = ntohl(ip->ip_src.s_addr);
505	if (a < g->src_ip.end) { /* just inc, no wrap */
506		ip->ip_src.s_addr = htonl(a + 1);
507		break;
508	}
509	ip->ip_src.s_addr = htonl(g->src_ip.start);
510
511	udp->uh_sport = htons(g->src_ip.port0);
512	p = ntohs(udp->uh_dport);
513	if (p < g->dst_ip.port1) { /* just inc, no wrap */
514		udp->uh_dport = htons(p + 1);
515		break;
516	}
517	udp->uh_dport = htons(g->dst_ip.port0);
518
519	a = ntohl(ip->ip_dst.s_addr);
520	if (a < g->dst_ip.end) { /* just inc, no wrap */
521		ip->ip_dst.s_addr = htonl(a + 1);
522		break;
523	}
524	ip->ip_dst.s_addr = htonl(g->dst_ip.start);
525    } while (0);
526    // update checksum
527}
528
529/*
530 * initialize one packet and prepare for the next one.
531 * The copy could be done better instead of repeating it each time.
532 */
533static void
534initialize_packet(struct targ *targ)
535{
536	struct pkt *pkt = &targ->pkt;
537	struct ether_header *eh;
538	struct ip *ip;
539	struct udphdr *udp;
540	uint16_t paylen = targ->g->pkt_size - sizeof(*eh) - sizeof(struct ip);
541	const char *payload = targ->g->options & OPT_INDIRECT ?
542		indirect_payload : default_payload;
543	int i, l0 = strlen(payload);
544
545	/* create a nice NUL-terminated string */
546	for (i = 0; i < paylen; i += l0) {
547		if (l0 > paylen - i)
548			l0 = paylen - i; // last round
549		bcopy(payload, pkt->body + i, l0);
550	}
551	pkt->body[i-1] = '\0';
552	ip = &pkt->ip;
553
554	/* prepare the headers */
555        ip->ip_v = IPVERSION;
556        ip->ip_hl = 5;
557        ip->ip_id = 0;
558        ip->ip_tos = IPTOS_LOWDELAY;
559	ip->ip_len = ntohs(targ->g->pkt_size - sizeof(*eh));
560        ip->ip_id = 0;
561        ip->ip_off = htons(IP_DF); /* Don't fragment */
562        ip->ip_ttl = IPDEFTTL;
563	ip->ip_p = IPPROTO_UDP;
564	ip->ip_dst.s_addr = htonl(targ->g->dst_ip.start);
565	ip->ip_src.s_addr = htonl(targ->g->src_ip.start);
566	ip->ip_sum = wrapsum(checksum(ip, sizeof(*ip), 0));
567
568
569	udp = &pkt->udp;
570        udp->uh_sport = htons(targ->g->src_ip.port0);
571        udp->uh_dport = htons(targ->g->dst_ip.port0);
572	udp->uh_ulen = htons(paylen);
573	/* Magic: taken from sbin/dhclient/packet.c */
574	udp->uh_sum = wrapsum(checksum(udp, sizeof(*udp),
575                    checksum(pkt->body,
576                        paylen - sizeof(*udp),
577                        checksum(&ip->ip_src, 2 * sizeof(ip->ip_src),
578                            IPPROTO_UDP + (u_int32_t)ntohs(udp->uh_ulen)
579                        )
580                    )
581                ));
582
583	eh = &pkt->eh;
584	bcopy(&targ->g->src_mac.start, eh->ether_shost, 6);
585	bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6);
586	eh->ether_type = htons(ETHERTYPE_IP);
587
588	bzero(&pkt->vh, sizeof(pkt->vh));
589	// dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0);
590}
591
592
593
/*
 * Create and enqueue a batch of packets on a ring.
 *
 * At most 'count' slots are used, fewer if the ring has less room.
 * 'frame'/'size' describe the bytes to transmit; 'pkt' is the template
 * the frame lives in, which update_addresses() modifies between packets
 * in the copy modes.  'nfrags' is the number of consecutive slots that
 * make up one packet: all but the last of them get NS_MOREFRAG.
 * On the last slot of the batch set NS_REPORT to tell the driver to
 * generate an interrupt when done.
 *
 * Returns the number of slots filled; ring->head and ring->cur are
 * advanced past them.
 */
static int
send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
		int size, struct glob_arg *g, u_int count, int options,
		u_int nfrags)
{
	u_int n, sent, cur = ring->cur;
	u_int fcnt;	/* fragments still to emit for the current packet */

	n = nm_ring_space(ring);
	if (n < count)
		count = n;
	if (count < nfrags) {
		/* only a warning: the batch is sent anyway */
		D("truncating packet, no room for frags %d %d",
				count, nfrags);
	}
#if 0
	if (options & (OPT_COPY | OPT_PREFETCH) ) {
		for (sent = 0; sent < count; sent++) {
			struct netmap_slot *slot = &ring->slot[cur];
			char *p = NETMAP_BUF(ring, slot->buf_idx);

			__builtin_prefetch(p);
			cur = nm_ring_next(ring, cur);
		}
		cur = ring->cur;
	}
#endif
	for (fcnt = nfrags, sent = 0; sent < count; sent++) {
		struct netmap_slot *slot = &ring->slot[cur];
		char *p = NETMAP_BUF(ring, slot->buf_idx);

		slot->flags = 0;
		/* the options are tried in this order, first match wins */
		if (options & OPT_INDIRECT) {
			/* no copy: the slot refers to the caller's frame */
			slot->flags |= NS_INDIRECT;
			slot->ptr = (uint64_t)frame;
		} else if (options & OPT_COPY) {
			pkt_copy(frame, p, size);
			/* new addresses once per packet, on its first fragment */
			if (fcnt == nfrags)
				update_addresses(pkt, g);
		} else if (options & OPT_MEMCPY) {
			memcpy(p, frame, size);
			if (fcnt == nfrags)
				update_addresses(pkt, g);
		} else if (options & OPT_PREFETCH) {
			__builtin_prefetch(p);
		}
		if (options & OPT_DUMP)
			dump_payload(p, size, ring, cur);
		slot->len = size;
		if (--fcnt > 0)
			slot->flags |= NS_MOREFRAG;
		else
			fcnt = nfrags;	/* packet complete, start the next one */
		if (sent == count - 1) {
			/* last slot of the batch: it cannot have more fragments */
			slot->flags &= ~NS_MOREFRAG;
			slot->flags |= NS_REPORT;
		}
		cur = nm_ring_next(ring, cur);
	}
	ring->head = ring->cur = cur;

	return (sent);
}
662
663/*
664 * Send a packet, and wait for a response.
665 * The payload (after UDP header, ofs 42) has a 4-byte sequence
666 * followed by a struct timeval (or bintime?)
667 */
668#define	PAY_OFS	42	/* where in the pkt... */
669
670static void *
671pinger_body(void *data)
672{
673	struct targ *targ = (struct targ *) data;
674	struct pollfd fds[1];
675	struct netmap_if *nifp = targ->nifp;
676	int i, rx = 0, n = targ->g->npackets;
677	void *frame;
678	int size;
679
680	frame = &targ->pkt;
681	frame += sizeof(targ->pkt.vh) - targ->g->virt_header;
682	size = targ->g->pkt_size + targ->g->virt_header;
683
684	fds[0].fd = targ->fd;
685	fds[0].events = (POLLIN);
686	static uint32_t sent;
687	struct timespec ts, now, last_print;
688	uint32_t count = 0, min = 1000000000, av = 0;
689
690	if (targ->g->nthreads > 1) {
691		D("can only ping with 1 thread");
692		return NULL;
693	}
694
695	clock_gettime(CLOCK_REALTIME_PRECISE, &last_print);
696	now = last_print;
697	while (n == 0 || (int)sent < n) {
698		struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
699		struct netmap_slot *slot;
700		char *p;
701	    for (i = 0; i < 1; i++) { /* XXX why the loop for 1 pkt ? */
702		slot = &ring->slot[ring->cur];
703		slot->len = size;
704		p = NETMAP_BUF(ring, slot->buf_idx);
705
706		if (nm_ring_empty(ring)) {
707			D("-- ouch, cannot send");
708		} else {
709			pkt_copy(frame, p, size);
710			clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
711			bcopy(&sent, p+42, sizeof(sent));
712			bcopy(&ts, p+46, sizeof(ts));
713			sent++;
714			ring->head = ring->cur = nm_ring_next(ring, ring->cur);
715		}
716	    }
717		/* should use a parameter to decide how often to send */
718		if (poll(fds, 1, 3000) <= 0) {
719			D("poll error/timeout on queue %d: %s", targ->me,
720				strerror(errno));
721			continue;
722		}
723		/* see what we got back */
724		for (i = targ->qfirst; i < targ->qlast; i++) {
725			ring = NETMAP_RXRING(nifp, i);
726			while (!nm_ring_empty(ring)) {
727				uint32_t seq;
728				slot = &ring->slot[ring->cur];
729				p = NETMAP_BUF(ring, slot->buf_idx);
730
731				clock_gettime(CLOCK_REALTIME_PRECISE, &now);
732				bcopy(p+42, &seq, sizeof(seq));
733				bcopy(p+46, &ts, sizeof(ts));
734				ts.tv_sec = now.tv_sec - ts.tv_sec;
735				ts.tv_nsec = now.tv_nsec - ts.tv_nsec;
736				if (ts.tv_nsec < 0) {
737					ts.tv_nsec += 1000000000;
738					ts.tv_sec--;
739				}
740				if (1) D("seq %d/%d delta %d.%09d", seq, sent,
741					(int)ts.tv_sec, (int)ts.tv_nsec);
742				if (ts.tv_nsec < (int)min)
743					min = ts.tv_nsec;
744				count ++;
745				av += ts.tv_nsec;
746				ring->head = ring->cur = nm_ring_next(ring, ring->cur);
747				rx++;
748			}
749		}
750		//D("tx %d rx %d", sent, rx);
751		//usleep(100000);
752		ts.tv_sec = now.tv_sec - last_print.tv_sec;
753		ts.tv_nsec = now.tv_nsec - last_print.tv_nsec;
754		if (ts.tv_nsec < 0) {
755			ts.tv_nsec += 1000000000;
756			ts.tv_sec--;
757		}
758		if (ts.tv_sec >= 1) {
759			D("count %d min %d av %d",
760				count, min, av/count);
761			count = 0;
762			av = 0;
763			min = 100000000;
764			last_print = now;
765		}
766	}
767	return NULL;
768}
769
770
/*
 * Thread body that replies to ping requests.  data is the struct targ
 * of the thread; only one thread is supported.
 *
 * Every packet found on the RX rings is copied to TX ring 0 with the
 * first two 6-byte fields (the Ethernet destination and source
 * addresses) swapped.  Packets that arrive while TX ring 0 has no room
 * are consumed and dropped.  The loop ends after npackets replies
 * (never if npackets is 0).  Always returns NULL.
 */
static void *
ponger_body(void *data)
{
	struct targ *targ = (struct targ *) data;
	struct pollfd fds[1];
	struct netmap_if *nifp = targ->nifp;
	struct netmap_ring *txring, *rxring;
	int i, rx = 0, sent = 0, n = targ->g->npackets;
	fds[0].fd = targ->fd;
	fds[0].events = (POLLIN);

	if (targ->g->nthreads > 1) {
		D("can only reply ping with 1 thread");
		return NULL;
	}
	D("understood ponger %d but don't know how to do it", n);
	while (n == 0 || sent < n) {
		uint32_t txcur, txavail;
//#define BUSYWAIT
#ifdef BUSYWAIT
		ioctl(fds[0].fd, NIOCRXSYNC, NULL);
#else
		if (poll(fds, 1, 1000) <= 0) {
			D("poll error/timeout on queue %d: %s", targ->me,
				strerror(errno));
			continue;
		}
#endif
		/* snapshot of the TX ring; published again after the scan */
		txring = NETMAP_TXRING(nifp, 0);
		txcur = txring->cur;
		txavail = nm_ring_space(txring);
		/* see what we got back */
		for (i = targ->qfirst; i < targ->qlast; i++) {
			rxring = NETMAP_RXRING(nifp, i);
			while (!nm_ring_empty(rxring)) {
				uint16_t *spkt, *dpkt;
				uint32_t cur = rxring->cur;
				struct netmap_slot *slot = &rxring->slot[cur];
				char *src, *dst;
				src = NETMAP_BUF(rxring, slot->buf_idx);
				//D("got pkt %p of size %d", src, slot->len);
				/* the RX slot is released before the copy below */
				rxring->head = rxring->cur = nm_ring_next(rxring, cur);
				rx++;
				/* no room to reply: the request is dropped */
				if (txavail == 0)
					continue;
				dst = NETMAP_BUF(txring,
				    txring->slot[txcur].buf_idx);
				/* copy... */
				dpkt = (uint16_t *)dst;
				spkt = (uint16_t *)src;
				pkt_copy(src, dst, slot->len);
				/* ...then swap the two MAC addresses, 16 bits at a time */
				dpkt[0] = spkt[3];
				dpkt[1] = spkt[4];
				dpkt[2] = spkt[5];
				dpkt[3] = spkt[0];
				dpkt[4] = spkt[1];
				dpkt[5] = spkt[2];
				txring->slot[txcur].len = slot->len;
				/* XXX swap src dst mac */
				txcur = nm_ring_next(txring, txcur);
				txavail--;
				sent++;
			}
		}
		txring->head = txring->cur = txcur;
		targ->count = sent;
#ifdef BUSYWAIT
		ioctl(fds[0].fd, NIOCTXSYNC, NULL);
#endif
		//D("tx %d rx %d", sent, rx);
	}
	return NULL;
}
846}
847
/* Return 1 if time a is later than or equal to time b, 0 otherwise. */
static __inline int
timespec_ge(const struct timespec *a, const struct timespec *b)
{
	/* the seconds decide unless they are equal */
	if (a->tv_sec != b->tv_sec)
		return (a->tv_sec > b->tv_sec ? 1 : 0);
	return (a->tv_nsec >= b->tv_nsec ? 1 : 0);
}
860
/* Convert a struct timeval (microseconds) to a struct timespec (nanoseconds). */
static __inline struct timespec
timeval2spec(const struct timeval *a)
{
	struct timespec out = { 0 };

	out.tv_sec = a->tv_sec;
	out.tv_nsec = a->tv_usec * 1000;
	return out;
}
870
/*
 * Convert a struct timespec to a struct timeval; the nanoseconds are
 * divided by 1000, dropping the remainder.
 */
static __inline struct timeval
timespec2val(const struct timespec *a)
{
	struct timeval out = { 0 };

	out.tv_sec = a->tv_sec;
	out.tv_usec = a->tv_nsec / 1000;
	return out;
}
880
881
/*
 * Return a + b.  One second is carried over when the nanoseconds
 * reach 1000000000.
 */
static __inline struct timespec
timespec_add(struct timespec a, struct timespec b)
{
	struct timespec sum = {
		.tv_sec = a.tv_sec + b.tv_sec,
		.tv_nsec = a.tv_nsec + b.tv_nsec
	};

	if (sum.tv_nsec >= 1000000000) {
		sum.tv_nsec -= 1000000000;
		sum.tv_sec++;
	}
	return sum;
}
892
/*
 * Return a - b.  One second is borrowed when the nanoseconds would go
 * negative, so tv_nsec of the result is never below 0 and the sign
 * of the difference is carried by tv_sec.
 */
static __inline struct timespec
timespec_sub(struct timespec a, struct timespec b)
{
	struct timespec diff = {
		.tv_sec = a.tv_sec - b.tv_sec,
		.tv_nsec = a.tv_nsec - b.tv_nsec
	};

	if (diff.tv_nsec < 0) {
		diff.tv_nsec += 1000000000;
		diff.tv_sec--;
	}
	return diff;
}
903
904
905/*
906 * wait until ts, either busy or sleeping if more than 1ms.
907 * Return wakeup time.
908 */
909static struct timespec
910wait_time(struct timespec ts)
911{
912	for (;;) {
913		struct timespec w, cur;
914		clock_gettime(CLOCK_REALTIME_PRECISE, &cur);
915		w = timespec_sub(ts, cur);
916		if (w.tv_sec < 0)
917			return cur;
918		else if (w.tv_sec > 0 || w.tv_nsec > 1000000)
919			poll(NULL, 0, 1);
920	}
921}
922
/*
 * Thread body for the "tx" function.  data is the struct targ of the
 * thread.
 *
 * Sends npackets / nthreads packets (no limit if that is 0), or until
 * targ->cancel is set, through the backend selected by dev_type:
 * write() for DEV_TAP, pcap_inject() for DEV_PCAP, the netmap TX rings
 * qfirst .. qlast-1 otherwise.  Only the netmap path honours tx_rate.
 * Progress is published in targ->count; on normal completion targ->toc
 * and targ->completed are set.  targ->used is cleared on every exit
 * path.  Always returns NULL.
 */
static void *
sender_body(void *data)
{
	struct targ *targ = (struct targ *) data;

	struct pollfd fds[1];
	struct netmap_if *nifp = targ->nifp;
	struct netmap_ring *txring;
	int i, n = targ->g->npackets / targ->g->nthreads, sent = 0;
	/* start in copy mode regardless of the options; see "drop copy" below */
	int options = targ->g->options | OPT_COPY;
	struct timespec nexttime = { 0, 0}; // XXX silence compiler
	int rate_limit = targ->g->tx_rate;
	struct pkt *pkt = &targ->pkt;
	void *frame;
	int size;

	/* the frame starts inside the virt header, skipping its unused part */
	frame = pkt;
	frame += sizeof(pkt->vh) - targ->g->virt_header;
	size = targ->g->pkt_size + targ->g->virt_header;

	D("start");
	if (setaffinity(targ->thread, targ->affinity))
		goto quit;
	/* setup poll(2) mechanism. */
	memset(fds, 0, sizeof(fds));
	fds[0].fd = targ->fd;
	fds[0].events = (POLLOUT);

	/* main loop.*/
	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
	if (rate_limit) {
		/* start on a whole second, between 1 and 2 seconds from now */
		targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
		targ->tic.tv_nsec = 0;
		wait_time(targ->tic);
		nexttime = targ->tic;
	}
    if (targ->g->dev_type == DEV_TAP) {
	    D("writing to file desc %d", targ->g->main_fd);

	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
		if (write(targ->g->main_fd, frame, size) != -1)
			sent++;
		update_addresses(pkt, targ->g);
		/* publish the counter only every 10000 or so iterations */
		if (i > 10000) {
			targ->count = sent;
			i = 0;
		}
	    }
#ifndef NO_PCAP
    } else if (targ->g->dev_type == DEV_PCAP) {
	    pcap_t *p = targ->g->p;

	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
		if (pcap_inject(p, frame, size) != -1)
			sent++;
		update_addresses(pkt, targ->g);
		if (i > 10000) {
			targ->count = sent;
			i = 0;
		}
	    }
#endif /* NO_PCAP */
    } else {
	int tosend = 0;	/* packets left in the current rate-limited burst */
	int frags = targ->g->frags;

	while (!targ->cancel && (n == 0 || sent < n)) {

		if (rate_limit && tosend <= 0) {
			/* burst finished: wait for the start of the next period */
			tosend = targ->g->burst;
			nexttime = timespec_add(nexttime, targ->g->tx_period);
			wait_time(nexttime);
		}

		/*
		 * wait for available room in the send queue(s)
		 */
		if (poll(fds, 1, 2000) <= 0) {
			if (targ->cancel)
				break;
			D("poll error/timeout on queue %d: %s", targ->me,
				strerror(errno));
			goto quit;
		}
		if (fds[0].revents & POLLERR) {
			D("poll error");
			goto quit;
		}
		/*
		 * scan our queues and send on those with room
		 */
		/*
		 * After the first 100000 packets stop copying, unless
		 * the user asked for OPT_COPY explicitly.
		 */
		if (options & OPT_COPY && sent > 100000 && !(targ->g->options & OPT_COPY) ) {
			D("drop copy");
			options &= ~OPT_COPY;
		}
		for (i = targ->qfirst; i < targ->qlast; i++) {
			int m, limit = rate_limit ?  tosend : targ->g->burst;
			if (n > 0 && n - sent < limit)
				limit = n - sent;
			txring = NETMAP_TXRING(nifp, i);
			if (nm_ring_empty(txring))
				continue;
			/* round up to a whole number of packets */
			if (frags > 1)
				limit = ((limit + frags - 1) / frags) * frags;

			m = send_packets(txring, pkt, frame, size, targ->g,
					 limit, options, frags);
			ND("limit %d tail %d frags %d m %d",
				limit, txring->tail, frags, m);
			sent += m;
			targ->count = sent;
			if (rate_limit) {
				tosend -= m;
				if (tosend <= 0)
					break;
			}
		}
	}
	/* flush any remaining packets */
	ioctl(fds[0].fd, NIOCTXSYNC, NULL);

	/* final part: wait all the TX queues to be empty. */
	for (i = targ->qfirst; i < targ->qlast; i++) {
		txring = NETMAP_TXRING(nifp, i);
		while (nm_tx_pending(txring)) {
			ioctl(fds[0].fd, NIOCTXSYNC, NULL);
			usleep(1); /* wait 1 tick */
		}
	}
    } /* end DEV_NETMAP */

	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
	targ->completed = 1;
	targ->count = sent;

quit:
	/* reset the ``used`` flag. */
	targ->used = 0;

	return (NULL);
}
1064
1065
1066#ifndef NO_PCAP
1067static void
1068receive_pcap(u_char *user, const struct pcap_pkthdr * h,
1069	const u_char * bytes)
1070{
1071	int *count = (int *)user;
1072	(void)h;	/* UNUSED */
1073	(void)bytes;	/* UNUSED */
1074	(*count)++;
1075}
1076#endif /* !NO_PCAP */
1077
1078static int
1079receive_packets(struct netmap_ring *ring, u_int limit, int dump)
1080{
1081	u_int cur, rx, n;
1082
1083	cur = ring->cur;
1084	n = nm_ring_space(ring);
1085	if (n < limit)
1086		limit = n;
1087	for (rx = 0; rx < limit; rx++) {
1088		struct netmap_slot *slot = &ring->slot[cur];
1089		char *p = NETMAP_BUF(ring, slot->buf_idx);
1090
1091		if (dump)
1092			dump_payload(p, slot->len, ring, cur);
1093
1094		cur = nm_ring_next(ring, cur);
1095	}
1096	ring->head = ring->cur = cur;
1097
1098	return (rx);
1099}
1100
1101static void *
1102receiver_body(void *data)
1103{
1104	struct targ *targ = (struct targ *) data;
1105	struct pollfd fds[1];
1106	struct netmap_if *nifp = targ->nifp;
1107	struct netmap_ring *rxring;
1108	int i;
1109	uint64_t received = 0;
1110
1111	if (setaffinity(targ->thread, targ->affinity))
1112		goto quit;
1113
1114	/* setup poll(2) mechanism. */
1115	memset(fds, 0, sizeof(fds));
1116	fds[0].fd = targ->fd;
1117	fds[0].events = (POLLIN);
1118
1119	/* unbounded wait for the first packet. */
1120	for (;;) {
1121		i = poll(fds, 1, 1000);
1122		if (i > 0 && !(fds[0].revents & POLLERR))
1123			break;
1124		RD(1, "waiting for initial packets, poll returns %d %d", i, fds[0].revents);
1125	}
1126
1127	/* main loop, exit after 1s silence */
1128	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1129    if (targ->g->dev_type == DEV_TAP) {
1130	D("reading from %s fd %d", targ->g->ifname, targ->g->main_fd);
1131	while (!targ->cancel) {
1132		char buf[2048];
1133		/* XXX should we poll ? */
1134		if (read(targ->g->main_fd, buf, sizeof(buf)) > 0)
1135			targ->count++;
1136	}
1137#ifndef NO_PCAP
1138    } else if (targ->g->dev_type == DEV_PCAP) {
1139	while (!targ->cancel) {
1140		/* XXX should we poll ? */
1141		pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, NULL);
1142	}
1143#endif /* !NO_PCAP */
1144    } else {
1145	int dump = targ->g->options & OPT_DUMP;
1146	while (!targ->cancel) {
1147		/* Once we started to receive packets, wait at most 1 seconds
1148		   before quitting. */
1149		if (poll(fds, 1, 1 * 1000) <= 0 && !targ->g->forever) {
1150			clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1151			targ->toc.tv_sec -= 1; /* Subtract timeout time. */
1152			break;
1153		}
1154
1155		if (fds[0].revents & POLLERR) {
1156			D("poll err");
1157			goto quit;
1158		}
1159
1160		for (i = targ->qfirst; i < targ->qlast; i++) {
1161			int m;
1162
1163			rxring = NETMAP_RXRING(nifp, i);
1164			if (nm_ring_empty(rxring))
1165				continue;
1166
1167			m = receive_packets(rxring, targ->g->burst, dump);
1168			received += m;
1169		}
1170		targ->count = received;
1171
1172		// tell the card we have read the data
1173		//ioctl(fds[0].fd, NIOCRXSYNC, NULL);
1174	}
1175    }
1176
1177	targ->completed = 1;
1178	targ->count = received;
1179
1180quit:
1181	/* reset the ``used`` flag. */
1182	targ->used = 0;
1183
1184	return (NULL);
1185}
1186
/*
 * Very crude code to print a number in normalized form.
 *
 * Scales 'val' down by factors of 1000 and formats it into 'buf' as
 * "<value with two decimals> <unit>", where the unit is one of
 * "", "K", "M" or "G". Values too large for the "G" unit are left
 * expressed in "G" (e.g. 1e12 -> "1000.00 G") instead of indexing
 * past the end of the unit table.
 *
 * The caller has to make sure that the buffer is large enough; no
 * size is passed in, so the output cannot be bounded here.
 *
 * Returns 'buf' so the call can be used directly as a printf argument.
 */
static const char *
norm(char *buf, double val)
{
	static const char *const units[] = { "", "K", "M", "G" };
	/* index of the largest unit: scaling must stop here */
	const size_t last = sizeof(units) / sizeof(units[0]) - 1;
	size_t i;

	for (i = 0; val >= 1000 && i < last; i++)
		val /= 1000;
	sprintf(buf, "%.2f %s", val, units[i]);
	return buf;
}
1201
/*
 * Print the transmit summary: the number of packets sent, their size
 * and the elapsed time, followed by the packet rate and the bandwidth
 * (both payload-only and "raw", i.e. including per-frame overhead).
 *
 * sent:  total number of packets transmitted
 * size:  packet size in bytes as configured by the user
 * delta: elapsed time in seconds
 */
static void
tx_output(uint64_t sent, int size, double delta)
{
	char pps_str[40], bw_str[80], raw_str[80];
	double secs, rate, payload_bps, wire_bps;
	int frame;

	printf("Sent %" PRIu64 " packets, %d bytes each, in %.2f seconds.\n",
	       sent, size, delta);

	/* avoid dividing by zero when no time has elapsed */
	secs = (delta == 0) ? 1e-6 : delta;
	/* correct for min packet size */
	frame = (size < 60) ? 60 : size;

	rate = sent / secs;
	payload_bps = (8.0 * frame * sent) / secs;
	/* raw packets have 4 bytes crc + 20 bytes framing */
	wire_bps = (8.0 * (frame + 24) * sent) / secs;

	norm(pps_str, rate);
	norm(bw_str, payload_bps);
	norm(raw_str, wire_bps);
	printf("Speed: %spps Bandwidth: %sbps (raw %sbps)\n",
		pps_str, bw_str, raw_str);
}
1222
1223
/*
 * Print the receive summary: the number of packets received and the
 * elapsed time, followed by the resulting packet rate.
 *
 * received: total number of packets received
 * delta:    elapsed time in seconds
 */
static void
rx_output(uint64_t received, double delta)
{
	char rate_str[40];
	double secs;

	printf("Received %" PRIu64 " packets, in %.2f seconds.\n", received, delta);

	/* avoid dividing by zero when no time has elapsed */
	secs = (delta == 0) ? 1e-6 : delta;
	norm(rate_str, received / secs);
	printf("Speed: %spps\n", rate_str);
}
1237
/*
 * Print the command line help on stderr and terminate the program.
 *
 * The list below mirrors the option string passed to getopt() in
 * main(); keep the two in sync when adding or removing options.
 *
 * Note that the process exits with status 0 even when this is reached
 * from an error path; that historical behavior is preserved.
 */
static void
usage(void)
{
	const char *cmd = "pkt-gen";
	fprintf(stderr,
		"Usage:\n"
		"%s arguments\n"
		"\t-i interface		interface name (a tap: pcap: or netmap: prefix forces the type)\n"
		"\t-f function		tx rx ping pong\n"
		"\t-n count		number of iterations (can be 0)\n"
		"\t-t pkts_to_send		also forces tx mode\n"
		"\t-r pkts_to_receive	also forces rx mode\n"
		"\t-l pkt_size		in bytes excluding CRC\n"
		"\t-d dst_ip[:port[-dst_ip:port]]   single or range\n"
		"\t-s src_ip[:port[-src_ip:port]]   single or range\n"
		"\t-D dst-mac\n"
		"\t-S src-mac\n"
		"\t-a cpu_id		use setaffinity\n"
		"\t-b burst size		testing, mostly\n"
		"\t-c cores		cores to use\n"
		"\t-p threads		processes/threads to use\n"
		"\t-T report_ms		milliseconds between reports\n"
		"\t-w wait_for_link_time	in seconds\n"
		"\t-W			exit rx after 1 second without traffic\n"
		"\t-R rate		in packets per second\n"
		"\t-F frags		number of fragments [1..63]\n"
		"\t-o options		data generation options (numeric)\n"
		"\t-I			use indirect buffers\n"
		"\t-X			dump payload\n"
		"\t-C config		netmap ring configuration string\n"
		"\t-H len		add empty virtio-net-header with size 'len'\n"
		"\t-h			use host ring\n"
		"\t-v			increase verbosity\n"
		"",
		cmd);

	exit(0);
}
1271
1272static void
1273start_threads(struct glob_arg *g)
1274{
1275	int i;
1276
1277	targs = calloc(g->nthreads, sizeof(*targs));
1278	/*
1279	 * Now create the desired number of threads, each one
1280	 * using a single descriptor.
1281 	 */
1282	for (i = 0; i < g->nthreads; i++) {
1283		bzero(&targs[i], sizeof(targs[i]));
1284		targs[i].fd = -1; /* default, with pcap */
1285		targs[i].g = g;
1286
1287	    if (g->dev_type == DEV_NETMAP) {
1288		struct nmreq tifreq;
1289		int tfd;
1290
1291		/* register interface. */
1292		tfd = open("/dev/netmap", O_RDWR);
1293		if (tfd == -1) {
1294			D("Unable to open /dev/netmap: %s", strerror(errno));
1295			continue;
1296		}
1297		targs[i].fd = tfd;
1298
1299		bzero(&tifreq, sizeof(tifreq));
1300		strncpy(tifreq.nr_name, g->ifname, sizeof(tifreq.nr_name));
1301		tifreq.nr_version = NETMAP_API;
1302		if (g->host_ring) {
1303			tifreq.nr_ringid = NETMAP_SW_RING;
1304		} else {
1305			tifreq.nr_ringid = (g->nthreads > 1) ? (i | NETMAP_HW_RING) : 0;
1306		}
1307		parse_nmr_config(g->nmr_config, &tifreq);
1308
1309		/*
1310		 * if we are acting as a receiver only, do not touch the transmit ring.
1311		 * This is not the default because many apps may use the interface
1312		 * in both directions, but a pure receiver does not.
1313		 */
1314		if (g->td_body == receiver_body) {
1315			tifreq.nr_ringid |= NETMAP_NO_TX_POLL;
1316		}
1317
1318		if ((ioctl(tfd, NIOCREGIF, &tifreq)) == -1) {
1319			D("Unable to register %s: %s", g->ifname, strerror(errno));
1320			continue;
1321		}
1322		D("memsize is %d MB", tifreq.nr_memsize >> 20);
1323		targs[i].nmr = tifreq;
1324		targs[i].nifp = NETMAP_IF(g->mmap_addr, tifreq.nr_offset);
1325		D("nifp flags 0x%x", targs[i].nifp->ni_flags);
1326		/* start threads. */
1327		if (g->host_ring) {
1328			targs[i].qfirst = (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings);
1329			targs[i].qlast = targs[i].qfirst + 1;
1330		} else {
1331			targs[i].qfirst = (g->nthreads > 1) ? i : 0;
1332			targs[i].qlast = (g->nthreads > 1) ? i+1 :
1333				(g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings);
1334		}
1335	    } else {
1336		targs[i].fd = g->main_fd;
1337	    }
1338		targs[i].used = 1;
1339		targs[i].me = i;
1340		if (g->affinity >= 0) {
1341			if (g->affinity < g->cpus)
1342				targs[i].affinity = g->affinity;
1343			else
1344				targs[i].affinity = i % g->cpus;
1345		} else
1346			targs[i].affinity = -1;
1347		/* default, init packets */
1348		initialize_packet(&targs[i]);
1349
1350		if (pthread_create(&targs[i].thread, NULL, g->td_body,
1351				   &targs[i]) == -1) {
1352			D("Unable to create thread %d: %s", i, strerror(errno));
1353			targs[i].used = 0;
1354		}
1355	}
1356}
1357
1358static void
1359main_thread(struct glob_arg *g)
1360{
1361	int i;
1362
1363	uint64_t prev = 0;
1364	uint64_t count = 0;
1365	double delta_t;
1366	struct timeval tic, toc;
1367
1368	gettimeofday(&toc, NULL);
1369	for (;;) {
1370		struct timeval now, delta;
1371		uint64_t pps, usec, my_count, npkts;
1372		int done = 0;
1373
1374		delta.tv_sec = g->report_interval/1000;
1375		delta.tv_usec = (g->report_interval%1000)*1000;
1376		select(0, NULL, NULL, NULL, &delta);
1377		gettimeofday(&now, NULL);
1378		time_second = now.tv_sec;
1379		timersub(&now, &toc, &toc);
1380		my_count = 0;
1381		for (i = 0; i < g->nthreads; i++) {
1382			my_count += targs[i].count;
1383			if (targs[i].used == 0)
1384				done++;
1385		}
1386		usec = toc.tv_sec* 1000000 + toc.tv_usec;
1387		if (usec < 10000)
1388			continue;
1389		npkts = my_count - prev;
1390		pps = (npkts*1000000 + usec/2) / usec;
1391		D("%" PRIu64 " pps (%" PRIu64 " pkts in %" PRIu64 " usec)",
1392			pps, npkts, usec);
1393		prev = my_count;
1394		toc = now;
1395		if (done == g->nthreads)
1396			break;
1397	}
1398
1399	timerclear(&tic);
1400	timerclear(&toc);
1401	for (i = 0; i < g->nthreads; i++) {
1402		struct timespec t_tic, t_toc;
1403		/*
1404		 * Join active threads, unregister interfaces and close
1405		 * file descriptors.
1406		 */
1407		if (targs[i].used)
1408			pthread_join(targs[i].thread, NULL);
1409		close(targs[i].fd);
1410
1411		if (targs[i].completed == 0)
1412			D("ouch, thread %d exited with error", i);
1413
1414		/*
1415		 * Collect threads output and extract information about
1416		 * how long it took to send all the packets.
1417		 */
1418		count += targs[i].count;
1419		t_tic = timeval2spec(&tic);
1420		t_toc = timeval2spec(&toc);
1421		if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic))
1422			tic = timespec2val(&targs[i].tic);
1423		if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc))
1424			toc = timespec2val(&targs[i].toc);
1425	}
1426
1427	/* print output. */
1428	timersub(&toc, &tic, &toc);
1429	delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
1430	if (g->td_body == sender_body)
1431		tx_output(count, g->pkt_size, delta_t);
1432	else
1433		rx_output(count, delta_t);
1434
1435	if (g->dev_type == DEV_NETMAP) {
1436		munmap(g->mmap_addr, g->mmap_size);
1437		close(g->main_fd);
1438	}
1439}
1440
1441
/*
 * Entry of the table mapping a function name given on the command
 * line to the thread body implementing it.
 */
struct sf {
	const char *key;	/* function name; NULL in the terminating entry */
	void *f;		/* thread body, stored as a generic pointer */
};
1446
1447static struct sf func[] = {
1448	{ "tx",	sender_body },
1449	{ "rx",	receiver_body },
1450	{ "ping",	pinger_body },
1451	{ "pong",	ponger_body },
1452	{ NULL, NULL }
1453};
1454
1455static int
1456tap_alloc(char *dev)
1457{
1458	struct ifreq ifr;
1459	int fd, err;
1460	char *clonedev = TAP_CLONEDEV;
1461
1462	(void)err;
1463	(void)dev;
1464	/* Arguments taken by the function:
1465	 *
1466	 * char *dev: the name of an interface (or '\0'). MUST have enough
1467	 *   space to hold the interface name if '\0' is passed
1468	 * int flags: interface flags (eg, IFF_TUN etc.)
1469	 */
1470
1471#ifdef __FreeBSD__
1472	if (dev[3]) { /* tapSomething */
1473		static char buf[128];
1474		snprintf(buf, sizeof(buf), "/dev/%s", dev);
1475		clonedev = buf;
1476	}
1477#endif
1478	/* open the device */
1479	if( (fd = open(clonedev, O_RDWR)) < 0 ) {
1480		return fd;
1481	}
1482	D("%s open successful", clonedev);
1483
1484	/* preparation of the struct ifr, of type "struct ifreq" */
1485	memset(&ifr, 0, sizeof(ifr));
1486
1487#ifdef linux
1488	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
1489
1490	if (*dev) {
1491		/* if a device name was specified, put it in the structure; otherwise,
1492		* the kernel will try to allocate the "next" device of the
1493		* specified type */
1494		strncpy(ifr.ifr_name, dev, IFNAMSIZ);
1495	}
1496
1497	/* try to create the device */
1498	if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) {
1499		D("failed to to a TUNSETIFF: %s", strerror(errno));
1500		close(fd);
1501		return err;
1502	}
1503
1504	/* if the operation was successful, write back the name of the
1505	* interface to the variable "dev", so the caller can know
1506	* it. Note that the caller MUST reserve space in *dev (see calling
1507	* code below) */
1508	strcpy(dev, ifr.ifr_name);
1509	D("new name is %s", dev);
1510#endif /* linux */
1511
1512        /* this is the special file descriptor that the caller will use to talk
1513         * with the virtual interface */
1514        return fd;
1515}
1516
1517int
1518main(int arc, char **argv)
1519{
1520	int i;
1521
1522	struct glob_arg g;
1523
1524	struct nmreq nmr;
1525	int ch;
1526	int wait_link = 2;
1527	int devqueues = 1;	/* how many device queues */
1528
1529	bzero(&g, sizeof(g));
1530
1531	g.main_fd = -1;
1532	g.td_body = receiver_body;
1533	g.report_interval = 1000;	/* report interval */
1534	g.affinity = -1;
1535	/* ip addresses can also be a range x.x.x.x-x.x.x.y */
1536	g.src_ip.name = "10.0.0.1";
1537	g.dst_ip.name = "10.1.0.1";
1538	g.dst_mac.name = "ff:ff:ff:ff:ff:ff";
1539	g.src_mac.name = NULL;
1540	g.pkt_size = 60;
1541	g.burst = 512;		// default
1542	g.nthreads = 1;
1543	g.cpus = 1;
1544	g.forever = 1;
1545	g.tx_rate = 0;
1546	g.frags = 1;
1547	g.nmr_config = "";
1548	g.virt_header = 0;
1549
1550	while ( (ch = getopt(arc, argv,
1551			"a:f:F:n:i:It:r:l:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:h")) != -1) {
1552		struct sf *fn;
1553
1554		switch(ch) {
1555		default:
1556			D("bad option %c %s", ch, optarg);
1557			usage();
1558			break;
1559
1560		case 'n':
1561			g.npackets = atoi(optarg);
1562			break;
1563
1564		case 'F':
1565			i = atoi(optarg);
1566			if (i < 1 || i > 63) {
1567				D("invalid frags %d [1..63], ignore", i);
1568				break;
1569			}
1570			g.frags = i;
1571			break;
1572
1573		case 'f':
1574			for (fn = func; fn->key; fn++) {
1575				if (!strcmp(fn->key, optarg))
1576					break;
1577			}
1578			if (fn->key)
1579				g.td_body = fn->f;
1580			else
1581				D("unrecognised function %s", optarg);
1582			break;
1583
1584		case 'o':	/* data generation options */
1585			g.options = atoi(optarg);
1586			break;
1587
1588		case 'a':       /* force affinity */
1589			g.affinity = atoi(optarg);
1590			break;
1591
1592		case 'i':	/* interface */
1593			/* a prefix of tap: netmap: or pcap: forces the mode.
1594			 * otherwise we guess
1595			 */
1596			D("interface is %s", optarg);
1597			g.ifname = optarg;
1598			if (!strcmp(optarg, "null")) {
1599				g.dev_type = DEV_NETMAP;
1600				g.dummy_send = 1;
1601			} else if (!strncmp(optarg, "tap:", 4)) {
1602				g.dev_type = DEV_TAP;
1603				g.ifname = optarg + 4;
1604			} else if (!strncmp(optarg, "pcap:", 5)) {
1605				g.dev_type = DEV_PCAP;
1606				g.ifname = optarg + 5;
1607			} else if (!strncmp(optarg, "netmap:", 7)) {
1608				g.dev_type = DEV_NETMAP;
1609				g.ifname = optarg + 7;
1610			} else if (!strncmp(optarg, "tap", 3)) {
1611				g.dev_type = DEV_TAP;
1612			} else {
1613				g.dev_type = DEV_NETMAP;
1614			}
1615			break;
1616
1617		case 'I':
1618			g.options |= OPT_INDIRECT;	/* XXX use indirect buffer */
1619			break;
1620
1621		case 't':	/* send, deprecated */
1622			D("-t deprecated, please use -f tx -n %s", optarg);
1623			g.td_body = sender_body;
1624			g.npackets = atoi(optarg);
1625			break;
1626
1627		case 'r':	/* receive */
1628			D("-r deprecated, please use -f rx -n %s", optarg);
1629			g.td_body = receiver_body;
1630			g.npackets = atoi(optarg);
1631			break;
1632
1633		case 'l':	/* pkt_size */
1634			g.pkt_size = atoi(optarg);
1635			break;
1636
1637		case 'd':
1638			g.dst_ip.name = optarg;
1639			break;
1640
1641		case 's':
1642			g.src_ip.name = optarg;
1643			break;
1644
1645		case 'T':	/* report interval */
1646			g.report_interval = atoi(optarg);
1647			break;
1648
1649		case 'w':
1650			wait_link = atoi(optarg);
1651			break;
1652
1653		case 'W': /* XXX changed default */
1654			g.forever = 0; /* do not exit rx even with no traffic */
1655			break;
1656
1657		case 'b':	/* burst */
1658			g.burst = atoi(optarg);
1659			break;
1660		case 'c':
1661			g.cpus = atoi(optarg);
1662			break;
1663		case 'p':
1664			g.nthreads = atoi(optarg);
1665			break;
1666
1667		case 'D': /* destination mac */
1668			g.dst_mac.name = optarg;
1669			break;
1670
1671		case 'S': /* source mac */
1672			g.src_mac.name = optarg;
1673			break;
1674		case 'v':
1675			verbose++;
1676			break;
1677		case 'R':
1678			g.tx_rate = atoi(optarg);
1679			break;
1680		case 'X':
1681			g.options |= OPT_DUMP;
1682			break;
1683		case 'C':
1684			g.nmr_config = strdup(optarg);
1685			break;
1686		case 'H':
1687			g.virt_header = atoi(optarg);
1688			break;
1689		case 'h':
1690			g.host_ring = 1;
1691			break;
1692		}
1693	}
1694
1695	if (g.ifname == NULL) {
1696		D("missing ifname");
1697		usage();
1698	}
1699
1700	i = system_ncpus();
1701	if (g.cpus < 0 || g.cpus > i) {
1702		D("%d cpus is too high, have only %d cpus", g.cpus, i);
1703		usage();
1704	}
1705	if (g.cpus == 0)
1706		g.cpus = i;
1707
1708	if (g.pkt_size < 16 || g.pkt_size > 1536) {
1709		D("bad pktsize %d\n", g.pkt_size);
1710		usage();
1711	}
1712
1713	if (g.src_mac.name == NULL) {
1714		static char mybuf[20] = "00:00:00:00:00:00";
1715		/* retrieve source mac address. */
1716		if (source_hwaddr(g.ifname, mybuf) == -1) {
1717			D("Unable to retrieve source mac");
1718			// continue, fail later
1719		}
1720		g.src_mac.name = mybuf;
1721	}
1722	/* extract address ranges */
1723	extract_ip_range(&g.src_ip);
1724	extract_ip_range(&g.dst_ip);
1725	extract_mac_range(&g.src_mac);
1726	extract_mac_range(&g.dst_mac);
1727
1728	if (g.src_ip.start != g.src_ip.end ||
1729	    g.src_ip.port0 != g.src_ip.port1 ||
1730	    g.dst_ip.start != g.dst_ip.end ||
1731	    g.dst_ip.port0 != g.dst_ip.port1)
1732		g.options |= OPT_COPY;
1733
1734	if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1
1735			&& g.virt_header != VIRT_HDR_2) {
1736		D("bad virtio-net-header length");
1737		usage();
1738	}
1739
1740    if (g.dev_type == DEV_TAP) {
1741	D("want to use tap %s", g.ifname);
1742	g.main_fd = tap_alloc(g.ifname);
1743	if (g.main_fd < 0) {
1744		D("cannot open tap %s", g.ifname);
1745		usage();
1746	}
1747#ifndef NO_PCAP
1748    } else if (g.dev_type == DEV_PCAP) {
1749	char pcap_errbuf[PCAP_ERRBUF_SIZE];
1750
1751	D("using pcap on %s", g.ifname);
1752	pcap_errbuf[0] = '\0'; // init the buffer
1753	g.p = pcap_open_live(g.ifname, 0, 1, 100, pcap_errbuf);
1754	if (g.p == NULL) {
1755		D("cannot open pcap on %s", g.ifname);
1756		usage();
1757	}
1758#endif /* !NO_PCAP */
1759    } else if (g.dummy_send) { /* but DEV_NETMAP */
1760	D("using a dummy send routine");
1761    } else {
1762	bzero(&nmr, sizeof(nmr));
1763	nmr.nr_version = NETMAP_API;
1764	/*
1765	 * Open the netmap device to fetch the number of queues of our
1766	 * interface.
1767	 *
1768	 * The first NIOCREGIF also detaches the card from the
1769	 * protocol stack and may cause a reset of the card,
1770	 * which in turn may take some time for the PHY to
1771	 * reconfigure.
1772	 */
1773	g.main_fd = open("/dev/netmap", O_RDWR);
1774	if (g.main_fd == -1) {
1775		D("Unable to open /dev/netmap: %s", strerror(errno));
1776		// fail later
1777	}
1778	/*
1779	 * Register the interface on the netmap device: from now on,
1780	 * we can operate on the network interface without any
1781	 * interference from the legacy network stack.
1782	 *
1783	 * We decide to put the first interface registration here to
1784	 * give time to cards that take a long time to reset the PHY.
1785	 */
1786	bzero(&nmr, sizeof(nmr));
1787	nmr.nr_version = NETMAP_API;
1788	strncpy(nmr.nr_name, g.ifname, sizeof(nmr.nr_name));
1789	parse_nmr_config(g.nmr_config, &nmr);
1790	if (ioctl(g.main_fd, NIOCREGIF, &nmr) == -1) {
1791		D("Unable to register interface %s: %s", g.ifname, strerror(errno));
1792		//continue, fail later
1793	}
1794	ND("%s: txr %d txd %d rxr %d rxd %d", g.ifname,
1795			nmr.nr_tx_rings, nmr.nr_tx_slots,
1796			nmr.nr_rx_rings, nmr.nr_rx_slots);
1797	devqueues = nmr.nr_rx_rings;
1798
1799	/* validate provided nthreads. */
1800	if (g.nthreads < 1 || g.nthreads > devqueues) {
1801		D("bad nthreads %d, have %d queues", g.nthreads, devqueues);
1802		// continue, fail later
1803	}
1804
1805	/*
1806	 * Map the netmap shared memory: instead of issuing mmap()
1807	 * inside the body of the threads, we prefer to keep this
1808	 * operation here to simplify the thread logic.
1809	 */
1810	D("mapping %d Kbytes", nmr.nr_memsize>>10);
1811	g.mmap_size = nmr.nr_memsize;
1812	g.mmap_addr = (struct netmap_d *) mmap(0, nmr.nr_memsize,
1813					    PROT_WRITE | PROT_READ,
1814					    MAP_SHARED, g.main_fd, 0);
1815	if (g.mmap_addr == MAP_FAILED) {
1816		D("Unable to mmap %d KB: %s", nmr.nr_memsize >> 10, strerror(errno));
1817		// continue, fail later
1818	}
1819
1820	if (verbose) {
1821		struct netmap_if *nifp = NETMAP_IF(g.mmap_addr, nmr.nr_offset);
1822
1823		D("nifp at offset %d, %d tx %d rx rings %s",
1824		    nmr.nr_offset, nmr.nr_tx_rings, nmr.nr_rx_rings,
1825		    nmr.nr_ringid & NETMAP_PRIV_MEM ? "PRIVATE" : "common" );
1826		for (i = 0; i <= nmr.nr_tx_rings; i++) {
1827			D("   TX%d at 0x%lx", i,
1828			    (char *)NETMAP_TXRING(nifp, i) - (char *)nifp);
1829		}
1830		for (i = 0; i <= nmr.nr_rx_rings; i++) {
1831			D("   RX%d at 0x%lx", i,
1832			    (char *)NETMAP_RXRING(nifp, i) - (char *)nifp);
1833		}
1834	}
1835
1836	/* Print some debug information. */
1837	fprintf(stdout,
1838		"%s %s: %d queues, %d threads and %d cpus.\n",
1839		(g.td_body == sender_body) ? "Sending on" : "Receiving from",
1840		g.ifname,
1841		devqueues,
1842		g.nthreads,
1843		g.cpus);
1844	if (g.td_body == sender_body) {
1845		fprintf(stdout, "%s -> %s (%s -> %s)\n",
1846			g.src_ip.name, g.dst_ip.name,
1847			g.src_mac.name, g.dst_mac.name);
1848	}
1849
1850	/* Exit if something went wrong. */
1851	if (g.main_fd < 0) {
1852		D("aborting");
1853		usage();
1854	}
1855    }
1856
1857
1858	if (g.options) {
1859		D("--- SPECIAL OPTIONS:%s%s%s%s%s\n",
1860			g.options & OPT_PREFETCH ? " prefetch" : "",
1861			g.options & OPT_ACCESS ? " access" : "",
1862			g.options & OPT_MEMCPY ? " memcpy" : "",
1863			g.options & OPT_INDIRECT ? " indirect" : "",
1864			g.options & OPT_COPY ? " copy" : "");
1865	}
1866
1867	g.tx_period.tv_sec = g.tx_period.tv_nsec = 0;
1868	if (g.tx_rate > 0) {
1869		/* try to have at least something every second,
1870		 * reducing the burst size to some 0.01s worth of data
1871		 * (but no less than one full set of fragments)
1872	 	 */
1873		uint64_t x;
1874		int lim = (g.tx_rate)/300;
1875		if (g.burst > lim)
1876			g.burst = lim;
1877		if (g.burst < g.frags)
1878			g.burst = g.frags;
1879		x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate;
1880		g.tx_period.tv_nsec = x;
1881		g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000;
1882		g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000;
1883	}
1884	if (g.td_body == sender_body)
1885	    D("Sending %d packets every  %ld.%09ld s",
1886			g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec);
1887	/* Wait for PHY reset. */
1888	D("Wait %d secs for phy reset", wait_link);
1889	sleep(wait_link);
1890	D("Ready...");
1891
1892	/* Install ^C handler. */
1893	global_nthreads = g.nthreads;
1894	signal(SIGINT, sigint_h);
1895
1896	start_threads(&g);
1897	main_thread(&g);
1898	return 0;
1899}
1900
1901/* end of file */
1902