1/*
2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *   1. Redistributions of source code must retain the above copyright
9 *      notice, this list of conditions and the following disclaimer.
10 *   2. Redistributions in binary form must reproduce the above copyright
11 *      notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27/*
28 * $FreeBSD$
29 * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $
30 *
31 * Example program to show how to build a multithreaded packet
32 * source/sink using the netmap device.
33 *
34 * In this example we create a programmable number of threads
35 * to take care of all the queues of the interface used to
36 * send or receive traffic.
37 *
38 */
39
40#define _GNU_SOURCE	/* for CPU_SET() */
41#include <stdio.h>
42#define NETMAP_WITH_LIBS
43#include <net/netmap_user.h>
44
45
46#include <ctype.h>	// isprint()
47#include <unistd.h>	// sysconf()
48#include <sys/poll.h>
49#include <arpa/inet.h>	/* ntohs */
50#include <sys/sysctl.h>	/* sysctl */
51#include <ifaddrs.h>	/* getifaddrs */
52#include <net/ethernet.h>
53#include <netinet/in.h>
54#include <netinet/ip.h>
55#include <netinet/udp.h>
56
57#include <pthread.h>
58
59#ifndef NO_PCAP
60#include <pcap/pcap.h>
61#endif
62
63#ifdef linux
64
65#define cpuset_t        cpu_set_t
66
67#define ifr_flagshigh  ifr_flags        /* only the low 16 bits here */
68#define IFF_PPROMISC   IFF_PROMISC      /* IFF_PPROMISC does not exist */
69#include <linux/ethtool.h>
70#include <linux/sockios.h>
71
72#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
73#include <netinet/ether.h>      /* ether_aton */
74#include <linux/if_packet.h>    /* sockaddr_ll */
75#endif  /* linux */
76
77#ifdef __FreeBSD__
78#include <sys/endian.h> /* le64toh */
79#include <machine/param.h>
80
81#include <pthread_np.h> /* pthread w/ affinity */
82#include <sys/cpuset.h> /* cpu_set */
83#include <net/if_dl.h>  /* LLADDR */
84#endif  /* __FreeBSD__ */
85
86#ifdef __APPLE__
87
#define cpuset_t        uint64_t        // XXX

/* Remove every CPU from the 64-bit affinity mask. */
static inline void CPU_ZERO(cpuset_t *p)
{
        *p = 0;
}

/*
 * Add CPU i to the mask. The mask holds 64 CPUs, so the index is taken
 * modulo 64. The shifted constant must be as wide as the mask: shifting
 * a plain int by 32 or more bits is undefined behavior.
 */
static inline void CPU_SET(uint32_t i, cpuset_t *p)
{
        *p |= (cpuset_t)1 << (i & 0x3f);
}
98
99#define pthread_setaffinity_np(a, b, c) ((void)a, 0)
100
101#define ifr_flagshigh  ifr_flags        // XXX
102#define IFF_PPROMISC   IFF_PROMISC
103#include <net/if_dl.h>  /* LLADDR */
104#define clock_gettime(a,b)      \
105        do {struct timespec t0 = {0,0}; *(b) = t0; } while (0)
106#endif  /* __APPLE__ */
107
/* Text replicated into the packet body when indirect buffers are not used. */
const char *default_payload =
	"netmap pkt-gen DIRECT payload\n"
	"http://info.iet.unipi.it/~luigi/netmap/ ";

/* Text replicated into the packet body when OPT_INDIRECT is set. */
const char *indirect_payload =
	"netmap pkt-gen indirect payload\n"
	"http://info.iet.unipi.it/~luigi/netmap/ ";

/* Non-zero enables the extra diagnostic messages. */
int verbose = 0;
115
116#define SKIP_PAYLOAD 1 /* do not check payload. XXX unused */
117
118
/* Lengths of the vnet-hdr variants that may precede a frame. */
#define VIRT_HDR_1	10	/* length of a base vnet-hdr */
#define VIRT_HDR_2	12	/* length of the extended vnet-hdr */
#define VIRT_HDR_MAX	VIRT_HDR_2

/* Raw storage large enough for the longest vnet-hdr variant. */
struct virt_header {
	uint8_t fields[VIRT_HDR_MAX];
};
125
126struct pkt {
127	struct virt_header vh;
128	struct ether_header eh;
129	struct ip ip;
130	struct udphdr udp;
131	uint8_t body[2048];	// XXX hardwired
132} __attribute__((__packed__));
133
/* A range of IPv4 addresses and UDP ports, filled in by extract_ip_range(). */
struct ip_range {
	char *name;		/* textual form of the range; parsed in place */
	uint32_t start;		/* first address, host byte order */
	uint32_t end;		/* last address, host byte order */
	uint16_t port0;		/* lowest port of the range */
	uint16_t port1;		/* highest port of the range */
};
139
/* A range of Ethernet addresses, filled in by extract_mac_range(). */
struct mac_range {
	char *name;			/* textual form of the address */
	struct ether_addr start;	/* first address of the range */
	struct ether_addr end;		/* last address of the range */
};
144
145/* ifname can be netmap:foo-xxxx */
146#define MAX_IFNAMELEN	64	/* our buffer for ifname */
147/*
148 * global arguments for all threads
149 */
150
151struct glob_arg {
152	struct ip_range src_ip;
153	struct ip_range dst_ip;
154	struct mac_range dst_mac;
155	struct mac_range src_mac;
156	int pkt_size;
157	int burst;
158	int forever;
159	int npackets;	/* total packets to send */
160	int frags;	/* fragments per packet */
161	int nthreads;
162	int cpus;
163	int options;	/* testing */
164#define OPT_PREFETCH	1
165#define OPT_ACCESS	2
166#define OPT_COPY	4
167#define OPT_MEMCPY	8
168#define OPT_TS		16	/* add a timestamp */
169#define OPT_INDIRECT	32	/* use indirect buffers, tx only */
170#define OPT_DUMP	64	/* dump rx/tx traffic */
171	int dev_type;
172#ifndef NO_PCAP
173	pcap_t *p;
174#endif
175
176	int tx_rate;
177	struct timespec tx_period;
178
179	int affinity;
180	int main_fd;
181	struct nm_desc *nmd;
182	uint64_t nmd_flags;
183	int report_interval;		/* milliseconds between prints */
184	void *(*td_body)(void *);
185	void *mmap_addr;
186	char ifname[MAX_IFNAMELEN];
187	char *nmr_config;
188	int dummy_send;
189	int virt_header;	/* send also the virt_header */
190	int extra_bufs;		/* goes in nr_arg3 */
191};
/* Values stored in glob_arg.dev_type. */
enum dev_type {
	DEV_NONE,
	DEV_NETMAP,
	DEV_PCAP,
	DEV_TAP
};
193
194
195/*
196 * Arguments for a new thread. The same structure is used by
197 * the source and the sink
198 */
199struct targ {
200	struct glob_arg *g;
201	int used;
202	int completed;
203	int cancel;
204	int fd;
205	struct nm_desc *nmd;
206	volatile uint64_t count;
207	struct timespec tic, toc;
208	int me;
209	pthread_t thread;
210	int affinity;
211
212	struct pkt pkt;
213};
214
215
216/*
217 * extract the extremes from a range of ipv4 addresses.
218 * addr_lo[-addr_hi][:port_lo[-port_hi]]
219 */
220static void
221extract_ip_range(struct ip_range *r)
222{
223	char *ap, *pp;
224	struct in_addr a;
225
226	if (verbose)
227		D("extract IP range from %s", r->name);
228	r->port0 = r->port1 = 0;
229	r->start = r->end = 0;
230
231	/* the first - splits start/end of range */
232	ap = index(r->name, '-');	/* do we have ports ? */
233	if (ap) {
234		*ap++ = '\0';
235	}
236	/* grab the initial values (mandatory) */
237	pp = index(r->name, ':');
238	if (pp) {
239		*pp++ = '\0';
240		r->port0 = r->port1 = strtol(pp, NULL, 0);
241	};
242	inet_aton(r->name, &a);
243	r->start = r->end = ntohl(a.s_addr);
244	if (ap) {
245		pp = index(ap, ':');
246		if (pp) {
247			*pp++ = '\0';
248			if (*pp)
249				r->port1 = strtol(pp, NULL, 0);
250		}
251		if (*ap) {
252			inet_aton(ap, &a);
253			r->end = ntohl(a.s_addr);
254		}
255	}
256	if (r->port0 > r->port1) {
257		uint16_t tmp = r->port0;
258		r->port0 = r->port1;
259		r->port1 = tmp;
260	}
261	if (r->start > r->end) {
262		uint32_t tmp = r->start;
263		r->start = r->end;
264		r->end = tmp;
265	}
266	{
267		struct in_addr a;
268		char buf1[16]; // one ip address
269
270		a.s_addr = htonl(r->end);
271		strncpy(buf1, inet_ntoa(a), sizeof(buf1));
272		a.s_addr = htonl(r->start);
273		if (1)
274		    D("range is %s:%d to %s:%d",
275			inet_ntoa(a), r->port0, buf1, r->port1);
276	}
277}
278
279static void
280extract_mac_range(struct mac_range *r)
281{
282	if (verbose)
283	    D("extract MAC range from %s", r->name);
284	bcopy(ether_aton(r->name), &r->start, 6);
285	bcopy(ether_aton(r->name), &r->end, 6);
286#if 0
287	bcopy(targ->src_mac, eh->ether_shost, 6);
288	p = index(targ->g->src_mac, '-');
289	if (p)
290		targ->src_mac_range = atoi(p+1);
291
292	bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6);
293	bcopy(targ->dst_mac, eh->ether_dhost, 6);
294	p = index(targ->g->dst_mac, '-');
295	if (p)
296		targ->dst_mac_range = atoi(p+1);
297#endif
298	if (verbose)
299		D("%s starts at %s", r->name, ether_ntoa(&r->start));
300}
301
302static struct targ *targs;
303static int global_nthreads;
304
305/* control-C handler */
306static void
307sigint_h(int sig)
308{
309	int i;
310
311	(void)sig;	/* UNUSED */
312	for (i = 0; i < global_nthreads; i++) {
313		targs[i].cancel = 1;
314	}
315	signal(SIGINT, SIG_DFL);
316}
317
/*
 * Return the number of active CPUs: hw.ncpu on FreeBSD, the number of
 * online processors on Linux. On other systems, or when the query
 * fails, 1 is returned.
 */
static int
system_ncpus(void)
{
	int ncpus = 1;	/* fallback when the count cannot be obtained */
#if defined (__FreeBSD__)
	int mib[2] = { CTL_HW, HW_NCPU };
	size_t len = sizeof(ncpus);	/* size of the result, not of mib */

	if (sysctl(mib, 2, &ncpus, &len, NULL, 0) != 0 || ncpus < 1)
		ncpus = 1;
#elif defined(linux) || defined(__linux__)
	/* "linux" is not predefined in strict ISO mode, "__linux__" is */
	long n = sysconf(_SC_NPROCESSORS_ONLN);

	if (n >= 1)
		ncpus = (int)n;
#endif /* others */
	return (ncpus);
}
334
#ifdef __linux__
/* Map the BSD link-layer sockaddr names used in this file onto Linux ones. */
#define sockaddr_dl    sockaddr_ll
#define sdl_family     sll_family
#define AF_LINK        AF_PACKET
/*
 * Expands to a parenthesized expression with no trailing ';' so that it
 * can be used inside larger expressions and with any pointer argument.
 */
#define LLADDR(s)      ((s)->sll_addr)
#include <linux/if_tun.h>
#define TAP_CLONEDEV	"/dev/net/tun"
#endif /* __linux__ */
343
344#ifdef __FreeBSD__
345#include <net/if_tun.h>
346#define TAP_CLONEDEV	"/dev/tap"
347#endif /* __FreeBSD */
348
349#ifdef __APPLE__
350// #warning TAP not supported on apple ?
351#include <net/if_utun.h>
352#define TAP_CLONEDEV	"/dev/tap"
353#endif /* __APPLE__ */
354
355
356/*
357 * parse the vale configuration in conf and put it in nmr.
358 * Return the flag set if necessary.
359 * The configuration may consist of 0 to 4 numbers separated
360 * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings.
361 * Missing numbers or zeroes stand for default values.
362 * As an additional convenience, if exactly one number
363 * is specified, then this is assigned to both #tx-slots and #rx-slots.
364 * If there is no 4th number, then the 3rd is assigned to both #tx-rings
365 * and #rx-rings.
366 */
367int
368parse_nmr_config(const char* conf, struct nmreq *nmr)
369{
370	char *w, *tok;
371	int i, v;
372
373	nmr->nr_tx_rings = nmr->nr_rx_rings = 0;
374	nmr->nr_tx_slots = nmr->nr_rx_slots = 0;
375	if (conf == NULL || ! *conf)
376		return 0;
377	w = strdup(conf);
378	for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) {
379		v = atoi(tok);
380		switch (i) {
381		case 0:
382			nmr->nr_tx_slots = nmr->nr_rx_slots = v;
383			break;
384		case 1:
385			nmr->nr_rx_slots = v;
386			break;
387		case 2:
388			nmr->nr_tx_rings = nmr->nr_rx_rings = v;
389			break;
390		case 3:
391			nmr->nr_rx_rings = v;
392			break;
393		default:
394			D("ignored config: %s", tok);
395			break;
396		}
397	}
398	D("txr %d txd %d rxr %d rxd %d",
399			nmr->nr_tx_rings, nmr->nr_tx_slots,
400			nmr->nr_rx_rings, nmr->nr_rx_slots);
401	free(w);
402	return (nmr->nr_tx_rings || nmr->nr_tx_slots ||
403                        nmr->nr_rx_rings || nmr->nr_rx_slots) ?
404		NM_OPEN_RING_CFG : 0;
405}
406
407
408/*
409 * locate the src mac address for our interface, put it
410 * into the user-supplied buffer. return 0 if ok, -1 on error.
411 */
412static int
413source_hwaddr(const char *ifname, char *buf)
414{
415	struct ifaddrs *ifaphead, *ifap;
416	int l = sizeof(ifap->ifa_name);
417
418	if (getifaddrs(&ifaphead) != 0) {
419		D("getifaddrs %s failed", ifname);
420		return (-1);
421	}
422
423	for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) {
424		struct sockaddr_dl *sdl =
425			(struct sockaddr_dl *)ifap->ifa_addr;
426		uint8_t *mac;
427
428		if (!sdl || sdl->sdl_family != AF_LINK)
429			continue;
430		if (strncmp(ifap->ifa_name, ifname, l) != 0)
431			continue;
432		mac = (uint8_t *)LLADDR(sdl);
433		sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
434			mac[0], mac[1], mac[2],
435			mac[3], mac[4], mac[5]);
436		if (verbose)
437			D("source hwaddr %s", buf);
438		break;
439	}
440	freeifaddrs(ifaphead);
441	return ifap ? 0 : 1;
442}
443
444
445/* set the thread affinity. */
446static int
447setaffinity(pthread_t me, int i)
448{
449	cpuset_t cpumask;
450
451	if (i == -1)
452		return 0;
453
454	/* Set thread affinity affinity.*/
455	CPU_ZERO(&cpumask);
456	CPU_SET(i, &cpumask);
457
458	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
459		D("Unable to set affinity: %s", strerror(errno));
460		return 1;
461	}
462	return 0;
463}
464
/*
 * Add the len bytes at data, taken as big-endian 16-bit words, to the
 * partial sum passed in, folding the carry back after every addition.
 * The result is the running sum, not yet complemented (see wrapsum()).
 *
 * Words are assembled one byte at a time, so data may have any
 * alignment and no integer-typed pointer is formed over it.
 */
static uint16_t
checksum(const void *data, uint16_t len, uint32_t sum)
{
	const uint8_t *addr = data;
	uint32_t i;

	/* Checksum all the pairs of bytes first... */
	for (i = 0; i < (len & ~1U); i += 2) {
		sum += ((uint32_t)addr[i] << 8) | addr[i + 1];
		if (sum > 0xFFFF)
			sum -= 0xFFFF;
	}
	/*
	 * If there's a single byte left over, checksum it, too.
	 * Network byte order is big-endian, so the remaining byte is
	 * the high byte.
	 */
	if (i < len) {
		sum += (uint32_t)addr[i] << 8;
		if (sum > 0xFFFF)
			sum -= 0xFFFF;
	}
	return sum;
}
490
/*
 * Finish a checksum: complement the low 16 bits of the running sum and
 * return the result in network byte order.
 */
static u_int16_t
wrapsum(u_int32_t sum)
{
	const u_int32_t complemented = ~sum & 0xFFFF;

	return (htons(complemented));
}
497
498/* Check the payload of the packet for errors (use it for debug).
499 * Look for consecutive ascii representations of the size of the packet.
500 */
501static void
502dump_payload(char *p, int len, struct netmap_ring *ring, int cur)
503{
504	char buf[128];
505	int i, j, i0;
506
507	/* get the length in ASCII of the length of the packet. */
508
509	printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n",
510		ring, cur, ring->slot[cur].buf_idx,
511		ring->slot[cur].flags, len);
512	/* hexdump routine */
513	for (i = 0; i < len; ) {
514		memset(buf, sizeof(buf), ' ');
515		sprintf(buf, "%5d: ", i);
516		i0 = i;
517		for (j=0; j < 16 && i < len; i++, j++)
518			sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i]));
519		i = i0;
520		for (j=0; j < 16 && i < len; i++, j++)
521			sprintf(buf+7+j + 48, "%c",
522				isprint(p[i]) ? p[i] : '.');
523		printf("%s\n", buf);
524	}
525}
526
527/*
528 * Fill a packet with some payload.
529 * We create a UDP packet so the payload starts at
530 *	14+20+8 = 42 bytes.
531 */
532#ifdef __linux__
533#define uh_sport source
534#define uh_dport dest
535#define uh_ulen len
536#define uh_sum check
537#endif /* linux */
538
539/*
540 * increment the addressed in the packet,
541 * starting from the least significant field.
542 *	DST_IP DST_PORT SRC_IP SRC_PORT
543 */
544static void
545update_addresses(struct pkt *pkt, struct glob_arg *g)
546{
547	uint32_t a;
548	uint16_t p;
549	struct ip *ip = &pkt->ip;
550	struct udphdr *udp = &pkt->udp;
551
552    do {
553	p = ntohs(udp->uh_sport);
554	if (p < g->src_ip.port1) { /* just inc, no wrap */
555		udp->uh_sport = htons(p + 1);
556		break;
557	}
558	udp->uh_sport = htons(g->src_ip.port0);
559
560	a = ntohl(ip->ip_src.s_addr);
561	if (a < g->src_ip.end) { /* just inc, no wrap */
562		ip->ip_src.s_addr = htonl(a + 1);
563		break;
564	}
565	ip->ip_src.s_addr = htonl(g->src_ip.start);
566
567	udp->uh_sport = htons(g->src_ip.port0);
568	p = ntohs(udp->uh_dport);
569	if (p < g->dst_ip.port1) { /* just inc, no wrap */
570		udp->uh_dport = htons(p + 1);
571		break;
572	}
573	udp->uh_dport = htons(g->dst_ip.port0);
574
575	a = ntohl(ip->ip_dst.s_addr);
576	if (a < g->dst_ip.end) { /* just inc, no wrap */
577		ip->ip_dst.s_addr = htonl(a + 1);
578		break;
579	}
580	ip->ip_dst.s_addr = htonl(g->dst_ip.start);
581    } while (0);
582    // update checksum
583}
584
585/*
586 * initialize one packet and prepare for the next one.
587 * The copy could be done better instead of repeating it each time.
588 */
589static void
590initialize_packet(struct targ *targ)
591{
592	struct pkt *pkt = &targ->pkt;
593	struct ether_header *eh;
594	struct ip *ip;
595	struct udphdr *udp;
596	uint16_t paylen = targ->g->pkt_size - sizeof(*eh) - sizeof(struct ip);
597	const char *payload = targ->g->options & OPT_INDIRECT ?
598		indirect_payload : default_payload;
599	int i, l0 = strlen(payload);
600
601	/* create a nice NUL-terminated string */
602	for (i = 0; i < paylen; i += l0) {
603		if (l0 > paylen - i)
604			l0 = paylen - i; // last round
605		bcopy(payload, pkt->body + i, l0);
606	}
607	pkt->body[i-1] = '\0';
608	ip = &pkt->ip;
609
610	/* prepare the headers */
611        ip->ip_v = IPVERSION;
612        ip->ip_hl = 5;
613        ip->ip_id = 0;
614        ip->ip_tos = IPTOS_LOWDELAY;
615	ip->ip_len = ntohs(targ->g->pkt_size - sizeof(*eh));
616        ip->ip_id = 0;
617        ip->ip_off = htons(IP_DF); /* Don't fragment */
618        ip->ip_ttl = IPDEFTTL;
619	ip->ip_p = IPPROTO_UDP;
620	ip->ip_dst.s_addr = htonl(targ->g->dst_ip.start);
621	ip->ip_src.s_addr = htonl(targ->g->src_ip.start);
622	ip->ip_sum = wrapsum(checksum(ip, sizeof(*ip), 0));
623
624
625	udp = &pkt->udp;
626        udp->uh_sport = htons(targ->g->src_ip.port0);
627        udp->uh_dport = htons(targ->g->dst_ip.port0);
628	udp->uh_ulen = htons(paylen);
629	/* Magic: taken from sbin/dhclient/packet.c */
630	udp->uh_sum = wrapsum(checksum(udp, sizeof(*udp),
631                    checksum(pkt->body,
632                        paylen - sizeof(*udp),
633                        checksum(&ip->ip_src, 2 * sizeof(ip->ip_src),
634                            IPPROTO_UDP + (u_int32_t)ntohs(udp->uh_ulen)
635                        )
636                    )
637                ));
638
639	eh = &pkt->eh;
640	bcopy(&targ->g->src_mac.start, eh->ether_shost, 6);
641	bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6);
642	eh->ether_type = htons(ETHERTYPE_IP);
643
644	bzero(&pkt->vh, sizeof(pkt->vh));
645	// dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0);
646}
647
648
649
650/*
651 * create and enqueue a batch of packets on a ring.
652 * On the last one set NS_REPORT to tell the driver to generate
653 * an interrupt when done.
654 */
655static int
656send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
657		int size, struct glob_arg *g, u_int count, int options,
658		u_int nfrags)
659{
660	u_int n, sent, cur = ring->cur;
661	u_int fcnt;
662
663	n = nm_ring_space(ring);
664	if (n < count)
665		count = n;
666	if (count < nfrags) {
667		D("truncating packet, no room for frags %d %d",
668				count, nfrags);
669	}
670#if 0
671	if (options & (OPT_COPY | OPT_PREFETCH) ) {
672		for (sent = 0; sent < count; sent++) {
673			struct netmap_slot *slot = &ring->slot[cur];
674			char *p = NETMAP_BUF(ring, slot->buf_idx);
675
676			__builtin_prefetch(p);
677			cur = nm_ring_next(ring, cur);
678		}
679		cur = ring->cur;
680	}
681#endif
682	for (fcnt = nfrags, sent = 0; sent < count; sent++) {
683		struct netmap_slot *slot = &ring->slot[cur];
684		char *p = NETMAP_BUF(ring, slot->buf_idx);
685
686		slot->flags = 0;
687		if (options & OPT_INDIRECT) {
688			slot->flags |= NS_INDIRECT;
689			slot->ptr = (uint64_t)frame;
690		} else if (options & OPT_COPY) {
691			nm_pkt_copy(frame, p, size);
692			if (fcnt == nfrags)
693				update_addresses(pkt, g);
694		} else if (options & OPT_MEMCPY) {
695			memcpy(p, frame, size);
696			if (fcnt == nfrags)
697				update_addresses(pkt, g);
698		} else if (options & OPT_PREFETCH) {
699			__builtin_prefetch(p);
700		}
701		if (options & OPT_DUMP)
702			dump_payload(p, size, ring, cur);
703		slot->len = size;
704		if (--fcnt > 0)
705			slot->flags |= NS_MOREFRAG;
706		else
707			fcnt = nfrags;
708		if (sent == count - 1) {
709			slot->flags &= ~NS_MOREFRAG;
710			slot->flags |= NS_REPORT;
711		}
712		cur = nm_ring_next(ring, cur);
713	}
714	ring->head = ring->cur = cur;
715
716	return (sent);
717}
718
719/*
720 * Send a packet, and wait for a response.
721 * The payload (after UDP header, ofs 42) has a 4-byte sequence
722 * followed by a struct timeval (or bintime?)
723 */
724#define	PAY_OFS	42	/* where in the pkt... */
725
726static void *
727pinger_body(void *data)
728{
729	struct targ *targ = (struct targ *) data;
730	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
731	struct netmap_if *nifp = targ->nmd->nifp;
732	int i, rx = 0, n = targ->g->npackets;
733	void *frame;
734	int size;
735	uint32_t sent = 0;
736	struct timespec ts, now, last_print;
737	uint32_t count = 0, min = 1000000000, av = 0;
738
739	frame = &targ->pkt;
740	frame += sizeof(targ->pkt.vh) - targ->g->virt_header;
741	size = targ->g->pkt_size + targ->g->virt_header;
742
743
744	if (targ->g->nthreads > 1) {
745		D("can only ping with 1 thread");
746		return NULL;
747	}
748
749	clock_gettime(CLOCK_REALTIME_PRECISE, &last_print);
750	now = last_print;
751	while (n == 0 || (int)sent < n) {
752		struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
753		struct netmap_slot *slot;
754		char *p;
755	    for (i = 0; i < 1; i++) { /* XXX why the loop for 1 pkt ? */
756		slot = &ring->slot[ring->cur];
757		slot->len = size;
758		p = NETMAP_BUF(ring, slot->buf_idx);
759
760		if (nm_ring_empty(ring)) {
761			D("-- ouch, cannot send");
762		} else {
763			nm_pkt_copy(frame, p, size);
764			clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
765			bcopy(&sent, p+42, sizeof(sent));
766			bcopy(&ts, p+46, sizeof(ts));
767			sent++;
768			ring->head = ring->cur = nm_ring_next(ring, ring->cur);
769		}
770	    }
771		/* should use a parameter to decide how often to send */
772		if (poll(&pfd, 1, 3000) <= 0) {
773			D("poll error/timeout on queue %d: %s", targ->me,
774				strerror(errno));
775			continue;
776		}
777		/* see what we got back */
778		for (i = targ->nmd->first_tx_ring;
779			i <= targ->nmd->last_tx_ring; i++) {
780			ring = NETMAP_RXRING(nifp, i);
781			while (!nm_ring_empty(ring)) {
782				uint32_t seq;
783				slot = &ring->slot[ring->cur];
784				p = NETMAP_BUF(ring, slot->buf_idx);
785
786				clock_gettime(CLOCK_REALTIME_PRECISE, &now);
787				bcopy(p+42, &seq, sizeof(seq));
788				bcopy(p+46, &ts, sizeof(ts));
789				ts.tv_sec = now.tv_sec - ts.tv_sec;
790				ts.tv_nsec = now.tv_nsec - ts.tv_nsec;
791				if (ts.tv_nsec < 0) {
792					ts.tv_nsec += 1000000000;
793					ts.tv_sec--;
794				}
795				if (1) D("seq %d/%d delta %d.%09d", seq, sent,
796					(int)ts.tv_sec, (int)ts.tv_nsec);
797				if (ts.tv_nsec < (int)min)
798					min = ts.tv_nsec;
799				count ++;
800				av += ts.tv_nsec;
801				ring->head = ring->cur = nm_ring_next(ring, ring->cur);
802				rx++;
803			}
804		}
805		//D("tx %d rx %d", sent, rx);
806		//usleep(100000);
807		ts.tv_sec = now.tv_sec - last_print.tv_sec;
808		ts.tv_nsec = now.tv_nsec - last_print.tv_nsec;
809		if (ts.tv_nsec < 0) {
810			ts.tv_nsec += 1000000000;
811			ts.tv_sec--;
812		}
813		if (ts.tv_sec >= 1) {
814			D("count %d min %d av %d",
815				count, min, av/count);
816			count = 0;
817			av = 0;
818			min = 100000000;
819			last_print = now;
820		}
821	}
822	return NULL;
823}
824
825
826/*
827 * reply to ping requests
828 */
829static void *
830ponger_body(void *data)
831{
832	struct targ *targ = (struct targ *) data;
833	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
834	struct netmap_if *nifp = targ->nmd->nifp;
835	struct netmap_ring *txring, *rxring;
836	int i, rx = 0, sent = 0, n = targ->g->npackets;
837
838	if (targ->g->nthreads > 1) {
839		D("can only reply ping with 1 thread");
840		return NULL;
841	}
842	D("understood ponger %d but don't know how to do it", n);
843	while (n == 0 || sent < n) {
844		uint32_t txcur, txavail;
845//#define BUSYWAIT
846#ifdef BUSYWAIT
847		ioctl(pfd.fd, NIOCRXSYNC, NULL);
848#else
849		if (poll(&pfd, 1, 1000) <= 0) {
850			D("poll error/timeout on queue %d: %s", targ->me,
851				strerror(errno));
852			continue;
853		}
854#endif
855		txring = NETMAP_TXRING(nifp, 0);
856		txcur = txring->cur;
857		txavail = nm_ring_space(txring);
858		/* see what we got back */
859		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
860			rxring = NETMAP_RXRING(nifp, i);
861			while (!nm_ring_empty(rxring)) {
862				uint16_t *spkt, *dpkt;
863				uint32_t cur = rxring->cur;
864				struct netmap_slot *slot = &rxring->slot[cur];
865				char *src, *dst;
866				src = NETMAP_BUF(rxring, slot->buf_idx);
867				//D("got pkt %p of size %d", src, slot->len);
868				rxring->head = rxring->cur = nm_ring_next(rxring, cur);
869				rx++;
870				if (txavail == 0)
871					continue;
872				dst = NETMAP_BUF(txring,
873				    txring->slot[txcur].buf_idx);
874				/* copy... */
875				dpkt = (uint16_t *)dst;
876				spkt = (uint16_t *)src;
877				nm_pkt_copy(src, dst, slot->len);
878				dpkt[0] = spkt[3];
879				dpkt[1] = spkt[4];
880				dpkt[2] = spkt[5];
881				dpkt[3] = spkt[0];
882				dpkt[4] = spkt[1];
883				dpkt[5] = spkt[2];
884				txring->slot[txcur].len = slot->len;
885				/* XXX swap src dst mac */
886				txcur = nm_ring_next(txring, txcur);
887				txavail--;
888				sent++;
889			}
890		}
891		txring->head = txring->cur = txcur;
892		targ->count = sent;
893#ifdef BUSYWAIT
894		ioctl(pfd.fd, NIOCTXSYNC, NULL);
895#endif
896		//D("tx %d rx %d", sent, rx);
897	}
898	return NULL;
899}
900
/* Return 1 if time a is later than or equal to time b, 0 otherwise. */
static __inline int
timespec_ge(const struct timespec *a, const struct timespec *b)
{
	/* the seconds decide unless they are equal */
	if (a->tv_sec != b->tv_sec)
		return (a->tv_sec > b->tv_sec);
	return (a->tv_nsec >= b->tv_nsec);
}
913
/* Convert a struct timeval (microseconds) to a struct timespec (nanoseconds). */
static __inline struct timespec
timeval2spec(const struct timeval *a)
{
	struct timespec out;

	out.tv_sec = a->tv_sec;
	out.tv_nsec = a->tv_usec * 1000;
	return out;
}
923
/*
 * Convert a struct timespec to a struct timeval; the nanoseconds are
 * truncated to whole microseconds.
 */
static __inline struct timeval
timespec2val(const struct timespec *a)
{
	struct timeval out;

	out.tv_sec = a->tv_sec;
	out.tv_usec = a->tv_nsec / 1000;
	return out;
}
933
934
/*
 * Return a + b. A single carry from nanoseconds into seconds is
 * applied, which is enough when both inputs are normalized.
 */
static __inline struct timespec
timespec_add(struct timespec a, struct timespec b)
{
	struct timespec sum;

	sum.tv_sec = a.tv_sec + b.tv_sec;
	sum.tv_nsec = a.tv_nsec + b.tv_nsec;
	if (sum.tv_nsec >= 1000000000) {
		sum.tv_nsec -= 1000000000;
		sum.tv_sec += 1;
	}
	return sum;
}
945
/*
 * Return a - b. A single borrow from seconds into nanoseconds is
 * applied; when a is earlier than b the result has a negative tv_sec.
 */
static __inline struct timespec
timespec_sub(struct timespec a, struct timespec b)
{
	struct timespec diff;

	diff.tv_sec = a.tv_sec - b.tv_sec;
	diff.tv_nsec = a.tv_nsec - b.tv_nsec;
	if (diff.tv_nsec < 0) {
		diff.tv_nsec += 1000000000;
		diff.tv_sec -= 1;
	}
	return diff;
}
956
957
958/*
959 * wait until ts, either busy or sleeping if more than 1ms.
960 * Return wakeup time.
961 */
962static struct timespec
963wait_time(struct timespec ts)
964{
965	for (;;) {
966		struct timespec w, cur;
967		clock_gettime(CLOCK_REALTIME_PRECISE, &cur);
968		w = timespec_sub(ts, cur);
969		if (w.tv_sec < 0)
970			return cur;
971		else if (w.tv_sec > 0 || w.tv_nsec > 1000000)
972			poll(NULL, 0, 1);
973	}
974}
975
976static void *
977sender_body(void *data)
978{
979	struct targ *targ = (struct targ *) data;
980	struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
981	struct netmap_if *nifp = targ->nmd->nifp;
982	struct netmap_ring *txring;
983	int i, n = targ->g->npackets / targ->g->nthreads;
984	int64_t sent = 0;
985	int options = targ->g->options | OPT_COPY;
986	struct timespec nexttime = { 0, 0}; // XXX silence compiler
987	int rate_limit = targ->g->tx_rate;
988	struct pkt *pkt = &targ->pkt;
989	void *frame;
990	int size;
991
992	frame = pkt;
993	frame += sizeof(pkt->vh) - targ->g->virt_header;
994	size = targ->g->pkt_size + targ->g->virt_header;
995
996	D("start");
997	if (setaffinity(targ->thread, targ->affinity))
998		goto quit;
999
1000	/* main loop.*/
1001	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1002	if (rate_limit) {
1003		targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
1004		targ->tic.tv_nsec = 0;
1005		wait_time(targ->tic);
1006		nexttime = targ->tic;
1007	}
1008        if (targ->g->dev_type == DEV_TAP) {
1009	    D("writing to file desc %d", targ->g->main_fd);
1010
1011	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1012		if (write(targ->g->main_fd, frame, size) != -1)
1013			sent++;
1014		update_addresses(pkt, targ->g);
1015		if (i > 10000) {
1016			targ->count = sent;
1017			i = 0;
1018		}
1019	    }
1020#ifndef NO_PCAP
1021    } else if (targ->g->dev_type == DEV_PCAP) {
1022	    pcap_t *p = targ->g->p;
1023
1024	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1025		if (pcap_inject(p, frame, size) != -1)
1026			sent++;
1027		update_addresses(pkt, targ->g);
1028		if (i > 10000) {
1029			targ->count = sent;
1030			i = 0;
1031		}
1032	    }
1033#endif /* NO_PCAP */
1034    } else {
1035	int tosend = 0;
1036	int frags = targ->g->frags;
1037
1038	while (!targ->cancel && (n == 0 || sent < n)) {
1039
1040		if (rate_limit && tosend <= 0) {
1041			tosend = targ->g->burst;
1042			nexttime = timespec_add(nexttime, targ->g->tx_period);
1043			wait_time(nexttime);
1044		}
1045
1046		/*
1047		 * wait for available room in the send queue(s)
1048		 */
1049		if (poll(&pfd, 1, 2000) <= 0) {
1050			if (targ->cancel)
1051				break;
1052			D("poll error/timeout on queue %d: %s", targ->me,
1053				strerror(errno));
1054			// goto quit;
1055		}
1056		if (pfd.revents & POLLERR) {
1057			D("poll error");
1058			goto quit;
1059		}
1060		/*
1061		 * scan our queues and send on those with room
1062		 */
1063		if (options & OPT_COPY && sent > 100000 && !(targ->g->options & OPT_COPY) ) {
1064			D("drop copy");
1065			options &= ~OPT_COPY;
1066		}
1067		for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1068			int m, limit = rate_limit ?  tosend : targ->g->burst;
1069			if (n > 0 && n - sent < limit)
1070				limit = n - sent;
1071			txring = NETMAP_TXRING(nifp, i);
1072			if (nm_ring_empty(txring))
1073				continue;
1074			if (frags > 1)
1075				limit = ((limit + frags - 1) / frags) * frags;
1076
1077			m = send_packets(txring, pkt, frame, size, targ->g,
1078					 limit, options, frags);
1079			ND("limit %d tail %d frags %d m %d",
1080				limit, txring->tail, frags, m);
1081			sent += m;
1082			targ->count = sent;
1083			if (rate_limit) {
1084				tosend -= m;
1085				if (tosend <= 0)
1086					break;
1087			}
1088		}
1089	}
1090	/* flush any remaining packets */
1091	ioctl(pfd.fd, NIOCTXSYNC, NULL);
1092
1093	/* final part: wait all the TX queues to be empty. */
1094	for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1095		txring = NETMAP_TXRING(nifp, i);
1096		while (nm_tx_pending(txring)) {
1097			ioctl(pfd.fd, NIOCTXSYNC, NULL);
1098			usleep(1); /* wait 1 tick */
1099		}
1100	}
1101    } /* end DEV_NETMAP */
1102
1103	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1104	targ->completed = 1;
1105	targ->count = sent;
1106
1107quit:
1108	/* reset the ``used`` flag. */
1109	targ->used = 0;
1110
1111	return (NULL);
1112}
1113
1114
1115#ifndef NO_PCAP
1116static void
1117receive_pcap(u_char *user, const struct pcap_pkthdr * h,
1118	const u_char * bytes)
1119{
1120	int *count = (int *)user;
1121	(void)h;	/* UNUSED */
1122	(void)bytes;	/* UNUSED */
1123	(*count)++;
1124}
1125#endif /* !NO_PCAP */
1126
1127static int
1128receive_packets(struct netmap_ring *ring, u_int limit, int dump)
1129{
1130	u_int cur, rx, n;
1131
1132	cur = ring->cur;
1133	n = nm_ring_space(ring);
1134	if (n < limit)
1135		limit = n;
1136	for (rx = 0; rx < limit; rx++) {
1137		struct netmap_slot *slot = &ring->slot[cur];
1138		char *p = NETMAP_BUF(ring, slot->buf_idx);
1139
1140		if (dump)
1141			dump_payload(p, slot->len, ring, cur);
1142
1143		cur = nm_ring_next(ring, cur);
1144	}
1145	ring->head = ring->cur = cur;
1146
1147	return (rx);
1148}
1149
1150static void *
1151receiver_body(void *data)
1152{
1153	struct targ *targ = (struct targ *) data;
1154	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1155	struct netmap_if *nifp = targ->nmd->nifp;
1156	struct netmap_ring *rxring;
1157	int i;
1158	uint64_t received = 0;
1159
1160	if (setaffinity(targ->thread, targ->affinity))
1161		goto quit;
1162
1163	/* unbounded wait for the first packet. */
1164	for (;;) {
1165		i = poll(&pfd, 1, 1000);
1166		if (i > 0 && !(pfd.revents & POLLERR))
1167			break;
1168		RD(1, "waiting for initial packets, poll returns %d %d",
1169			i, pfd.revents);
1170	}
1171
1172	/* main loop, exit after 1s silence */
1173	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1174    if (targ->g->dev_type == DEV_TAP) {
1175	D("reading from %s fd %d", targ->g->ifname, targ->g->main_fd);
1176	while (!targ->cancel) {
1177		char buf[2048];
1178		/* XXX should we poll ? */
1179		if (read(targ->g->main_fd, buf, sizeof(buf)) > 0)
1180			targ->count++;
1181	}
1182#ifndef NO_PCAP
1183    } else if (targ->g->dev_type == DEV_PCAP) {
1184	while (!targ->cancel) {
1185		/* XXX should we poll ? */
1186		pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, NULL);
1187	}
1188#endif /* !NO_PCAP */
1189    } else {
1190	int dump = targ->g->options & OPT_DUMP;
1191	while (!targ->cancel) {
1192		/* Once we started to receive packets, wait at most 1 seconds
1193		   before quitting. */
1194		if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
1195			clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1196			targ->toc.tv_sec -= 1; /* Subtract timeout time. */
1197			goto out;
1198		}
1199
1200		if (pfd.revents & POLLERR) {
1201			D("poll err");
1202			goto quit;
1203		}
1204
1205		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
1206			int m;
1207
1208			rxring = NETMAP_RXRING(nifp, i);
1209			if (nm_ring_empty(rxring))
1210				continue;
1211
1212			m = receive_packets(rxring, targ->g->burst, dump);
1213			received += m;
1214		}
1215		targ->count = received;
1216	}
1217    }
1218
1219	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1220
1221out:
1222	targ->completed = 1;
1223	targ->count = received;
1224
1225quit:
1226	/* reset the ``used`` flag. */
1227	targ->used = 0;
1228
1229	return (NULL);
1230}
1231
/*
 * Very crude code to print a number in normalized form: the value is
 * divided by 1000 until its magnitude drops below 1000 (or the largest
 * known unit, "T", is reached) and then formatted as "<value> <unit>"
 * with two decimals.  Values below 1000 get an empty unit, so the
 * result ends with a blank.
 *
 * buf - destination buffer; the caller has to make sure that it is
 *       large enough, no bound is checked here.
 * val - number to format; negative numbers are scaled by magnitude.
 *
 * Returns buf.
 */
static const char *
norm(char *buf, double val)
{
	static const char *const units[] = { "", "K", "M", "G", "T" };
	const unsigned int last = sizeof(units) / sizeof(units[0]) - 1;
	unsigned int i = 0;

	while ((val >= 1000 || val <= -1000) && i < last) {
		val /= 1000;
		i++;
	}
	sprintf(buf, "%.2f %s", val, units[i]);
	return buf;
}
1246
/*
 * Print the transmit summary on stdout.
 *
 * sent  - number of packets sent.
 * size  - packet size in bytes as configured by the user.
 * delta - elapsed time in seconds.
 *
 * The first line reports the raw arguments.  For the rate computation a
 * zero interval is replaced by 1e-6 s and sizes below 60 bytes are
 * rounded up to 60.
 */
static void
tx_output(uint64_t sent, int size, double delta)
{
	char pps_str[40], bw_str[80], raw_str[80];
	double elapsed, pkt_rate, payload_bps, wire_bps;
	int frame_size;

	printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n",
	       (unsigned long long)sent, size, delta);

	/* avoid a division by zero */
	elapsed = (delta == 0) ? 1e-6 : delta;
	/* correct for min packet size */
	frame_size = (size < 60) ? 60 : size;

	pkt_rate = sent / elapsed;
	payload_bps = (8.0 * frame_size * sent) / elapsed;
	/* raw packets have 4 bytes crc + 20 bytes framing */
	wire_bps = (8.0 * (frame_size + 24) * sent) / elapsed;

	norm(pps_str, pkt_rate);
	norm(bw_str, payload_bps);
	norm(raw_str, wire_bps);
	printf("Speed: %spps Bandwidth: %sbps (raw %sbps)\n",
		pps_str, bw_str, raw_str);
}
1267
1268
/*
 * Print the receive summary on stdout.
 *
 * received - number of packets received.
 * delta    - elapsed time in seconds; a zero interval is replaced by
 *            1e-6 s when computing the packet rate.
 */
static void
rx_output(uint64_t received, double delta)
{
	char rate_str[40];
	double elapsed = delta;

	printf("Received %llu packets, in %.2f seconds.\n",
		(unsigned long long) received, delta);

	/* avoid a division by zero */
	if (elapsed == 0)
		elapsed = 1e-6;
	printf("Speed: %spps\n", norm(rate_str, received / elapsed));
}
1283
/*
 * Print the command line summary on stderr and terminate the program.
 *
 * The list mirrors the options accepted by the getopt() string in
 * main(); keep the two in sync when adding or removing options.
 *
 * NOTE(review): the exit status is 0 although the visible callers are
 * all error paths; confirm whether a non-zero status is wanted.
 */
static void
usage(void)
{
	const char *cmd = "pkt-gen";
	fprintf(stderr,
		"Usage:\n"
		"%s arguments\n"
		"\t-i interface		interface name\n"
		"\t-f function		tx rx ping pong\n"
		"\t-n count		number of iterations (can be 0)\n"
		"\t-F frags		number of fragments [1..63]\n"
		"\t-l pkt_size		in bytes excluding CRC\n"
		"\t-d dst_ip[:port[-dst_ip:port]]   single or range\n"
		"\t-s src_ip[:port[-src_ip:port]]   single or range\n"
		"\t-D dst-mac\n"
		"\t-S src-mac\n"
		"\t-a cpu_id		use setaffinity\n"
		"\t-b burst size		testing, mostly\n"
		"\t-c cores		cores to use\n"
		"\t-p threads		processes/threads to use\n"
		"\t-T report_ms		milliseconds between reports\n"
		"\t-w wait_for_link_time	in seconds\n"
		"\t-W			stop receiving after 1s without traffic\n"
		"\t-R rate		in packets per second\n"
		"\t-X			dump payload\n"
		"\t-I			use indirect buffers\n"
		"\t-o options		data generation options\n"
		"\t-C config		netmap request configuration string\n"
		"\t-e extra_bufs		extra buffers to request\n"
		"\t-H len		add empty virtio-net-header with size 'len'\n"
		"\t-v			increase verbosity\n"
		"",
		cmd);

	exit(0);
}
1316
1317static void
1318start_threads(struct glob_arg *g)
1319{
1320	int i;
1321
1322	targs = calloc(g->nthreads, sizeof(*targs));
1323	/*
1324	 * Now create the desired number of threads, each one
1325	 * using a single descriptor.
1326 	 */
1327	for (i = 0; i < g->nthreads; i++) {
1328		struct targ *t = &targs[i];
1329
1330		bzero(t, sizeof(*t));
1331		t->fd = -1; /* default, with pcap */
1332		t->g = g;
1333
1334	    if (g->dev_type == DEV_NETMAP) {
1335		struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */
1336
1337		if (g->nthreads > 1) {
1338			if (nmd.req.nr_flags != NR_REG_ALL_NIC) {
1339				D("invalid nthreads mode %d", nmd.req.nr_flags);
1340				continue;
1341			}
1342			nmd.req.nr_flags = NR_REG_ONE_NIC;
1343			nmd.req.nr_ringid = i;
1344		}
1345		/* Only touch one of the rings (rx is already ok) */
1346		if (g->td_body == receiver_body)
1347			nmd.req.nr_ringid |= NETMAP_NO_TX_POLL;
1348
1349		/* register interface. Override ifname and ringid etc. */
1350
1351		t->nmd = nm_open(t->g->ifname, NULL, g->nmd_flags |
1352			NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, g->nmd);
1353		if (t->nmd == NULL) {
1354			D("Unable to open %s: %s",
1355				t->g->ifname, strerror(errno));
1356			continue;
1357		}
1358		t->fd = t->nmd->fd;
1359
1360	    } else {
1361		targs[i].fd = g->main_fd;
1362	    }
1363		t->used = 1;
1364		t->me = i;
1365		if (g->affinity >= 0) {
1366			if (g->affinity < g->cpus)
1367				t->affinity = g->affinity;
1368			else
1369				t->affinity = i % g->cpus;
1370		} else {
1371			t->affinity = -1;
1372		}
1373		/* default, init packets */
1374		initialize_packet(t);
1375
1376		if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) {
1377			D("Unable to create thread %d: %s", i, strerror(errno));
1378			t->used = 0;
1379		}
1380	}
1381}
1382
1383static void
1384main_thread(struct glob_arg *g)
1385{
1386	int i;
1387
1388	uint64_t prev = 0;
1389	uint64_t count = 0;
1390	double delta_t;
1391	struct timeval tic, toc;
1392
1393	gettimeofday(&toc, NULL);
1394	for (;;) {
1395		struct timeval now, delta;
1396		uint64_t pps, usec, my_count, npkts;
1397		int done = 0;
1398
1399		delta.tv_sec = g->report_interval/1000;
1400		delta.tv_usec = (g->report_interval%1000)*1000;
1401		select(0, NULL, NULL, NULL, &delta);
1402		gettimeofday(&now, NULL);
1403		timersub(&now, &toc, &toc);
1404		my_count = 0;
1405		for (i = 0; i < g->nthreads; i++) {
1406			my_count += targs[i].count;
1407			if (targs[i].used == 0)
1408				done++;
1409		}
1410		usec = toc.tv_sec* 1000000 + toc.tv_usec;
1411		if (usec < 10000)
1412			continue;
1413		npkts = my_count - prev;
1414		pps = (npkts*1000000 + usec/2) / usec;
1415		D("%llu pps (%llu pkts in %llu usec)",
1416			(unsigned long long)pps,
1417			(unsigned long long)npkts,
1418			(unsigned long long)usec);
1419		prev = my_count;
1420		toc = now;
1421		if (done == g->nthreads)
1422			break;
1423	}
1424
1425	timerclear(&tic);
1426	timerclear(&toc);
1427	for (i = 0; i < g->nthreads; i++) {
1428		struct timespec t_tic, t_toc;
1429		/*
1430		 * Join active threads, unregister interfaces and close
1431		 * file descriptors.
1432		 */
1433		if (targs[i].used)
1434			pthread_join(targs[i].thread, NULL);
1435		close(targs[i].fd);
1436
1437		if (targs[i].completed == 0)
1438			D("ouch, thread %d exited with error", i);
1439
1440		/*
1441		 * Collect threads output and extract information about
1442		 * how long it took to send all the packets.
1443		 */
1444		count += targs[i].count;
1445		t_tic = timeval2spec(&tic);
1446		t_toc = timeval2spec(&toc);
1447		if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic))
1448			tic = timespec2val(&targs[i].tic);
1449		if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc))
1450			toc = timespec2val(&targs[i].toc);
1451	}
1452
1453	/* print output. */
1454	timersub(&toc, &tic, &toc);
1455	delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
1456	if (g->td_body == sender_body)
1457		tx_output(count, g->pkt_size, delta_t);
1458	else
1459		rx_output(count, delta_t);
1460
1461	if (g->dev_type == DEV_NETMAP) {
1462		munmap(g->nmd->mem, g->nmd->req.nr_memsize);
1463		close(g->main_fd);
1464	}
1465}
1466
1467
/* Associates a function name usable with -f with the routine to run. */
struct sf {
	char *key;	/* name compared against the -f argument */
	void *f;	/* thread body, assigned to g.td_body on a match */
};
1472
/*
 * Table of the functions selectable with -f; main() scans it in order
 * and stops at the entry whose key is NULL.
 */
static struct sf func[] = {
	{ "tx",	sender_body },
	{ "rx",	receiver_body },
	{ "ping",	pinger_body },
	{ "pong",	ponger_body },
	{ NULL, NULL }	/* end of table */
};
1480
1481static int
1482tap_alloc(char *dev)
1483{
1484	struct ifreq ifr;
1485	int fd, err;
1486	char *clonedev = TAP_CLONEDEV;
1487
1488	(void)err;
1489	(void)dev;
1490	/* Arguments taken by the function:
1491	 *
1492	 * char *dev: the name of an interface (or '\0'). MUST have enough
1493	 *   space to hold the interface name if '\0' is passed
1494	 * int flags: interface flags (eg, IFF_TUN etc.)
1495	 */
1496
1497#ifdef __FreeBSD__
1498	if (dev[3]) { /* tapSomething */
1499		static char buf[128];
1500		snprintf(buf, sizeof(buf), "/dev/%s", dev);
1501		clonedev = buf;
1502	}
1503#endif
1504	/* open the device */
1505	if( (fd = open(clonedev, O_RDWR)) < 0 ) {
1506		return fd;
1507	}
1508	D("%s open successful", clonedev);
1509
1510	/* preparation of the struct ifr, of type "struct ifreq" */
1511	memset(&ifr, 0, sizeof(ifr));
1512
1513#ifdef linux
1514	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
1515
1516	if (*dev) {
1517		/* if a device name was specified, put it in the structure; otherwise,
1518		* the kernel will try to allocate the "next" device of the
1519		* specified type */
1520		strncpy(ifr.ifr_name, dev, IFNAMSIZ);
1521	}
1522
1523	/* try to create the device */
1524	if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) {
1525		D("failed to to a TUNSETIFF: %s", strerror(errno));
1526		close(fd);
1527		return err;
1528	}
1529
1530	/* if the operation was successful, write back the name of the
1531	* interface to the variable "dev", so the caller can know
1532	* it. Note that the caller MUST reserve space in *dev (see calling
1533	* code below) */
1534	strcpy(dev, ifr.ifr_name);
1535	D("new name is %s", dev);
1536#endif /* linux */
1537
1538        /* this is the special file descriptor that the caller will use to talk
1539         * with the virtual interface */
1540        return fd;
1541}
1542
1543int
1544main(int arc, char **argv)
1545{
1546	int i;
1547
1548	struct glob_arg g;
1549
1550	int ch;
1551	int wait_link = 2;
1552	int devqueues = 1;	/* how many device queues */
1553
1554	bzero(&g, sizeof(g));
1555
1556	g.main_fd = -1;
1557	g.td_body = receiver_body;
1558	g.report_interval = 1000;	/* report interval */
1559	g.affinity = -1;
1560	/* ip addresses can also be a range x.x.x.x-x.x.x.y */
1561	g.src_ip.name = "10.0.0.1";
1562	g.dst_ip.name = "10.1.0.1";
1563	g.dst_mac.name = "ff:ff:ff:ff:ff:ff";
1564	g.src_mac.name = NULL;
1565	g.pkt_size = 60;
1566	g.burst = 512;		// default
1567	g.nthreads = 1;
1568	g.cpus = 1;
1569	g.forever = 1;
1570	g.tx_rate = 0;
1571	g.frags = 1;
1572	g.nmr_config = "";
1573	g.virt_header = 0;
1574
1575	while ( (ch = getopt(arc, argv,
1576			"a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:")) != -1) {
1577		struct sf *fn;
1578
1579		switch(ch) {
1580		default:
1581			D("bad option %c %s", ch, optarg);
1582			usage();
1583			break;
1584
1585		case 'n':
1586			g.npackets = atoi(optarg);
1587			break;
1588
1589		case 'F':
1590			i = atoi(optarg);
1591			if (i < 1 || i > 63) {
1592				D("invalid frags %d [1..63], ignore", i);
1593				break;
1594			}
1595			g.frags = i;
1596			break;
1597
1598		case 'f':
1599			for (fn = func; fn->key; fn++) {
1600				if (!strcmp(fn->key, optarg))
1601					break;
1602			}
1603			if (fn->key)
1604				g.td_body = fn->f;
1605			else
1606				D("unrecognised function %s", optarg);
1607			break;
1608
1609		case 'o':	/* data generation options */
1610			g.options = atoi(optarg);
1611			break;
1612
1613		case 'a':       /* force affinity */
1614			g.affinity = atoi(optarg);
1615			break;
1616
1617		case 'i':	/* interface */
1618			/* a prefix of tap: netmap: or pcap: forces the mode.
1619			 * otherwise we guess
1620			 */
1621			D("interface is %s", optarg);
1622			if (strlen(optarg) > MAX_IFNAMELEN - 8) {
1623				D("ifname too long %s", optarg);
1624				break;
1625			}
1626			strcpy(g.ifname, optarg);
1627			if (!strcmp(optarg, "null")) {
1628				g.dev_type = DEV_NETMAP;
1629				g.dummy_send = 1;
1630			} else if (!strncmp(optarg, "tap:", 4)) {
1631				g.dev_type = DEV_TAP;
1632				strcpy(g.ifname, optarg + 4);
1633			} else if (!strncmp(optarg, "pcap:", 5)) {
1634				g.dev_type = DEV_PCAP;
1635				strcpy(g.ifname, optarg + 5);
1636			} else if (!strncmp(optarg, "netmap:", 7) ||
1637				   !strncmp(optarg, "vale", 4)) {
1638				g.dev_type = DEV_NETMAP;
1639			} else if (!strncmp(optarg, "tap", 3)) {
1640				g.dev_type = DEV_TAP;
1641			} else { /* prepend netmap: */
1642				g.dev_type = DEV_NETMAP;
1643				sprintf(g.ifname, "netmap:%s", optarg);
1644			}
1645			break;
1646
1647		case 'I':
1648			g.options |= OPT_INDIRECT;	/* XXX use indirect buffer */
1649			break;
1650
1651		case 'l':	/* pkt_size */
1652			g.pkt_size = atoi(optarg);
1653			break;
1654
1655		case 'd':
1656			g.dst_ip.name = optarg;
1657			break;
1658
1659		case 's':
1660			g.src_ip.name = optarg;
1661			break;
1662
1663		case 'T':	/* report interval */
1664			g.report_interval = atoi(optarg);
1665			break;
1666
1667		case 'w':
1668			wait_link = atoi(optarg);
1669			break;
1670
1671		case 'W': /* XXX changed default */
1672			g.forever = 0; /* do not exit rx even with no traffic */
1673			break;
1674
1675		case 'b':	/* burst */
1676			g.burst = atoi(optarg);
1677			break;
1678		case 'c':
1679			g.cpus = atoi(optarg);
1680			break;
1681		case 'p':
1682			g.nthreads = atoi(optarg);
1683			break;
1684
1685		case 'D': /* destination mac */
1686			g.dst_mac.name = optarg;
1687			break;
1688
1689		case 'S': /* source mac */
1690			g.src_mac.name = optarg;
1691			break;
1692		case 'v':
1693			verbose++;
1694			break;
1695		case 'R':
1696			g.tx_rate = atoi(optarg);
1697			break;
1698		case 'X':
1699			g.options |= OPT_DUMP;
1700			break;
1701		case 'C':
1702			g.nmr_config = strdup(optarg);
1703			break;
1704		case 'H':
1705			g.virt_header = atoi(optarg);
1706			break;
1707		case 'e': /* extra bufs */
1708			g.extra_bufs = atoi(optarg);
1709			break;
1710		}
1711	}
1712
1713	if (g.ifname == NULL) {
1714		D("missing ifname");
1715		usage();
1716	}
1717
1718	i = system_ncpus();
1719	if (g.cpus < 0 || g.cpus > i) {
1720		D("%d cpus is too high, have only %d cpus", g.cpus, i);
1721		usage();
1722	}
1723	if (g.cpus == 0)
1724		g.cpus = i;
1725
1726	if (g.pkt_size < 16 || g.pkt_size > 1536) {
1727		D("bad pktsize %d\n", g.pkt_size);
1728		usage();
1729	}
1730
1731	if (g.src_mac.name == NULL) {
1732		static char mybuf[20] = "00:00:00:00:00:00";
1733		/* retrieve source mac address. */
1734		if (source_hwaddr(g.ifname, mybuf) == -1) {
1735			D("Unable to retrieve source mac");
1736			// continue, fail later
1737		}
1738		g.src_mac.name = mybuf;
1739	}
1740	/* extract address ranges */
1741	extract_ip_range(&g.src_ip);
1742	extract_ip_range(&g.dst_ip);
1743	extract_mac_range(&g.src_mac);
1744	extract_mac_range(&g.dst_mac);
1745
1746	if (g.src_ip.start != g.src_ip.end ||
1747	    g.src_ip.port0 != g.src_ip.port1 ||
1748	    g.dst_ip.start != g.dst_ip.end ||
1749	    g.dst_ip.port0 != g.dst_ip.port1)
1750		g.options |= OPT_COPY;
1751
1752	if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1
1753			&& g.virt_header != VIRT_HDR_2) {
1754		D("bad virtio-net-header length");
1755		usage();
1756	}
1757
1758    if (g.dev_type == DEV_TAP) {
1759	D("want to use tap %s", g.ifname);
1760	g.main_fd = tap_alloc(g.ifname);
1761	if (g.main_fd < 0) {
1762		D("cannot open tap %s", g.ifname);
1763		usage();
1764	}
1765#ifndef NO_PCAP
1766    } else if (g.dev_type == DEV_PCAP) {
1767	char pcap_errbuf[PCAP_ERRBUF_SIZE];
1768
1769	D("using pcap on %s", g.ifname);
1770	pcap_errbuf[0] = '\0'; // init the buffer
1771	g.p = pcap_open_live(g.ifname, 0, 1, 100, pcap_errbuf);
1772	if (g.p == NULL) {
1773		D("cannot open pcap on %s", g.ifname);
1774		usage();
1775	}
1776#endif /* !NO_PCAP */
1777    } else if (g.dummy_send) { /* but DEV_NETMAP */
1778	D("using a dummy send routine");
1779    } else {
1780	struct nm_desc base_nmd;
1781
1782	bzero(&base_nmd, sizeof(base_nmd));
1783
1784	g.nmd_flags = 0;
1785	g.nmd_flags |= parse_nmr_config(g.nmr_config, &base_nmd.req);
1786	if (g.extra_bufs) {
1787		base_nmd.req.nr_arg3 = g.extra_bufs;
1788		g.nmd_flags |= NM_OPEN_ARG3;
1789	}
1790
1791	/*
1792	 * Open the netmap device using nm_open().
1793	 *
1794	 * protocol stack and may cause a reset of the card,
1795	 * which in turn may take some time for the PHY to
1796	 * reconfigure. We do the open here to have time to reset.
1797	 */
1798	g.nmd = nm_open(g.ifname, NULL, g.nmd_flags, &base_nmd);
1799	if (g.nmd == NULL) {
1800		D("Unable to open %s: %s", g.ifname, strerror(errno));
1801		goto out;
1802	}
1803	g.main_fd = g.nmd->fd;
1804	D("mapped %dKB at %p", g.nmd->req.nr_memsize>>10, g.nmd->mem);
1805
1806	devqueues = g.nmd->req.nr_rx_rings;
1807
1808	/* validate provided nthreads. */
1809	if (g.nthreads < 1 || g.nthreads > devqueues) {
1810		D("bad nthreads %d, have %d queues", g.nthreads, devqueues);
1811		// continue, fail later
1812	}
1813
1814	if (verbose) {
1815		struct netmap_if *nifp = g.nmd->nifp;
1816		struct nmreq *req = &g.nmd->req;
1817
1818		D("nifp at offset %d, %d tx %d rx region %d",
1819		    req->nr_offset, req->nr_tx_rings, req->nr_rx_rings,
1820		    req->nr_arg2);
1821		for (i = 0; i <= req->nr_tx_rings; i++) {
1822			D("   TX%d at 0x%lx", i,
1823			    (char *)NETMAP_TXRING(nifp, i) - (char *)nifp);
1824		}
1825		for (i = 0; i <= req->nr_rx_rings; i++) {
1826			D("   RX%d at 0x%lx", i,
1827			    (char *)NETMAP_RXRING(nifp, i) - (char *)nifp);
1828		}
1829	}
1830
1831	/* Print some debug information. */
1832	fprintf(stdout,
1833		"%s %s: %d queues, %d threads and %d cpus.\n",
1834		(g.td_body == sender_body) ? "Sending on" : "Receiving from",
1835		g.ifname,
1836		devqueues,
1837		g.nthreads,
1838		g.cpus);
1839	if (g.td_body == sender_body) {
1840		fprintf(stdout, "%s -> %s (%s -> %s)\n",
1841			g.src_ip.name, g.dst_ip.name,
1842			g.src_mac.name, g.dst_mac.name);
1843	}
1844
1845out:
1846	/* Exit if something went wrong. */
1847	if (g.main_fd < 0) {
1848		D("aborting");
1849		usage();
1850	}
1851    }
1852
1853
1854	if (g.options) {
1855		D("--- SPECIAL OPTIONS:%s%s%s%s%s\n",
1856			g.options & OPT_PREFETCH ? " prefetch" : "",
1857			g.options & OPT_ACCESS ? " access" : "",
1858			g.options & OPT_MEMCPY ? " memcpy" : "",
1859			g.options & OPT_INDIRECT ? " indirect" : "",
1860			g.options & OPT_COPY ? " copy" : "");
1861	}
1862
1863	g.tx_period.tv_sec = g.tx_period.tv_nsec = 0;
1864	if (g.tx_rate > 0) {
1865		/* try to have at least something every second,
1866		 * reducing the burst size to some 0.01s worth of data
1867		 * (but no less than one full set of fragments)
1868	 	 */
1869		uint64_t x;
1870		int lim = (g.tx_rate)/300;
1871		if (g.burst > lim)
1872			g.burst = lim;
1873		if (g.burst < g.frags)
1874			g.burst = g.frags;
1875		x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate;
1876		g.tx_period.tv_nsec = x;
1877		g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000;
1878		g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000;
1879	}
1880	if (g.td_body == sender_body)
1881	    D("Sending %d packets every  %ld.%09ld s",
1882			g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec);
1883	/* Wait for PHY reset. */
1884	D("Wait %d secs for phy reset", wait_link);
1885	sleep(wait_link);
1886	D("Ready...");
1887
1888	/* Install ^C handler. */
1889	global_nthreads = g.nthreads;
1890	signal(SIGINT, sigint_h);
1891
1892	start_threads(&g);
1893	main_thread(&g);
1894	return 0;
1895}
1896
1897/* end of file */
1898