pkt-gen.c revision 272962
1/*
2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *   1. Redistributions of source code must retain the above copyright
9 *      notice, this list of conditions and the following disclaimer.
10 *   2. Redistributions in binary form must reproduce the above copyright
11 *      notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27/*
28 * $FreeBSD: head/tools/tools/netmap/pkt-gen.c 272962 2014-10-11 21:43:05Z gnn $
29 * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $
30 *
31 * Example program to show how to build a multithreaded packet
32 * source/sink using the netmap device.
33 *
34 * In this example we create a programmable number of threads
35 * to take care of all the queues of the interface used to
36 * send or receive traffic.
37 *
38 */
39
40// #define TRASH_VHOST_HDR
41
42#define _GNU_SOURCE	/* for CPU_SET() */
43#include <stdio.h>
44#define NETMAP_WITH_LIBS
45#include <net/netmap_user.h>
46
47
48#include <ctype.h>	// isprint()
49#include <unistd.h>	// sysconf()
50#include <sys/poll.h>
51#include <arpa/inet.h>	/* ntohs */
52#include <sys/sysctl.h>	/* sysctl */
53#include <ifaddrs.h>	/* getifaddrs */
54#include <net/ethernet.h>
55#include <netinet/in.h>
56#include <netinet/ip.h>
57#include <netinet/udp.h>
58
59#include <pthread.h>
60
61#ifndef NO_PCAP
62#include <pcap/pcap.h>
63#endif
64
65#ifdef linux
66
67#define cpuset_t        cpu_set_t
68
69#define ifr_flagshigh  ifr_flags        /* only the low 16 bits here */
70#define IFF_PPROMISC   IFF_PROMISC      /* IFF_PPROMISC does not exist */
71#include <linux/ethtool.h>
72#include <linux/sockios.h>
73
74#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
75#include <netinet/ether.h>      /* ether_aton */
76#include <linux/if_packet.h>    /* sockaddr_ll */
77#endif  /* linux */
78
79#ifdef __FreeBSD__
80#include <sys/endian.h> /* le64toh */
81#include <machine/param.h>
82
83#include <pthread_np.h> /* pthread w/ affinity */
84#include <sys/cpuset.h> /* cpu_set */
85#include <net/if_dl.h>  /* LLADDR */
86#endif  /* __FreeBSD__ */
87
88#ifdef __APPLE__
89
/*
 * Minimal cpuset emulation for this platform section:
 * a cpuset is a plain 64-bit mask with one bit per CPU.
 */
#define cpuset_t        uint64_t        // XXX

/* Remove every CPU from the set. */
static inline void CPU_ZERO(cpuset_t *p)
{
        *p = 0;
}

/*
 * Add CPU i to the set; indexes wrap modulo 64.
 * The constant being shifted must be as wide as the mask: shifting a
 * plain int by 32 or more positions is undefined behavior.
 */
static inline void CPU_SET(uint32_t i, cpuset_t *p)
{
        *p |= (cpuset_t)1 << (i & 0x3f);
}
100
101#define pthread_setaffinity_np(a, b, c) ((void)a, 0)
102
103#define ifr_flagshigh  ifr_flags        // XXX
104#define IFF_PPROMISC   IFF_PROMISC
105#include <net/if_dl.h>  /* LLADDR */
106#define clock_gettime(a,b)      \
107        do {struct timespec t0 = {0,0}; *(b) = t0; } while (0)
108#endif  /* __APPLE__ */
109
110const char *default_payload="netmap pkt-gen DIRECT payload\n"
111	"http://info.iet.unipi.it/~luigi/netmap/ ";
112
113const char *indirect_payload="netmap pkt-gen indirect payload\n"
114	"http://info.iet.unipi.it/~luigi/netmap/ ";
115
116int verbose = 0;
117
118#define SKIP_PAYLOAD 1 /* do not check payload. XXX unused */
119
120
121#define VIRT_HDR_1	10	/* length of a base vnet-hdr */
#define VIRT_HDR_2	12	/* length of the extended vnet-hdr */
123#define VIRT_HDR_MAX	VIRT_HDR_2
/*
 * Raw storage for the vnet header that may precede the Ethernet frame.
 * It is sized for the longest variant (VIRT_HDR_MAX bytes) and treated
 * as opaque bytes.
 */
struct virt_header {
	uint8_t fields[VIRT_HDR_MAX];
};
127
128#define MAX_BODYSIZE	16384
129
/*
 * Template of the frame built by this program: vnet header, Ethernet
 * header, IPv4 header, UDP header and payload, in wire order.
 * The struct is packed, so the members follow each other with no padding.
 */
struct pkt {
	struct virt_header vh;	/* precedes the Ethernet header */
	struct ether_header eh;
	struct ip ip;
	struct udphdr udp;
	uint8_t body[MAX_BODYSIZE];	// XXX hardwired
} __attribute__((__packed__));
137
/*
 * A range of IPv4 addresses and UDP ports.
 * name holds the textual form and is split in place by extract_ip_range(),
 * which also fills the numeric fields (all in host byte order, with
 * start <= end and port0 <= port1).
 */
struct ip_range {
	char *name;
	uint32_t start, end; /* same as struct in_addr */
	uint16_t port0, port1;
};
143
/*
 * A range of Ethernet addresses.
 * name is the textual address; extract_mac_range() parses it and stores
 * the same value in both start and end.
 */
struct mac_range {
	char *name;
	struct ether_addr start, end;
};
148
149/* ifname can be netmap:foo-xxxx */
150#define MAX_IFNAMELEN	64	/* our buffer for ifname */
151//#define MAX_PKTSIZE	1536
152#define MAX_PKTSIZE	MAX_BODYSIZE	/* XXX: + IP_HDR + ETH_HDR */
153
/*
 * Compact timestamp to fit into 60 byte packet (enough to obtain RTT).
 * Both fields are truncated copies of the tv_sec/tv_nsec members of a
 * struct timespec.
 */
struct tstamp {
	uint32_t sec;
	uint32_t nsec;
};
159
/*
 * global arguments for all threads
 */

struct glob_arg {
	struct ip_range src_ip;		/* source addresses and ports */
	struct ip_range dst_ip;		/* destination addresses and ports */
	struct mac_range dst_mac;
	struct mac_range src_mac;
	int pkt_size;	/* frame length, Ethernet header included, vnet header excluded */
	int burst;	/* max packets queued per send_packets() call */
	int forever;
	int npackets;	/* total packets to send */
	int frags;	/* fragments per packet */
	int nthreads;
	int cpus;
	int options;	/* testing */
#define OPT_PREFETCH	1
#define OPT_ACCESS	2
#define OPT_COPY	4
#define OPT_MEMCPY	8
#define OPT_TS		16	/* add a timestamp */
#define OPT_INDIRECT	32	/* use indirect buffers, tx only */
#define OPT_DUMP	64	/* dump rx/tx traffic */
#define OPT_MONITOR_TX  128
#define OPT_MONITOR_RX  256
	int dev_type;	/* one of enum dev_type */
#ifndef NO_PCAP
	pcap_t *p;	/* handle used by the sender when dev_type is DEV_PCAP */
#endif

	int tx_rate;	/* nonzero enables rate limiting in the sender */
	struct timespec tx_period;	/* time between bursts when rate limited */

	int affinity;
	int main_fd;	/* the sender writes frames here when dev_type is DEV_TAP */
	struct nm_desc *nmd;
	int report_interval;		/* milliseconds between prints */
	void *(*td_body)(void *);
	void *mmap_addr;
	char ifname[MAX_IFNAMELEN];
	char *nmr_config;
	int dummy_send;
	int virt_header;	/* send also the virt_header */
	int extra_bufs;		/* goes in nr_arg3 */
	char *packet_file;	/* -P option */
};
207enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP };
208
209
210/*
211 * Arguments for a new thread. The same structure is used by
212 * the source and the sink
213 */
214struct targ {
215	struct glob_arg *g;
216	int used;
217	int completed;
218	int cancel;
219	int fd;
220	struct nm_desc *nmd;
221	volatile uint64_t count;
222	struct timespec tic, toc;
223	int me;
224	pthread_t thread;
225	int affinity;
226
227	struct pkt pkt;
228	void *frame;
229};
230
231
232/*
233 * extract the extremes from a range of ipv4 addresses.
234 * addr_lo[-addr_hi][:port_lo[-port_hi]]
235 */
236static void
237extract_ip_range(struct ip_range *r)
238{
239	char *ap, *pp;
240	struct in_addr a;
241
242	if (verbose)
243		D("extract IP range from %s", r->name);
244	r->port0 = r->port1 = 0;
245	r->start = r->end = 0;
246
247	/* the first - splits start/end of range */
248	ap = index(r->name, '-');	/* do we have ports ? */
249	if (ap) {
250		*ap++ = '\0';
251	}
252	/* grab the initial values (mandatory) */
253	pp = index(r->name, ':');
254	if (pp) {
255		*pp++ = '\0';
256		r->port0 = r->port1 = strtol(pp, NULL, 0);
257	};
258	inet_aton(r->name, &a);
259	r->start = r->end = ntohl(a.s_addr);
260	if (ap) {
261		pp = index(ap, ':');
262		if (pp) {
263			*pp++ = '\0';
264			if (*pp)
265				r->port1 = strtol(pp, NULL, 0);
266		}
267		if (*ap) {
268			inet_aton(ap, &a);
269			r->end = ntohl(a.s_addr);
270		}
271	}
272	if (r->port0 > r->port1) {
273		uint16_t tmp = r->port0;
274		r->port0 = r->port1;
275		r->port1 = tmp;
276	}
277	if (r->start > r->end) {
278		uint32_t tmp = r->start;
279		r->start = r->end;
280		r->end = tmp;
281	}
282	{
283		struct in_addr a;
284		char buf1[16]; // one ip address
285
286		a.s_addr = htonl(r->end);
287		strncpy(buf1, inet_ntoa(a), sizeof(buf1));
288		a.s_addr = htonl(r->start);
289		if (1)
290		    D("range is %s:%d to %s:%d",
291			inet_ntoa(a), r->port0, buf1, r->port1);
292	}
293}
294
295static void
296extract_mac_range(struct mac_range *r)
297{
298	if (verbose)
299	    D("extract MAC range from %s", r->name);
300	bcopy(ether_aton(r->name), &r->start, 6);
301	bcopy(ether_aton(r->name), &r->end, 6);
302#if 0
303	bcopy(targ->src_mac, eh->ether_shost, 6);
304	p = index(targ->g->src_mac, '-');
305	if (p)
306		targ->src_mac_range = atoi(p+1);
307
308	bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6);
309	bcopy(targ->dst_mac, eh->ether_dhost, 6);
310	p = index(targ->g->dst_mac, '-');
311	if (p)
312		targ->dst_mac_range = atoi(p+1);
313#endif
314	if (verbose)
315		D("%s starts at %s", r->name, ether_ntoa(&r->start));
316}
317
318static struct targ *targs;
319static int global_nthreads;
320
321/* control-C handler */
322static void
323sigint_h(int sig)
324{
325	int i;
326
327	(void)sig;	/* UNUSED */
328	D("received control-C on thread %p", pthread_self());
329	for (i = 0; i < global_nthreads; i++) {
330		targs[i].cancel = 1;
331	}
332	signal(SIGINT, SIG_DFL);
333}
334
/*
 * Return the number of CPUs reported by the system: the hw.ncpu sysctl
 * on FreeBSD, the number of online processors on Linux.
 * Returns 1 on other systems or when the query fails, so the result is
 * always at least 1.
 */
static int
system_ncpus(void)
{
	int ncpus = 1;
#if defined (__FreeBSD__)
	int mib[2] = { CTL_HW, HW_NCPU };
	size_t len = sizeof(ncpus);	/* size of the output buffer, not of mib */

	if (sysctl(mib, 2, &ncpus, &len, NULL, 0) != 0)
		ncpus = 1;
#elif defined(linux) || defined(__linux__)
	/* "linux" is not predefined in strict ISO C mode, __linux__ always is */
	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
#endif /* others: keep the default */
	if (ncpus < 1)		/* e.g. sysconf() returned -1 */
		ncpus = 1;
	return (ncpus);
}
351
#ifdef __linux__
/* Map the BSD link-level address names used by this file onto Linux's. */
#define sockaddr_dl    sockaddr_ll
#define sdl_family     sll_family
#define AF_LINK        AF_PACKET
/*
 * Fully parenthesized and without a trailing ';' so that it can be used
 * inside any expression and with any argument.
 */
#define LLADDR(s)      ((s)->sll_addr)
#include <linux/if_tun.h>
#define TAP_CLONEDEV	"/dev/net/tun"
#endif /* __linux__ */
360
361#ifdef __FreeBSD__
362#include <net/if_tun.h>
363#define TAP_CLONEDEV	"/dev/tap"
364#endif /* __FreeBSD */
365
366#ifdef __APPLE__
367// #warning TAP not supported on apple ?
368#include <net/if_utun.h>
369#define TAP_CLONEDEV	"/dev/tap"
370#endif /* __APPLE__ */
371
372
373/*
374 * parse the vale configuration in conf and put it in nmr.
375 * Return the flag set if necessary.
376 * The configuration may consist of 0 to 4 numbers separated
377 * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings.
378 * Missing numbers or zeroes stand for default values.
379 * As an additional convenience, if exactly one number
380 * is specified, then this is assigned to both #tx-slots and #rx-slots.
381 * If there is no 4th number, then the 3rd is assigned to both #tx-rings
382 * and #rx-rings.
383 */
384int
385parse_nmr_config(const char* conf, struct nmreq *nmr)
386{
387	char *w, *tok;
388	int i, v;
389
390	nmr->nr_tx_rings = nmr->nr_rx_rings = 0;
391	nmr->nr_tx_slots = nmr->nr_rx_slots = 0;
392	if (conf == NULL || ! *conf)
393		return 0;
394	w = strdup(conf);
395	for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) {
396		v = atoi(tok);
397		switch (i) {
398		case 0:
399			nmr->nr_tx_slots = nmr->nr_rx_slots = v;
400			break;
401		case 1:
402			nmr->nr_rx_slots = v;
403			break;
404		case 2:
405			nmr->nr_tx_rings = nmr->nr_rx_rings = v;
406			break;
407		case 3:
408			nmr->nr_rx_rings = v;
409			break;
410		default:
411			D("ignored config: %s", tok);
412			break;
413		}
414	}
415	D("txr %d txd %d rxr %d rxd %d",
416			nmr->nr_tx_rings, nmr->nr_tx_slots,
417			nmr->nr_rx_rings, nmr->nr_rx_slots);
418	free(w);
419	return (nmr->nr_tx_rings || nmr->nr_tx_slots ||
420                        nmr->nr_rx_rings || nmr->nr_rx_slots) ?
421		NM_OPEN_RING_CFG : 0;
422}
423
424
425/*
426 * locate the src mac address for our interface, put it
427 * into the user-supplied buffer. return 0 if ok, -1 on error.
428 */
429static int
430source_hwaddr(const char *ifname, char *buf)
431{
432	struct ifaddrs *ifaphead, *ifap;
433	int l = sizeof(ifap->ifa_name);
434
435	if (getifaddrs(&ifaphead) != 0) {
436		D("getifaddrs %s failed", ifname);
437		return (-1);
438	}
439
440	for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) {
441		struct sockaddr_dl *sdl =
442			(struct sockaddr_dl *)ifap->ifa_addr;
443		uint8_t *mac;
444
445		if (!sdl || sdl->sdl_family != AF_LINK)
446			continue;
447		if (strncmp(ifap->ifa_name, ifname, l) != 0)
448			continue;
449		mac = (uint8_t *)LLADDR(sdl);
450		sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
451			mac[0], mac[1], mac[2],
452			mac[3], mac[4], mac[5]);
453		if (verbose)
454			D("source hwaddr %s", buf);
455		break;
456	}
457	freeifaddrs(ifaphead);
458	return ifap ? 0 : 1;
459}
460
461
462/* set the thread affinity. */
463static int
464setaffinity(pthread_t me, int i)
465{
466	cpuset_t cpumask;
467
468	if (i == -1)
469		return 0;
470
471	/* Set thread affinity affinity.*/
472	CPU_ZERO(&cpumask);
473	CPU_SET(i, &cpumask);
474
475	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
476		D("Unable to set affinity: %s", strerror(errno));
477		return 1;
478	}
479	return 0;
480}
481
/*
 * Add len bytes starting at data to a 16-bit one's complement sum.
 * The bytes are taken as big-endian 16-bit words; sum is the value
 * accumulated so far (0 to start), which allows chaining several
 * buffers. The result is not complemented: see wrapsum().
 * data may have any alignment.
 */
static uint16_t
checksum(const void *data, uint16_t len, uint32_t sum)
{
	const uint8_t *addr = data;
	uint32_t i;

	/* Checksum all the pairs of bytes first... */
	for (i = 0; i < (len & ~1U); i += 2) {
		/*
		 * Build the word from its two bytes instead of loading it
		 * through a 16-bit pointer, which needs an aligned address.
		 */
		sum += ((uint32_t)addr[i] << 8) | addr[i + 1];
		if (sum > 0xFFFF)
			sum -= 0xFFFF;
	}
	/*
	 * If there's a single byte left over, checksum it, too.
	 * Network byte order is big-endian, so the remaining byte is
	 * the high byte.
	 */
	if (i < len) {
		sum += (uint32_t)addr[i] << 8;
		if (sum > 0xFFFF)
			sum -= 0xFFFF;
	}
	return sum;
}
507
/*
 * Finish a checksum: take the one's complement of the low 16 bits of
 * sum and return it in network byte order.
 */
static u_int16_t
wrapsum(u_int32_t sum)
{
	const u_int32_t complemented = ~sum;

	return htons(complemented & 0xFFFF);
}
514
515/* Check the payload of the packet for errors (use it for debug).
516 * Look for consecutive ascii representations of the size of the packet.
517 */
518static void
519dump_payload(char *p, int len, struct netmap_ring *ring, int cur)
520{
521	char buf[128];
522	int i, j, i0;
523
524	/* get the length in ASCII of the length of the packet. */
525
526	printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n",
527		ring, cur, ring->slot[cur].buf_idx,
528		ring->slot[cur].flags, len);
529	/* hexdump routine */
530	for (i = 0; i < len; ) {
531		memset(buf, sizeof(buf), ' ');
532		sprintf(buf, "%5d: ", i);
533		i0 = i;
534		for (j=0; j < 16 && i < len; i++, j++)
535			sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i]));
536		i = i0;
537		for (j=0; j < 16 && i < len; i++, j++)
538			sprintf(buf+7+j + 48, "%c",
539				isprint(p[i]) ? p[i] : '.');
540		printf("%s\n", buf);
541	}
542}
543
544/*
545 * Fill a packet with some payload.
546 * We create a UDP packet so the payload starts at
547 *	14+20+8 = 42 bytes.
548 */
549#ifdef __linux__
550#define uh_sport source
551#define uh_dport dest
552#define uh_ulen len
553#define uh_sum check
554#endif /* linux */
555
556/*
557 * increment the addressed in the packet,
558 * starting from the least significant field.
559 *	DST_IP DST_PORT SRC_IP SRC_PORT
560 */
561static void
562update_addresses(struct pkt *pkt, struct glob_arg *g)
563{
564	uint32_t a;
565	uint16_t p;
566	struct ip *ip = &pkt->ip;
567	struct udphdr *udp = &pkt->udp;
568
569    do {
570	p = ntohs(udp->uh_sport);
571	if (p < g->src_ip.port1) { /* just inc, no wrap */
572		udp->uh_sport = htons(p + 1);
573		break;
574	}
575	udp->uh_sport = htons(g->src_ip.port0);
576
577	a = ntohl(ip->ip_src.s_addr);
578	if (a < g->src_ip.end) { /* just inc, no wrap */
579		ip->ip_src.s_addr = htonl(a + 1);
580		break;
581	}
582	ip->ip_src.s_addr = htonl(g->src_ip.start);
583
584	udp->uh_sport = htons(g->src_ip.port0);
585	p = ntohs(udp->uh_dport);
586	if (p < g->dst_ip.port1) { /* just inc, no wrap */
587		udp->uh_dport = htons(p + 1);
588		break;
589	}
590	udp->uh_dport = htons(g->dst_ip.port0);
591
592	a = ntohl(ip->ip_dst.s_addr);
593	if (a < g->dst_ip.end) { /* just inc, no wrap */
594		ip->ip_dst.s_addr = htonl(a + 1);
595		break;
596	}
597	ip->ip_dst.s_addr = htonl(g->dst_ip.start);
598    } while (0);
599    // update checksum
600}
601
602/*
603 * initialize one packet and prepare for the next one.
604 * The copy could be done better instead of repeating it each time.
605 */
606static void
607initialize_packet(struct targ *targ)
608{
609	struct pkt *pkt = &targ->pkt;
610	struct ether_header *eh;
611	struct ip *ip;
612	struct udphdr *udp;
613	uint16_t paylen = targ->g->pkt_size - sizeof(*eh) - sizeof(struct ip);
614	const char *payload = targ->g->options & OPT_INDIRECT ?
615		indirect_payload : default_payload;
616	int i, l0 = strlen(payload);
617
618	char errbuf[PCAP_ERRBUF_SIZE];
619	pcap_t *file;
620	struct pcap_pkthdr *header;
621	const unsigned char *packet;
622
623	/* Read a packet from a PCAP file if asked. */
624	if (targ->g->packet_file != NULL) {
625		if ((file = pcap_open_offline(targ->g->packet_file,
626			    errbuf)) == NULL)
627			D("failed to open pcap file %s",
628			    targ->g->packet_file);
629		if (pcap_next_ex(file, &header, &packet) < 0)
630			D("failed to read packet from %s",
631			    targ->g->packet_file);
632		if ((targ->frame = malloc(header->caplen)) == NULL)
633			D("out of memory");
634		bcopy(packet, (unsigned char *)targ->frame, header->caplen);
635		targ->g->pkt_size = header->caplen;
636		pcap_close(file);
637		return;
638	}
639
640	/* create a nice NUL-terminated string */
641	for (i = 0; i < paylen; i += l0) {
642		if (l0 > paylen - i)
643			l0 = paylen - i; // last round
644		bcopy(payload, pkt->body + i, l0);
645	}
646	pkt->body[i-1] = '\0';
647	ip = &pkt->ip;
648
649	/* prepare the headers */
650        ip->ip_v = IPVERSION;
651        ip->ip_hl = 5;
652        ip->ip_id = 0;
653        ip->ip_tos = IPTOS_LOWDELAY;
654	ip->ip_len = ntohs(targ->g->pkt_size - sizeof(*eh));
655        ip->ip_id = 0;
656        ip->ip_off = htons(IP_DF); /* Don't fragment */
657        ip->ip_ttl = IPDEFTTL;
658	ip->ip_p = IPPROTO_UDP;
659	ip->ip_dst.s_addr = htonl(targ->g->dst_ip.start);
660	ip->ip_src.s_addr = htonl(targ->g->src_ip.start);
661	ip->ip_sum = wrapsum(checksum(ip, sizeof(*ip), 0));
662
663
664	udp = &pkt->udp;
665        udp->uh_sport = htons(targ->g->src_ip.port0);
666        udp->uh_dport = htons(targ->g->dst_ip.port0);
667	udp->uh_ulen = htons(paylen);
668	/* Magic: taken from sbin/dhclient/packet.c */
669	udp->uh_sum = wrapsum(checksum(udp, sizeof(*udp),
670                    checksum(pkt->body,
671                        paylen - sizeof(*udp),
672                        checksum(&ip->ip_src, 2 * sizeof(ip->ip_src),
673                            IPPROTO_UDP + (u_int32_t)ntohs(udp->uh_ulen)
674                        )
675                    )
676                ));
677
678	eh = &pkt->eh;
679	bcopy(&targ->g->src_mac.start, eh->ether_shost, 6);
680	bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6);
681	eh->ether_type = htons(ETHERTYPE_IP);
682
683	bzero(&pkt->vh, sizeof(pkt->vh));
684#ifdef TRASH_VHOST_HDR
685	/* set bogus content */
686	pkt->vh.fields[0] = 0xff;
687	pkt->vh.fields[1] = 0xff;
688	pkt->vh.fields[2] = 0xff;
689	pkt->vh.fields[3] = 0xff;
690	pkt->vh.fields[4] = 0xff;
691	pkt->vh.fields[5] = 0xff;
692#endif /* TRASH_VHOST_HDR */
693	// dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0);
694}
695
696static void
697set_vnet_hdr_len(struct targ *t)
698{
699	int err, l = t->g->virt_header;
700	struct nmreq req;
701
702	if (l == 0)
703		return;
704
705	memset(&req, 0, sizeof(req));
706	bcopy(t->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name));
707	req.nr_version = NETMAP_API;
708	req.nr_cmd = NETMAP_BDG_VNET_HDR;
709	req.nr_arg1 = l;
710	err = ioctl(t->fd, NIOCREGIF, &req);
711	if (err) {
712		D("Unable to set vnet header length %d", l);
713	}
714}
715
716
717/*
718 * create and enqueue a batch of packets on a ring.
719 * On the last one set NS_REPORT to tell the driver to generate
720 * an interrupt when done.
721 */
722static int
723send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
724		int size, struct glob_arg *g, u_int count, int options,
725		u_int nfrags)
726{
727	u_int n, sent, cur = ring->cur;
728	u_int fcnt;
729
730	n = nm_ring_space(ring);
731	if (n < count)
732		count = n;
733	if (count < nfrags) {
734		D("truncating packet, no room for frags %d %d",
735				count, nfrags);
736	}
737#if 0
738	if (options & (OPT_COPY | OPT_PREFETCH) ) {
739		for (sent = 0; sent < count; sent++) {
740			struct netmap_slot *slot = &ring->slot[cur];
741			char *p = NETMAP_BUF(ring, slot->buf_idx);
742
743			__builtin_prefetch(p);
744			cur = nm_ring_next(ring, cur);
745		}
746		cur = ring->cur;
747	}
748#endif
749	for (fcnt = nfrags, sent = 0; sent < count; sent++) {
750		struct netmap_slot *slot = &ring->slot[cur];
751		char *p = NETMAP_BUF(ring, slot->buf_idx);
752
753		slot->flags = 0;
754		if (options & OPT_INDIRECT) {
755			slot->flags |= NS_INDIRECT;
756			slot->ptr = (uint64_t)frame;
757		} else if (options & OPT_COPY) {
758			nm_pkt_copy(frame, p, size);
759			if (fcnt == nfrags)
760				update_addresses(pkt, g);
761		} else if (options & OPT_MEMCPY) {
762			memcpy(p, frame, size);
763			if (fcnt == nfrags)
764				update_addresses(pkt, g);
765		} else if (options & OPT_PREFETCH) {
766			__builtin_prefetch(p);
767		}
768		if (options & OPT_DUMP)
769			dump_payload(p, size, ring, cur);
770		slot->len = size;
771		if (--fcnt > 0)
772			slot->flags |= NS_MOREFRAG;
773		else
774			fcnt = nfrags;
775		if (sent == count - 1) {
776			slot->flags &= ~NS_MOREFRAG;
777			slot->flags |= NS_REPORT;
778		}
779		cur = nm_ring_next(ring, cur);
780	}
781	ring->head = ring->cur = cur;
782
783	return (sent);
784}
785
/*
 * Send a packet, and wait for a response.
 * The payload (after UDP header, ofs 42) has a 4-byte sequence
 * followed by a struct tstamp (32-bit seconds and nanoseconds).
 */
791#define	PAY_OFS	42	/* where in the pkt... */
792
793static void *
794pinger_body(void *data)
795{
796	struct targ *targ = (struct targ *) data;
797	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
798	struct netmap_if *nifp = targ->nmd->nifp;
799	int i, rx = 0, n = targ->g->npackets;
800	void *frame;
801	int size;
802	uint32_t sent = 0;
803	struct timespec ts, now, last_print;
804	uint32_t count = 0, min = 1000000000, av = 0;
805
806	frame = &targ->pkt;
807	frame += sizeof(targ->pkt.vh) - targ->g->virt_header;
808	size = targ->g->pkt_size + targ->g->virt_header;
809
810	if (targ->g->nthreads > 1) {
811		D("can only ping with 1 thread");
812		return NULL;
813	}
814
815	clock_gettime(CLOCK_REALTIME_PRECISE, &last_print);
816	now = last_print;
817	while (n == 0 || (int)sent < n) {
818		struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
819		struct netmap_slot *slot;
820		char *p;
821	    for (i = 0; i < 1; i++) { /* XXX why the loop for 1 pkt ? */
822		slot = &ring->slot[ring->cur];
823		slot->len = size;
824		p = NETMAP_BUF(ring, slot->buf_idx);
825
826		if (nm_ring_empty(ring)) {
827			D("-- ouch, cannot send");
828		} else {
829			struct tstamp *tp;
830			nm_pkt_copy(frame, p, size);
831			clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
832			bcopy(&sent, p+42, sizeof(sent));
833			tp = (struct tstamp *)(p+46);
834			tp->sec = (uint32_t)ts.tv_sec;
835			tp->nsec = (uint32_t)ts.tv_nsec;
836			sent++;
837			ring->head = ring->cur = nm_ring_next(ring, ring->cur);
838		}
839	    }
840		/* should use a parameter to decide how often to send */
841		if (poll(&pfd, 1, 3000) <= 0) {
842			D("poll error/timeout on queue %d: %s", targ->me,
843				strerror(errno));
844			continue;
845		}
846		/* see what we got back */
847		for (i = targ->nmd->first_tx_ring;
848			i <= targ->nmd->last_tx_ring; i++) {
849			ring = NETMAP_RXRING(nifp, i);
850			while (!nm_ring_empty(ring)) {
851				uint32_t seq;
852				struct tstamp *tp;
853				slot = &ring->slot[ring->cur];
854				p = NETMAP_BUF(ring, slot->buf_idx);
855
856				clock_gettime(CLOCK_REALTIME_PRECISE, &now);
857				bcopy(p+42, &seq, sizeof(seq));
858				tp = (struct tstamp *)(p+46);
859				ts.tv_sec = (time_t)tp->sec;
860				ts.tv_nsec = (long)tp->nsec;
861				ts.tv_sec = now.tv_sec - ts.tv_sec;
862				ts.tv_nsec = now.tv_nsec - ts.tv_nsec;
863				if (ts.tv_nsec < 0) {
864					ts.tv_nsec += 1000000000;
865					ts.tv_sec--;
866				}
867				if (1) D("seq %d/%d delta %d.%09d", seq, sent,
868					(int)ts.tv_sec, (int)ts.tv_nsec);
869				if (ts.tv_nsec < (int)min)
870					min = ts.tv_nsec;
871				count ++;
872				av += ts.tv_nsec;
873				ring->head = ring->cur = nm_ring_next(ring, ring->cur);
874				rx++;
875			}
876		}
877		//D("tx %d rx %d", sent, rx);
878		//usleep(100000);
879		ts.tv_sec = now.tv_sec - last_print.tv_sec;
880		ts.tv_nsec = now.tv_nsec - last_print.tv_nsec;
881		if (ts.tv_nsec < 0) {
882			ts.tv_nsec += 1000000000;
883			ts.tv_sec--;
884		}
885		if (ts.tv_sec >= 1) {
886			D("count %d min %d av %d",
887				count, min, av/count);
888			count = 0;
889			av = 0;
890			min = 100000000;
891			last_print = now;
892		}
893	}
894	return NULL;
895}
896
897
898/*
899 * reply to ping requests
900 */
901static void *
902ponger_body(void *data)
903{
904	struct targ *targ = (struct targ *) data;
905	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
906	struct netmap_if *nifp = targ->nmd->nifp;
907	struct netmap_ring *txring, *rxring;
908	int i, rx = 0, sent = 0, n = targ->g->npackets;
909
910	if (targ->g->nthreads > 1) {
911		D("can only reply ping with 1 thread");
912		return NULL;
913	}
914	D("understood ponger %d but don't know how to do it", n);
915	while (n == 0 || sent < n) {
916		uint32_t txcur, txavail;
917//#define BUSYWAIT
918#ifdef BUSYWAIT
919		ioctl(pfd.fd, NIOCRXSYNC, NULL);
920#else
921		if (poll(&pfd, 1, 1000) <= 0) {
922			D("poll error/timeout on queue %d: %s", targ->me,
923				strerror(errno));
924			continue;
925		}
926#endif
927		txring = NETMAP_TXRING(nifp, 0);
928		txcur = txring->cur;
929		txavail = nm_ring_space(txring);
930		/* see what we got back */
931		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
932			rxring = NETMAP_RXRING(nifp, i);
933			while (!nm_ring_empty(rxring)) {
934				uint16_t *spkt, *dpkt;
935				uint32_t cur = rxring->cur;
936				struct netmap_slot *slot = &rxring->slot[cur];
937				char *src, *dst;
938				src = NETMAP_BUF(rxring, slot->buf_idx);
939				//D("got pkt %p of size %d", src, slot->len);
940				rxring->head = rxring->cur = nm_ring_next(rxring, cur);
941				rx++;
942				if (txavail == 0)
943					continue;
944				dst = NETMAP_BUF(txring,
945				    txring->slot[txcur].buf_idx);
946				/* copy... */
947				dpkt = (uint16_t *)dst;
948				spkt = (uint16_t *)src;
949				nm_pkt_copy(src, dst, slot->len);
950				dpkt[0] = spkt[3];
951				dpkt[1] = spkt[4];
952				dpkt[2] = spkt[5];
953				dpkt[3] = spkt[0];
954				dpkt[4] = spkt[1];
955				dpkt[5] = spkt[2];
956				txring->slot[txcur].len = slot->len;
957				/* XXX swap src dst mac */
958				txcur = nm_ring_next(txring, txcur);
959				txavail--;
960				sent++;
961			}
962		}
963		txring->head = txring->cur = txcur;
964		targ->count = sent;
965#ifdef BUSYWAIT
966		ioctl(pfd.fd, NIOCTXSYNC, NULL);
967#endif
968		//D("tx %d rx %d", sent, rx);
969	}
970	return NULL;
971}
972
/* Return 1 if time *a is later than or equal to time *b, 0 otherwise. */
static __inline int
timespec_ge(const struct timespec *a, const struct timespec *b)
{
	/* the seconds decide unless they are equal */
	if (a->tv_sec != b->tv_sec)
		return (a->tv_sec > b->tv_sec);
	return (a->tv_nsec >= b->tv_nsec);
}
985
/* Convert a struct timeval (microseconds) to a struct timespec (nanoseconds). */
static __inline struct timespec
timeval2spec(const struct timeval *a)
{
	struct timespec ts = { 0 };

	ts.tv_sec = a->tv_sec;
	ts.tv_nsec = a->tv_usec * 1000;
	return ts;
}
995
/*
 * Convert a struct timespec (nanoseconds) to a struct timeval
 * (microseconds); the fraction below one microsecond is dropped.
 */
static __inline struct timeval
timespec2val(const struct timespec *a)
{
	struct timeval tv = { 0 };

	tv.tv_sec = a->tv_sec;
	tv.tv_usec = a->tv_nsec / 1000;
	return tv;
}
1005
1006
/*
 * Return a + b. A nanosecond total of one second or more is carried
 * into the seconds (one carry, enough when both inputs are normalized).
 */
static __inline struct timespec
timespec_add(struct timespec a, struct timespec b)
{
	struct timespec sum = { 0 };

	sum.tv_sec = a.tv_sec + b.tv_sec;
	sum.tv_nsec = a.tv_nsec + b.tv_nsec;
	if (sum.tv_nsec >= 1000000000) {
		sum.tv_nsec -= 1000000000;
		sum.tv_sec += 1;
	}
	return sum;
}
1017
/*
 * Return a - b. A negative nanosecond difference borrows one second, so
 * with normalized inputs tv_nsec of the result is never negative and a
 * negative result shows up as tv_sec < 0.
 */
static __inline struct timespec
timespec_sub(struct timespec a, struct timespec b)
{
	struct timespec diff = { 0 };

	diff.tv_sec = a.tv_sec - b.tv_sec;
	diff.tv_nsec = a.tv_nsec - b.tv_nsec;
	if (diff.tv_nsec < 0) {
		diff.tv_nsec += 1000000000;
		diff.tv_sec -= 1;
	}
	return diff;
}
1028
1029
1030/*
1031 * wait until ts, either busy or sleeping if more than 1ms.
1032 * Return wakeup time.
1033 */
1034static struct timespec
1035wait_time(struct timespec ts)
1036{
1037	for (;;) {
1038		struct timespec w, cur;
1039		clock_gettime(CLOCK_REALTIME_PRECISE, &cur);
1040		w = timespec_sub(ts, cur);
1041		if (w.tv_sec < 0)
1042			return cur;
1043		else if (w.tv_sec > 0 || w.tv_nsec > 1000000)
1044			poll(NULL, 0, 1);
1045	}
1046}
1047
1048static void *
1049sender_body(void *data)
1050{
1051	struct targ *targ = (struct targ *) data;
1052	struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
1053	struct netmap_if *nifp;
1054	struct netmap_ring *txring;
1055	int i, n = targ->g->npackets / targ->g->nthreads;
1056	int64_t sent = 0;
1057	int options = targ->g->options | OPT_COPY;
1058	struct timespec nexttime = { 0, 0}; // XXX silence compiler
1059	int rate_limit = targ->g->tx_rate;
1060	struct pkt *pkt = &targ->pkt;
1061	void *frame;
1062	int size;
1063
1064	if (targ->frame == NULL) {
1065		frame = pkt;
1066		frame += sizeof(pkt->vh) - targ->g->virt_header;
1067		size = targ->g->pkt_size + targ->g->virt_header;
1068	} else {
1069		frame = targ->frame;
1070		size = targ->g->pkt_size;
1071	}
1072
1073	D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
1074	if (setaffinity(targ->thread, targ->affinity))
1075		goto quit;
1076
1077	/* main loop.*/
1078	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1079	if (rate_limit) {
1080		targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
1081		targ->tic.tv_nsec = 0;
1082		wait_time(targ->tic);
1083		nexttime = targ->tic;
1084	}
1085        if (targ->g->dev_type == DEV_TAP) {
1086	    D("writing to file desc %d", targ->g->main_fd);
1087
1088	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1089		if (write(targ->g->main_fd, frame, size) != -1)
1090			sent++;
1091		update_addresses(pkt, targ->g);
1092		if (i > 10000) {
1093			targ->count = sent;
1094			i = 0;
1095		}
1096	    }
1097#ifndef NO_PCAP
1098    } else if (targ->g->dev_type == DEV_PCAP) {
1099	    pcap_t *p = targ->g->p;
1100
1101	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1102		if (pcap_inject(p, frame, size) != -1)
1103			sent++;
1104		update_addresses(pkt, targ->g);
1105		if (i > 10000) {
1106			targ->count = sent;
1107			i = 0;
1108		}
1109	    }
1110#endif /* NO_PCAP */
1111    } else {
1112	int tosend = 0;
1113	int frags = targ->g->frags;
1114
1115        nifp = targ->nmd->nifp;
1116	while (!targ->cancel && (n == 0 || sent < n)) {
1117
1118		if (rate_limit && tosend <= 0) {
1119			tosend = targ->g->burst;
1120			nexttime = timespec_add(nexttime, targ->g->tx_period);
1121			wait_time(nexttime);
1122		}
1123
1124		/*
1125		 * wait for available room in the send queue(s)
1126		 */
1127		if (poll(&pfd, 1, 2000) <= 0) {
1128			if (targ->cancel)
1129				break;
1130			D("poll error/timeout on queue %d: %s", targ->me,
1131				strerror(errno));
1132			// goto quit;
1133		}
1134		if (pfd.revents & POLLERR) {
1135			D("poll error");
1136			goto quit;
1137		}
1138		/*
1139		 * scan our queues and send on those with room
1140		 */
1141		if (options & OPT_COPY && sent > 100000 && !(targ->g->options & OPT_COPY) ) {
1142			D("drop copy");
1143			options &= ~OPT_COPY;
1144		}
1145		for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1146			int m, limit = rate_limit ?  tosend : targ->g->burst;
1147			if (n > 0 && n - sent < limit)
1148				limit = n - sent;
1149			txring = NETMAP_TXRING(nifp, i);
1150			if (nm_ring_empty(txring))
1151				continue;
1152			if (frags > 1)
1153				limit = ((limit + frags - 1) / frags) * frags;
1154
1155			m = send_packets(txring, pkt, frame, size, targ->g,
1156					 limit, options, frags);
1157			ND("limit %d tail %d frags %d m %d",
1158				limit, txring->tail, frags, m);
1159			sent += m;
1160			targ->count = sent;
1161			if (rate_limit) {
1162				tosend -= m;
1163				if (tosend <= 0)
1164					break;
1165			}
1166		}
1167	}
1168	/* flush any remaining packets */
1169	D("flush tail %d head %d on thread %p",
1170		txring->tail, txring->head,
1171		pthread_self());
1172	ioctl(pfd.fd, NIOCTXSYNC, NULL);
1173
1174	/* final part: wait all the TX queues to be empty. */
1175	for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1176		txring = NETMAP_TXRING(nifp, i);
1177		while (nm_tx_pending(txring)) {
1178			RD(5, "pending tx tail %d head %d on ring %d",
1179				txring->tail, txring->head, i);
1180			ioctl(pfd.fd, NIOCTXSYNC, NULL);
1181			usleep(1); /* wait 1 tick */
1182		}
1183	}
1184    } /* end DEV_NETMAP */
1185
1186	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1187	targ->completed = 1;
1188	targ->count = sent;
1189
1190quit:
1191	/* reset the ``used`` flag. */
1192	targ->used = 0;
1193
1194	return (NULL);
1195}
1196
1197
1198#ifndef NO_PCAP
/*
 * Packet callback handed to pcap_dispatch(): bumps a counter once per
 * delivered packet.  The packet header and payload are ignored.
 *
 * 'user' is the opaque pointer given to pcap_dispatch(); it is accessed
 * here as an int counter.
 * NOTE(review): the caller passes &targ->count -- confirm that field is
 * really int-sized, otherwise only part of it is updated.
 */
static void
receive_pcap(u_char *user, const struct pcap_pkthdr * h,
	const u_char * bytes)
{
	int *pkt_counter = (int *)user;

	(void)bytes;	/* UNUSED */
	(void)h;	/* UNUSED */
	*pkt_counter += 1;
}
1208#endif /* !NO_PCAP */
1209
1210static int
1211receive_packets(struct netmap_ring *ring, u_int limit, int dump)
1212{
1213	u_int cur, rx, n;
1214
1215	cur = ring->cur;
1216	n = nm_ring_space(ring);
1217	if (n < limit)
1218		limit = n;
1219	for (rx = 0; rx < limit; rx++) {
1220		struct netmap_slot *slot = &ring->slot[cur];
1221		char *p = NETMAP_BUF(ring, slot->buf_idx);
1222
1223		if (dump)
1224			dump_payload(p, slot->len, ring, cur);
1225
1226		cur = nm_ring_next(ring, cur);
1227	}
1228	ring->head = ring->cur = cur;
1229
1230	return (rx);
1231}
1232
1233static void *
1234receiver_body(void *data)
1235{
1236	struct targ *targ = (struct targ *) data;
1237	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1238	struct netmap_if *nifp;
1239	struct netmap_ring *rxring;
1240	int i;
1241	uint64_t received = 0;
1242
1243	if (setaffinity(targ->thread, targ->affinity))
1244		goto quit;
1245
1246	D("reading from %s fd %d main_fd %d",
1247		targ->g->ifname, targ->fd, targ->g->main_fd);
1248	/* unbounded wait for the first packet. */
1249	for (;!targ->cancel;) {
1250		i = poll(&pfd, 1, 1000);
1251		if (i > 0 && !(pfd.revents & POLLERR))
1252			break;
1253		RD(1, "waiting for initial packets, poll returns %d %d",
1254			i, pfd.revents);
1255	}
1256	/* main loop, exit after 1s silence */
1257	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1258    if (targ->g->dev_type == DEV_TAP) {
1259	while (!targ->cancel) {
1260		char buf[MAX_BODYSIZE];
1261		/* XXX should we poll ? */
1262		if (read(targ->g->main_fd, buf, sizeof(buf)) > 0)
1263			targ->count++;
1264	}
1265#ifndef NO_PCAP
1266    } else if (targ->g->dev_type == DEV_PCAP) {
1267	while (!targ->cancel) {
1268		/* XXX should we poll ? */
1269		pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap,
1270			(u_char *)&targ->count);
1271	}
1272#endif /* !NO_PCAP */
1273    } else {
1274	int dump = targ->g->options & OPT_DUMP;
1275
1276        nifp = targ->nmd->nifp;
1277	while (!targ->cancel) {
1278		/* Once we started to receive packets, wait at most 1 seconds
1279		   before quitting. */
1280		if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
1281			clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1282			targ->toc.tv_sec -= 1; /* Subtract timeout time. */
1283			goto out;
1284		}
1285
1286		if (pfd.revents & POLLERR) {
1287			D("poll err");
1288			goto quit;
1289		}
1290
1291		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
1292			int m;
1293
1294			rxring = NETMAP_RXRING(nifp, i);
1295			if (nm_ring_empty(rxring))
1296				continue;
1297
1298			m = receive_packets(rxring, targ->g->burst, dump);
1299			received += m;
1300		}
1301		targ->count = received;
1302	}
1303    }
1304
1305	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1306
1307out:
1308	targ->completed = 1;
1309	targ->count = received;
1310
1311quit:
1312	/* reset the ``used`` flag. */
1313	targ->used = 0;
1314
1315	return (NULL);
1316}
1317
/*
 * Format 'val' into 'buf' as "<number> <suffix>", scaling by powers of
 * 1000 and picking the suffix from "", K, M, G, T.  Values beyond the
 * T range are left expressed in T.  Two decimals are always printed.
 *
 * The caller must supply a buffer large enough for the result; no
 * bound is enforced here.  Returns 'buf'.
 */
static const char *
norm(char *buf, double val)
{
	static const char *const suffix[] = { "", "K", "M", "G", "T" };
	const size_t last = sizeof(suffix) / sizeof(suffix[0]) - 1;
	size_t idx = 0;

	while (idx < last && val >= 1000) {
		val /= 1000;
		idx++;
	}
	sprintf(buf, "%.2f %s", val, suffix[idx]);
	return buf;
}
1332
/*
 * Print the transmit summary: packet count, packet size and elapsed
 * time, followed by the packet rate and the bandwidth.
 *
 * sent  - number of packets sent
 * size  - packet size in bytes; for the bandwidth figures it is
 *         raised to 60 when smaller
 * delta - elapsed time in seconds; a zero value is replaced by 1e-6
 *         to avoid dividing by zero
 */
static void
tx_output(uint64_t sent, int size, double delta)
{
	char pps_txt[40], bw_txt[80], raw_txt[80];
	double pkt_rate, net_bps, wire_bps;

	printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n",
	       (unsigned long long)sent, size, delta);

	delta = (delta == 0) ? 1e-6 : delta;
	size = (size < 60) ? 60 : size;	/* correct for min packet size */

	pkt_rate = sent / delta;
	net_bps = (8.0 * size * sent) / delta;
	/* raw packets have 4 bytes crc + 20 bytes framing */
	wire_bps = (8.0 * (size + 24) * sent) / delta;

	norm(pps_txt, pkt_rate);
	norm(bw_txt, net_bps);
	norm(raw_txt, wire_bps);
	printf("Speed: %spps Bandwidth: %sbps (raw %sbps)\n",
		pps_txt, bw_txt, raw_txt);
}
1353
1354
/*
 * Print the receive summary: packet count and elapsed time, followed
 * by the packet rate.
 *
 * received - number of packets received
 * delta    - elapsed time in seconds; a zero value is replaced by 1e-6
 *            before computing the rate
 */
static void
rx_output(uint64_t received, double delta)
{
	char rate_txt[40];
	double elapsed = delta;

	printf("Received %llu packets, in %.2f seconds.\n",
		(unsigned long long) received, delta);

	if (elapsed == 0)
		elapsed = 1e-6;
	norm(rate_txt, received / elapsed);
	printf("Speed: %spps\n", rate_txt);
}
1369
/*
 * Print the command line help on stderr and terminate the program.
 *
 * The list is kept in sync with the getopt() string in main(): the
 * entries for -t, -r and the flag-style -P were dropped because the
 * parser does not accept them (-P takes a file argument).  The text
 * now also ends with a newline.
 *
 * NOTE(review): the exit status is 0 even though this is mostly called
 * on errors; left unchanged since callers/scripts may depend on it.
 */
static void
usage(void)
{
	const char *cmd = "pkt-gen";
	fprintf(stderr,
		"Usage:\n"
		"%s arguments\n"
		"\t-i interface		interface name\n"
		"\t-f function		tx rx ping pong\n"
		"\t-n count		number of iterations (can be 0)\n"
		"\t-l pkt_size		in bytes excluding CRC\n"
		"\t-d dst_ip[:port[-dst_ip:port]]   single or range\n"
		"\t-s src_ip[:port[-src_ip:port]]   single or range\n"
		"\t-D dst-mac\n"
		"\t-S src-mac\n"
		"\t-a cpu_id		use setaffinity\n"
		"\t-b burst size		testing, mostly\n"
		"\t-c cores		cores to use\n"
		"\t-p threads		processes/threads to use\n"
		"\t-T report_ms		milliseconds between reports\n"
		"\t-w wait_for_link_time	in seconds\n"
		"\t-R rate		in packets per second\n"
		"\t-X			dump payload\n"
		"\t-H len		add empty virtio-net-header with size 'len'\n"
		"\t-P file		load packet from pcap file\n",
		cmd);

	exit(0);
}
1403
1404static void
1405start_threads(struct glob_arg *g)
1406{
1407	int i;
1408
1409	targs = calloc(g->nthreads, sizeof(*targs));
1410	/*
1411	 * Now create the desired number of threads, each one
1412	 * using a single descriptor.
1413 	 */
1414	for (i = 0; i < g->nthreads; i++) {
1415		struct targ *t = &targs[i];
1416
1417		bzero(t, sizeof(*t));
1418		t->fd = -1; /* default, with pcap */
1419		t->g = g;
1420
1421	    if (g->dev_type == DEV_NETMAP) {
1422		struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */
1423		uint64_t nmd_flags = 0;
1424		nmd.self = &nmd;
1425
1426		if (g->nthreads > 1) {
1427			if (nmd.req.nr_flags != NR_REG_ALL_NIC) {
1428				D("invalid nthreads mode %d", nmd.req.nr_flags);
1429				continue;
1430			}
1431			nmd.req.nr_flags = NR_REG_ONE_NIC;
1432			nmd.req.nr_ringid = i;
1433		}
1434		/* Only touch one of the rings (rx is already ok) */
1435		if (g->td_body == receiver_body)
1436			nmd_flags |= NETMAP_NO_TX_POLL;
1437
1438		/* register interface. Override ifname and ringid etc. */
1439		if (g->options & OPT_MONITOR_TX)
1440			nmd.req.nr_flags |= NR_MONITOR_TX;
1441		if (g->options & OPT_MONITOR_RX)
1442			nmd.req.nr_flags |= NR_MONITOR_RX;
1443
1444		t->nmd = nm_open(t->g->ifname, NULL, nmd_flags |
1445			NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd);
1446		if (t->nmd == NULL) {
1447			D("Unable to open %s: %s",
1448				t->g->ifname, strerror(errno));
1449			continue;
1450		}
1451		t->fd = t->nmd->fd;
1452		set_vnet_hdr_len(t);
1453
1454	    } else {
1455		targs[i].fd = g->main_fd;
1456	    }
1457		t->used = 1;
1458		t->me = i;
1459		if (g->affinity >= 0) {
1460			if (g->affinity < g->cpus)
1461				t->affinity = g->affinity;
1462			else
1463				t->affinity = i % g->cpus;
1464		} else {
1465			t->affinity = -1;
1466		}
1467		/* default, init packets */
1468		initialize_packet(t);
1469
1470		if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) {
1471			D("Unable to create thread %d: %s", i, strerror(errno));
1472			t->used = 0;
1473		}
1474	}
1475}
1476
1477static void
1478main_thread(struct glob_arg *g)
1479{
1480	int i;
1481
1482	uint64_t prev = 0;
1483	uint64_t count = 0;
1484	double delta_t;
1485	struct timeval tic, toc;
1486
1487	gettimeofday(&toc, NULL);
1488	for (;;) {
1489		struct timeval now, delta;
1490		uint64_t pps, usec, my_count, npkts;
1491		int done = 0;
1492
1493		delta.tv_sec = g->report_interval/1000;
1494		delta.tv_usec = (g->report_interval%1000)*1000;
1495		select(0, NULL, NULL, NULL, &delta);
1496		gettimeofday(&now, NULL);
1497		timersub(&now, &toc, &toc);
1498		my_count = 0;
1499		for (i = 0; i < g->nthreads; i++) {
1500			my_count += targs[i].count;
1501			if (targs[i].used == 0)
1502				done++;
1503		}
1504		usec = toc.tv_sec* 1000000 + toc.tv_usec;
1505		if (usec < 10000)
1506			continue;
1507		npkts = my_count - prev;
1508		pps = (npkts*1000000 + usec/2) / usec;
1509		D("%llu pps (%llu pkts in %llu usec)",
1510			(unsigned long long)pps,
1511			(unsigned long long)npkts,
1512			(unsigned long long)usec);
1513		prev = my_count;
1514		toc = now;
1515		if (done == g->nthreads)
1516			break;
1517	}
1518
1519	timerclear(&tic);
1520	timerclear(&toc);
1521	for (i = 0; i < g->nthreads; i++) {
1522		struct timespec t_tic, t_toc;
1523		/*
1524		 * Join active threads, unregister interfaces and close
1525		 * file descriptors.
1526		 */
1527		if (targs[i].used)
1528			pthread_join(targs[i].thread, NULL);
1529		close(targs[i].fd);
1530
1531		if (targs[i].completed == 0)
1532			D("ouch, thread %d exited with error", i);
1533
1534		/*
1535		 * Collect threads output and extract information about
1536		 * how long it took to send all the packets.
1537		 */
1538		count += targs[i].count;
1539		t_tic = timeval2spec(&tic);
1540		t_toc = timeval2spec(&toc);
1541		if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic))
1542			tic = timespec2val(&targs[i].tic);
1543		if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc))
1544			toc = timespec2val(&targs[i].toc);
1545	}
1546
1547	/* print output. */
1548	timersub(&toc, &tic, &toc);
1549	delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
1550	if (g->td_body == sender_body)
1551		tx_output(count, g->pkt_size, delta_t);
1552	else
1553		rx_output(count, delta_t);
1554
1555	if (g->dev_type == DEV_NETMAP) {
1556		munmap(g->nmd->mem, g->nmd->req.nr_memsize);
1557		close(g->main_fd);
1558	}
1559}
1560
1561
1562struct sf {
1563	char *key;
1564	void *f;
1565};
1566
1567static struct sf func[] = {
1568	{ "tx",	sender_body },
1569	{ "rx",	receiver_body },
1570	{ "ping",	pinger_body },
1571	{ "pong",	ponger_body },
1572	{ NULL, NULL }
1573};
1574
1575static int
1576tap_alloc(char *dev)
1577{
1578	struct ifreq ifr;
1579	int fd, err;
1580	char *clonedev = TAP_CLONEDEV;
1581
1582	(void)err;
1583	(void)dev;
1584	/* Arguments taken by the function:
1585	 *
1586	 * char *dev: the name of an interface (or '\0'). MUST have enough
1587	 *   space to hold the interface name if '\0' is passed
1588	 * int flags: interface flags (eg, IFF_TUN etc.)
1589	 */
1590
1591#ifdef __FreeBSD__
1592	if (dev[3]) { /* tapSomething */
1593		static char buf[128];
1594		snprintf(buf, sizeof(buf), "/dev/%s", dev);
1595		clonedev = buf;
1596	}
1597#endif
1598	/* open the device */
1599	if( (fd = open(clonedev, O_RDWR)) < 0 ) {
1600		return fd;
1601	}
1602	D("%s open successful", clonedev);
1603
1604	/* preparation of the struct ifr, of type "struct ifreq" */
1605	memset(&ifr, 0, sizeof(ifr));
1606
1607#ifdef linux
1608	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
1609
1610	if (*dev) {
1611		/* if a device name was specified, put it in the structure; otherwise,
1612		* the kernel will try to allocate the "next" device of the
1613		* specified type */
1614		strncpy(ifr.ifr_name, dev, IFNAMSIZ);
1615	}
1616
1617	/* try to create the device */
1618	if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) {
1619		D("failed to to a TUNSETIFF: %s", strerror(errno));
1620		close(fd);
1621		return err;
1622	}
1623
1624	/* if the operation was successful, write back the name of the
1625	* interface to the variable "dev", so the caller can know
1626	* it. Note that the caller MUST reserve space in *dev (see calling
1627	* code below) */
1628	strcpy(dev, ifr.ifr_name);
1629	D("new name is %s", dev);
1630#endif /* linux */
1631
1632        /* this is the special file descriptor that the caller will use to talk
1633         * with the virtual interface */
1634        return fd;
1635}
1636
1637int
1638main(int arc, char **argv)
1639{
1640	int i;
1641
1642	struct glob_arg g;
1643
1644	int ch;
1645	int wait_link = 2;
1646	int devqueues = 1;	/* how many device queues */
1647
1648	bzero(&g, sizeof(g));
1649
1650	g.main_fd = -1;
1651	g.td_body = receiver_body;
1652	g.report_interval = 1000;	/* report interval */
1653	g.affinity = -1;
1654	/* ip addresses can also be a range x.x.x.x-x.x.x.y */
1655	g.src_ip.name = "10.0.0.1";
1656	g.dst_ip.name = "10.1.0.1";
1657	g.dst_mac.name = "ff:ff:ff:ff:ff:ff";
1658	g.src_mac.name = NULL;
1659	g.pkt_size = 60;
1660	g.burst = 512;		// default
1661	g.nthreads = 1;
1662	g.cpus = 1;
1663	g.forever = 1;
1664	g.tx_rate = 0;
1665	g.frags = 1;
1666	g.nmr_config = "";
1667	g.virt_header = 0;
1668
1669	while ( (ch = getopt(arc, argv,
1670			"a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:m:P:")) != -1) {
1671		struct sf *fn;
1672
1673		switch(ch) {
1674		default:
1675			D("bad option %c %s", ch, optarg);
1676			usage();
1677			break;
1678
1679		case 'n':
1680			g.npackets = atoi(optarg);
1681			break;
1682
1683		case 'F':
1684			i = atoi(optarg);
1685			if (i < 1 || i > 63) {
1686				D("invalid frags %d [1..63], ignore", i);
1687				break;
1688			}
1689			g.frags = i;
1690			break;
1691
1692		case 'f':
1693			for (fn = func; fn->key; fn++) {
1694				if (!strcmp(fn->key, optarg))
1695					break;
1696			}
1697			if (fn->key)
1698				g.td_body = fn->f;
1699			else
1700				D("unrecognised function %s", optarg);
1701			break;
1702
1703		case 'o':	/* data generation options */
1704			g.options = atoi(optarg);
1705			break;
1706
1707		case 'a':       /* force affinity */
1708			g.affinity = atoi(optarg);
1709			break;
1710
1711		case 'i':	/* interface */
1712			/* a prefix of tap: netmap: or pcap: forces the mode.
1713			 * otherwise we guess
1714			 */
1715			D("interface is %s", optarg);
1716			if (strlen(optarg) > MAX_IFNAMELEN - 8) {
1717				D("ifname too long %s", optarg);
1718				break;
1719			}
1720			strcpy(g.ifname, optarg);
1721			if (!strcmp(optarg, "null")) {
1722				g.dev_type = DEV_NETMAP;
1723				g.dummy_send = 1;
1724			} else if (!strncmp(optarg, "tap:", 4)) {
1725				g.dev_type = DEV_TAP;
1726				strcpy(g.ifname, optarg + 4);
1727			} else if (!strncmp(optarg, "pcap:", 5)) {
1728				g.dev_type = DEV_PCAP;
1729				strcpy(g.ifname, optarg + 5);
1730			} else if (!strncmp(optarg, "netmap:", 7) ||
1731				   !strncmp(optarg, "vale", 4)) {
1732				g.dev_type = DEV_NETMAP;
1733			} else if (!strncmp(optarg, "tap", 3)) {
1734				g.dev_type = DEV_TAP;
1735			} else { /* prepend netmap: */
1736				g.dev_type = DEV_NETMAP;
1737				sprintf(g.ifname, "netmap:%s", optarg);
1738			}
1739			break;
1740
1741		case 'I':
1742			g.options |= OPT_INDIRECT;	/* XXX use indirect buffer */
1743			break;
1744
1745		case 'l':	/* pkt_size */
1746			g.pkt_size = atoi(optarg);
1747			break;
1748
1749		case 'd':
1750			g.dst_ip.name = optarg;
1751			break;
1752
1753		case 's':
1754			g.src_ip.name = optarg;
1755			break;
1756
1757		case 'T':	/* report interval */
1758			g.report_interval = atoi(optarg);
1759			break;
1760
1761		case 'w':
1762			wait_link = atoi(optarg);
1763			break;
1764
1765		case 'W': /* XXX changed default */
1766			g.forever = 0; /* do not exit rx even with no traffic */
1767			break;
1768
1769		case 'b':	/* burst */
1770			g.burst = atoi(optarg);
1771			break;
1772		case 'c':
1773			g.cpus = atoi(optarg);
1774			break;
1775		case 'p':
1776			g.nthreads = atoi(optarg);
1777			break;
1778
1779		case 'D': /* destination mac */
1780			g.dst_mac.name = optarg;
1781			break;
1782
1783		case 'S': /* source mac */
1784			g.src_mac.name = optarg;
1785			break;
1786		case 'v':
1787			verbose++;
1788			break;
1789		case 'R':
1790			g.tx_rate = atoi(optarg);
1791			break;
1792		case 'X':
1793			g.options |= OPT_DUMP;
1794			break;
1795		case 'C':
1796			g.nmr_config = strdup(optarg);
1797			break;
1798		case 'H':
1799			g.virt_header = atoi(optarg);
1800			break;
1801		case 'e': /* extra bufs */
1802			g.extra_bufs = atoi(optarg);
1803			break;
1804		case 'm':
1805			if (strcmp(optarg, "tx") == 0) {
1806				g.options |= OPT_MONITOR_TX;
1807			} else if (strcmp(optarg, "rx") == 0) {
1808				g.options |= OPT_MONITOR_RX;
1809			} else {
1810				D("unrecognized monitor mode %s", optarg);
1811			}
1812			break;
1813		case 'P':
1814			g.packet_file = strdup(optarg);
1815			break;
1816		}
1817
1818	}
1819
1820	if (g.ifname == NULL) {
1821		D("missing ifname");
1822		usage();
1823	}
1824
1825	i = system_ncpus();
1826	if (g.cpus < 0 || g.cpus > i) {
1827		D("%d cpus is too high, have only %d cpus", g.cpus, i);
1828		usage();
1829	}
1830	if (g.cpus == 0)
1831		g.cpus = i;
1832
1833	if (g.pkt_size < 16 || g.pkt_size > MAX_PKTSIZE) {
1834		D("bad pktsize %d [16..%d]\n", g.pkt_size, MAX_PKTSIZE);
1835		usage();
1836	}
1837
1838	if (g.src_mac.name == NULL) {
1839		static char mybuf[20] = "00:00:00:00:00:00";
1840		/* retrieve source mac address. */
1841		if (source_hwaddr(g.ifname, mybuf) == -1) {
1842			D("Unable to retrieve source mac");
1843			// continue, fail later
1844		}
1845		g.src_mac.name = mybuf;
1846	}
1847	/* extract address ranges */
1848	extract_ip_range(&g.src_ip);
1849	extract_ip_range(&g.dst_ip);
1850	extract_mac_range(&g.src_mac);
1851	extract_mac_range(&g.dst_mac);
1852
1853	if (g.src_ip.start != g.src_ip.end ||
1854	    g.src_ip.port0 != g.src_ip.port1 ||
1855	    g.dst_ip.start != g.dst_ip.end ||
1856	    g.dst_ip.port0 != g.dst_ip.port1)
1857		g.options |= OPT_COPY;
1858
1859	if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1
1860			&& g.virt_header != VIRT_HDR_2) {
1861		D("bad virtio-net-header length");
1862		usage();
1863	}
1864
1865    if (g.dev_type == DEV_TAP) {
1866	D("want to use tap %s", g.ifname);
1867	g.main_fd = tap_alloc(g.ifname);
1868	if (g.main_fd < 0) {
1869		D("cannot open tap %s", g.ifname);
1870		usage();
1871	}
1872#ifndef NO_PCAP
1873    } else if (g.dev_type == DEV_PCAP) {
1874	char pcap_errbuf[PCAP_ERRBUF_SIZE];
1875
1876	pcap_errbuf[0] = '\0'; // init the buffer
1877	g.p = pcap_open_live(g.ifname, 256 /* XXX */, 1, 100, pcap_errbuf);
1878	if (g.p == NULL) {
1879		D("cannot open pcap on %s", g.ifname);
1880		usage();
1881	}
1882	g.main_fd = pcap_fileno(g.p);
1883	D("using pcap on %s fileno %d", g.ifname, g.main_fd);
1884#endif /* !NO_PCAP */
1885    } else if (g.dummy_send) { /* but DEV_NETMAP */
1886	D("using a dummy send routine");
1887    } else {
1888	struct nmreq base_nmd;
1889
1890	bzero(&base_nmd, sizeof(base_nmd));
1891
1892	parse_nmr_config(g.nmr_config, &base_nmd);
1893	if (g.extra_bufs) {
1894		base_nmd.nr_arg3 = g.extra_bufs;
1895	}
1896
1897	/*
1898	 * Open the netmap device using nm_open().
1899	 *
1900	 * protocol stack and may cause a reset of the card,
1901	 * which in turn may take some time for the PHY to
1902	 * reconfigure. We do the open here to have time to reset.
1903	 */
1904	g.nmd = nm_open(g.ifname, &base_nmd, 0, NULL);
1905	if (g.nmd == NULL) {
1906		D("Unable to open %s: %s", g.ifname, strerror(errno));
1907		goto out;
1908	}
1909	g.main_fd = g.nmd->fd;
1910	D("mapped %dKB at %p", g.nmd->req.nr_memsize>>10, g.nmd->mem);
1911
1912	/* get num of queues in tx or rx */
1913	if (g.td_body == sender_body)
1914		devqueues = g.nmd->req.nr_tx_rings;
1915	else
1916		devqueues = g.nmd->req.nr_rx_rings;
1917
1918	/* validate provided nthreads. */
1919	if (g.nthreads < 1 || g.nthreads > devqueues) {
1920		D("bad nthreads %d, have %d queues", g.nthreads, devqueues);
1921		// continue, fail later
1922	}
1923
1924	if (verbose) {
1925		struct netmap_if *nifp = g.nmd->nifp;
1926		struct nmreq *req = &g.nmd->req;
1927
1928		D("nifp at offset %d, %d tx %d rx region %d",
1929		    req->nr_offset, req->nr_tx_rings, req->nr_rx_rings,
1930		    req->nr_arg2);
1931		for (i = 0; i <= req->nr_tx_rings; i++) {
1932			struct netmap_ring *ring = NETMAP_TXRING(nifp, i);
1933			D("   TX%d at 0x%lx slots %d", i,
1934			    (char *)ring - (char *)nifp, ring->num_slots);
1935		}
1936		for (i = 0; i <= req->nr_rx_rings; i++) {
1937			struct netmap_ring *ring = NETMAP_RXRING(nifp, i);
1938			D("   RX%d at 0x%lx slots %d", i,
1939			    (char *)ring - (char *)nifp, ring->num_slots);
1940		}
1941	}
1942
1943	/* Print some debug information. */
1944	fprintf(stdout,
1945		"%s %s: %d queues, %d threads and %d cpus.\n",
1946		(g.td_body == sender_body) ? "Sending on" : "Receiving from",
1947		g.ifname,
1948		devqueues,
1949		g.nthreads,
1950		g.cpus);
1951	if (g.td_body == sender_body) {
1952		fprintf(stdout, "%s -> %s (%s -> %s)\n",
1953			g.src_ip.name, g.dst_ip.name,
1954			g.src_mac.name, g.dst_mac.name);
1955	}
1956
1957out:
1958	/* Exit if something went wrong. */
1959	if (g.main_fd < 0) {
1960		D("aborting");
1961		usage();
1962	}
1963    }
1964
1965
1966	if (g.options) {
1967		D("--- SPECIAL OPTIONS:%s%s%s%s%s\n",
1968			g.options & OPT_PREFETCH ? " prefetch" : "",
1969			g.options & OPT_ACCESS ? " access" : "",
1970			g.options & OPT_MEMCPY ? " memcpy" : "",
1971			g.options & OPT_INDIRECT ? " indirect" : "",
1972			g.options & OPT_COPY ? " copy" : "");
1973	}
1974
1975	g.tx_period.tv_sec = g.tx_period.tv_nsec = 0;
1976	if (g.tx_rate > 0) {
1977		/* try to have at least something every second,
1978		 * reducing the burst size to some 0.01s worth of data
1979		 * (but no less than one full set of fragments)
1980	 	 */
1981		uint64_t x;
1982		int lim = (g.tx_rate)/300;
1983		if (g.burst > lim)
1984			g.burst = lim;
1985		if (g.burst < g.frags)
1986			g.burst = g.frags;
1987		x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate;
1988		g.tx_period.tv_nsec = x;
1989		g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000;
1990		g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000;
1991	}
1992	if (g.td_body == sender_body)
1993	    D("Sending %d packets every  %ld.%09ld s",
1994			g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec);
1995	/* Wait for PHY reset. */
1996	D("Wait %d secs for phy reset", wait_link);
1997	sleep(wait_link);
1998	D("Ready...");
1999
2000	/* Install ^C handler. */
2001	global_nthreads = g.nthreads;
2002	signal(SIGINT, sigint_h);
2003
2004	start_threads(&g);
2005	main_thread(&g);
2006	return 0;
2007}
2008
2009/* end of file */
2010