pkt-gen.c revision 270252
1/*
2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *   1. Redistributions of source code must retain the above copyright
9 *      notice, this list of conditions and the following disclaimer.
10 *   2. Redistributions in binary form must reproduce the above copyright
11 *      notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27/*
28 * $FreeBSD: stable/10/tools/tools/netmap/pkt-gen.c 270252 2014-08-20 23:34:36Z luigi $
29 * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $
30 *
31 * Example program to show how to build a multithreaded packet
32 * source/sink using the netmap device.
33 *
34 * In this example we create a programmable number of threads
35 * to take care of all the queues of the interface used to
36 * send or receive traffic.
37 *
38 */
39
40// #define TRASH_VHOST_HDR
41
42#define _GNU_SOURCE	/* for CPU_SET() */
43#include <stdio.h>
44#define NETMAP_WITH_LIBS
45#include <net/netmap_user.h>
46
47
48#include <ctype.h>	// isprint()
49#include <unistd.h>	// sysconf()
50#include <sys/poll.h>
51#include <arpa/inet.h>	/* ntohs */
52#include <sys/sysctl.h>	/* sysctl */
53#include <ifaddrs.h>	/* getifaddrs */
54#include <net/ethernet.h>
55#include <netinet/in.h>
56#include <netinet/ip.h>
57#include <netinet/udp.h>
58
59#include <pthread.h>
60
61#ifndef NO_PCAP
62#include <pcap/pcap.h>
63#endif
64
65#ifdef linux
66
67#define cpuset_t        cpu_set_t
68
69#define ifr_flagshigh  ifr_flags        /* only the low 16 bits here */
70#define IFF_PPROMISC   IFF_PROMISC      /* IFF_PPROMISC does not exist */
71#include <linux/ethtool.h>
72#include <linux/sockios.h>
73
74#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
75#include <netinet/ether.h>      /* ether_aton */
76#include <linux/if_packet.h>    /* sockaddr_ll */
77#endif  /* linux */
78
79#ifdef __FreeBSD__
80#include <sys/endian.h> /* le64toh */
81#include <machine/param.h>
82
83#include <pthread_np.h> /* pthread w/ affinity */
84#include <sys/cpuset.h> /* cpu_set */
85#include <net/if_dl.h>  /* LLADDR */
86#endif  /* __FreeBSD__ */
87
88#ifdef __APPLE__
89
90#define cpuset_t        uint64_t        // XXX
91static inline void CPU_ZERO(cpuset_t *p)
92{
93        *p = 0;
94}
95
96static inline void CPU_SET(uint32_t i, cpuset_t *p)
97{
98        *p |= (uint64_t)1 << (i & 0x3f);
99}
100
101#define pthread_setaffinity_np(a, b, c) ((void)a, 0)
102
103#define ifr_flagshigh  ifr_flags        // XXX
104#define IFF_PPROMISC   IFF_PROMISC
105#include <net/if_dl.h>  /* LLADDR */
106#define clock_gettime(a,b)      \
107        do {struct timespec t0 = {0,0}; *(b) = t0; } while (0)
108#endif  /* __APPLE__ */
109
110const char *default_payload="netmap pkt-gen DIRECT payload\n"
111	"http://info.iet.unipi.it/~luigi/netmap/ ";
112
113const char *indirect_payload="netmap pkt-gen indirect payload\n"
114	"http://info.iet.unipi.it/~luigi/netmap/ ";
115
116int verbose = 0;
117
118#define SKIP_PAYLOAD 1 /* do not check payload. XXX unused */
119
120
121#define VIRT_HDR_1	10	/* length of a base vnet-hdr */
122#define VIRT_HDR_2	12	/* length of the extended vnet-hdr */
123#define VIRT_HDR_MAX	VIRT_HDR_2
124struct virt_header {
125	uint8_t fields[VIRT_HDR_MAX];
126};
127
128#define MAX_BODYSIZE	16384
129
130struct pkt {
131	struct virt_header vh;
132	struct ether_header eh;
133	struct ip ip;
134	struct udphdr udp;
135	uint8_t body[MAX_BODYSIZE];	// XXX hardwired
136} __attribute__((__packed__));
137
138struct ip_range {
139	char *name;
140	uint32_t start, end; /* same as struct in_addr */
141	uint16_t port0, port1;
142};
143
144struct mac_range {
145	char *name;
146	struct ether_addr start, end;
147};
148
149/* ifname can be netmap:foo-xxxx */
150#define MAX_IFNAMELEN	64	/* our buffer for ifname */
151//#define MAX_PKTSIZE	1536
152#define MAX_PKTSIZE	MAX_BODYSIZE	/* XXX: + IP_HDR + ETH_HDR */
153
154/* compact timestamp to fit into 60 byte packet. (enough to obtain RTT) */
155struct tstamp {
156	uint32_t sec;
157	uint32_t nsec;
158};
159
160/*
161 * global arguments for all threads
162 */
163
164struct glob_arg {
165	struct ip_range src_ip;
166	struct ip_range dst_ip;
167	struct mac_range dst_mac;
168	struct mac_range src_mac;
169	int pkt_size;
170	int burst;
171	int forever;
172	int npackets;	/* total packets to send */
173	int frags;	/* fragments per packet */
174	int nthreads;
175	int cpus;
176	int options;	/* testing */
177#define OPT_PREFETCH	1
178#define OPT_ACCESS	2
179#define OPT_COPY	4
180#define OPT_MEMCPY	8
181#define OPT_TS		16	/* add a timestamp */
182#define OPT_INDIRECT	32	/* use indirect buffers, tx only */
183#define OPT_DUMP	64	/* dump rx/tx traffic */
184#define OPT_MONITOR_TX  128
185#define OPT_MONITOR_RX  256
186	int dev_type;
187#ifndef NO_PCAP
188	pcap_t *p;
189#endif
190
191	int tx_rate;
192	struct timespec tx_period;
193
194	int affinity;
195	int main_fd;
196	struct nm_desc *nmd;
197	int report_interval;		/* milliseconds between prints */
198	void *(*td_body)(void *);
199	void *mmap_addr;
200	char ifname[MAX_IFNAMELEN];
201	char *nmr_config;
202	int dummy_send;
203	int virt_header;	/* send also the virt_header */
204	int extra_bufs;		/* goes in nr_arg3 */
205};
206enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP };
207
208
209/*
210 * Arguments for a new thread. The same structure is used by
211 * the source and the sink
212 */
213struct targ {
214	struct glob_arg *g;
215	int used;
216	int completed;
217	int cancel;
218	int fd;
219	struct nm_desc *nmd;
220	volatile uint64_t count;
221	struct timespec tic, toc;
222	int me;
223	pthread_t thread;
224	int affinity;
225
226	struct pkt pkt;
227};
228
229
230/*
231 * extract the extremes from a range of ipv4 addresses.
232 * addr_lo[-addr_hi][:port_lo[-port_hi]]
233 */
234static void
235extract_ip_range(struct ip_range *r)
236{
237	char *ap, *pp;
238	struct in_addr a;
239
240	if (verbose)
241		D("extract IP range from %s", r->name);
242	r->port0 = r->port1 = 0;
243	r->start = r->end = 0;
244
245	/* the first - splits start/end of range */
246	ap = index(r->name, '-');	/* do we have a range ? */
247	if (ap) {
248		*ap++ = '\0';
249	}
250	/* grab the initial values (mandatory) */
251	pp = index(r->name, ':');
252	if (pp) {
253		*pp++ = '\0';
254		r->port0 = r->port1 = strtol(pp, NULL, 0);
255	};
256	inet_aton(r->name, &a);
257	r->start = r->end = ntohl(a.s_addr);
258	if (ap) {
259		pp = index(ap, ':');
260		if (pp) {
261			*pp++ = '\0';
262			if (*pp)
263				r->port1 = strtol(pp, NULL, 0);
264		}
265		if (*ap) {
266			inet_aton(ap, &a);
267			r->end = ntohl(a.s_addr);
268		}
269	}
270	if (r->port0 > r->port1) {
271		uint16_t tmp = r->port0;
272		r->port0 = r->port1;
273		r->port1 = tmp;
274	}
275	if (r->start > r->end) {
276		uint32_t tmp = r->start;
277		r->start = r->end;
278		r->end = tmp;
279	}
280	{
281		struct in_addr a;
282		char buf1[16]; // one ip address
283
284		a.s_addr = htonl(r->end);
285		strncpy(buf1, inet_ntoa(a), sizeof(buf1));
286		a.s_addr = htonl(r->start);
287		if (1)
288		    D("range is %s:%d to %s:%d",
289			inet_ntoa(a), r->port0, buf1, r->port1);
290	}
291}
292
293static void
294extract_mac_range(struct mac_range *r)
295{
296	if (verbose)
297	    D("extract MAC range from %s", r->name);
298	bcopy(ether_aton(r->name), &r->start, 6);
299	bcopy(ether_aton(r->name), &r->end, 6);
300#if 0
301	bcopy(targ->src_mac, eh->ether_shost, 6);
302	p = index(targ->g->src_mac, '-');
303	if (p)
304		targ->src_mac_range = atoi(p+1);
305
306	bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6);
307	bcopy(targ->dst_mac, eh->ether_dhost, 6);
308	p = index(targ->g->dst_mac, '-');
309	if (p)
310		targ->dst_mac_range = atoi(p+1);
311#endif
312	if (verbose)
313		D("%s starts at %s", r->name, ether_ntoa(&r->start));
314}
315
316static struct targ *targs;
317static int global_nthreads;
318
319/* control-C handler */
320static void
321sigint_h(int sig)
322{
323	int i;
324
325	(void)sig;	/* UNUSED */
326	D("received control-C on thread %p", pthread_self());
327	for (i = 0; i < global_nthreads; i++) {
328		targs[i].cancel = 1;
329	}
330	signal(SIGINT, SIG_DFL);
331}
332
333/* sysctl wrapper to return the number of active CPUs */
334static int
335system_ncpus(void)
336{
337	int ncpus;
338#if defined (__FreeBSD__)
339	int mib[2] = { CTL_HW, HW_NCPU };
340	size_t len = sizeof(ncpus);
341	sysctl(mib, 2, &ncpus, &len, NULL, 0);
342#elif defined(linux)
343	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
344#else /* others */
345	ncpus = 1;
346#endif /* others */
347	return (ncpus);
348}
349
350#ifdef __linux__
351#define sockaddr_dl    sockaddr_ll
352#define sdl_family     sll_family
353#define AF_LINK        AF_PACKET
354#define LLADDR(s)      s->sll_addr
355#include <linux/if_tun.h>
356#define TAP_CLONEDEV	"/dev/net/tun"
357#endif /* __linux__ */
358
359#ifdef __FreeBSD__
360#include <net/if_tun.h>
361#define TAP_CLONEDEV	"/dev/tap"
362#endif /* __FreeBSD__ */
363
364#ifdef __APPLE__
365// #warning TAP not supported on apple ?
366#include <net/if_utun.h>
367#define TAP_CLONEDEV	"/dev/tap"
368#endif /* __APPLE__ */
369
370
371/*
372 * parse the vale configuration in conf and put it in nmr.
373 * Return the flag set if necessary.
374 * The configuration may consist of 0 to 4 numbers separated
375 * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings.
376 * Missing numbers or zeroes stand for default values.
377 * As an additional convenience, if exactly one number
378 * is specified, then this is assigned to both #tx-slots and #rx-slots.
379 * If there is no 4th number, then the 3rd is assigned to both #tx-rings
380 * and #rx-rings.
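 * For example, "1024,512,2" requests 1024 tx slots, 512 rx slots, and 2 rings in each direction.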
381 */
382int
383parse_nmr_config(const char* conf, struct nmreq *nmr)
384{
385	char *w, *tok;
386	int i, v;
387
388	nmr->nr_tx_rings = nmr->nr_rx_rings = 0;
389	nmr->nr_tx_slots = nmr->nr_rx_slots = 0;
390	if (conf == NULL || ! *conf)
391		return 0;
392	w = strdup(conf);
393	for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) {
394		v = atoi(tok);
395		switch (i) {
396		case 0:
397			nmr->nr_tx_slots = nmr->nr_rx_slots = v;
398			break;
399		case 1:
400			nmr->nr_rx_slots = v;
401			break;
402		case 2:
403			nmr->nr_tx_rings = nmr->nr_rx_rings = v;
404			break;
405		case 3:
406			nmr->nr_rx_rings = v;
407			break;
408		default:
409			D("ignored config: %s", tok);
410			break;
411		}
412	}
413	D("txr %d txd %d rxr %d rxd %d",
414			nmr->nr_tx_rings, nmr->nr_tx_slots,
415			nmr->nr_rx_rings, nmr->nr_rx_slots);
416	free(w);
417	return (nmr->nr_tx_rings || nmr->nr_tx_slots ||
418                        nmr->nr_rx_rings || nmr->nr_rx_slots) ?
419		NM_OPEN_RING_CFG : 0;
420}
421
422
423/*
424 * locate the src mac address for our interface, put it
425 * into the user-supplied buffer. return 0 if ok, -1 on error.
426 */
427static int
428source_hwaddr(const char *ifname, char *buf)
429{
430	struct ifaddrs *ifaphead, *ifap;
431	int l = sizeof(ifap->ifa_name);
432
433	if (getifaddrs(&ifaphead) != 0) {
434		D("getifaddrs %s failed", ifname);
435		return (-1);
436	}
437
438	for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) {
439		struct sockaddr_dl *sdl =
440			(struct sockaddr_dl *)ifap->ifa_addr;
441		uint8_t *mac;
442
443		if (!sdl || sdl->sdl_family != AF_LINK)
444			continue;
445		if (strncmp(ifap->ifa_name, ifname, l) != 0)
446			continue;
447		mac = (uint8_t *)LLADDR(sdl);
448		sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
449			mac[0], mac[1], mac[2],
450			mac[3], mac[4], mac[5]);
451		if (verbose)
452			D("source hwaddr %s", buf);
453		break;
454	}
455	freeifaddrs(ifaphead);
456	return ifap ? 0 : -1;
457}
458
459
460/* set the thread affinity. */
461static int
462setaffinity(pthread_t me, int i)
463{
464	cpuset_t cpumask;
465
466	if (i == -1)
467		return 0;
468
469	/* Set thread affinity. */
470	CPU_ZERO(&cpumask);
471	CPU_SET(i, &cpumask);
472
473	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
474		D("Unable to set affinity: %s", strerror(errno));
475		return 1;
476	}
477	return 0;
478}
479
480/* Compute the Internet checksum of a buffer, folding it into the initial sum. */
481static uint16_t
482checksum(const void *data, uint16_t len, uint32_t sum)
483{
484        const uint8_t *addr = data;
485	uint32_t i;
486
487        /* Checksum all the pairs of bytes first... */
488        for (i = 0; i < (len & ~1U); i += 2) {
489                sum += (u_int16_t)ntohs(*((u_int16_t *)(addr + i)));
490                if (sum > 0xFFFF)
491                        sum -= 0xFFFF;
492        }
493	/*
494	 * If there's a single byte left over, checksum it, too.
495	 * Network byte order is big-endian, so the remaining byte is
496	 * the high byte.
497	 */
498	if (i < len) {
499		sum += addr[i] << 8;
500		if (sum > 0xFFFF)
501			sum -= 0xFFFF;
502	}
503	return sum;
504}
505
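/* complement the accumulated checksum and return it in network byte order */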
506static u_int16_t
507wrapsum(u_int32_t sum)
508{
509	sum = ~sum & 0xFFFF;
510	return (htons(sum));
511}
512
513/* Dump the payload of a packet (use it for debug):
514 * print the slot metadata followed by a hex/ASCII dump of the buffer.
515 */
516static void
517dump_payload(char *p, int len, struct netmap_ring *ring, int cur)
518{
519	char buf[128];
520	int i, j, i0;
521
522	/* get the length in ASCII of the length of the packet. */
523
524	printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n",
525		ring, cur, ring->slot[cur].buf_idx,
526		ring->slot[cur].flags, len);
527	/* hexdump routine */
528	for (i = 0; i < len; ) {
529		memset(buf, ' ', sizeof(buf));
530		sprintf(buf, "%5d: ", i);
531		i0 = i;
532		for (j=0; j < 16 && i < len; i++, j++)
533			sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i]));
534		i = i0;
535		for (j=0; j < 16 && i < len; i++, j++)
536			sprintf(buf+7+j + 48, "%c",
537				isprint(p[i]) ? p[i] : '.');
538		printf("%s\n", buf);
539	}
540}
541
542/*
543 * Fill a packet with some payload.
544 * We create a UDP packet so the payload starts at
545 *	14+20+8 = 42 bytes.
546 */
547#ifdef __linux__
548#define uh_sport source
549#define uh_dport dest
550#define uh_ulen len
551#define uh_sum check
552#endif /* linux */
553
554/*
555 * increment the addresses in the packet,
556 * starting from the least significant field.
557 *	DST_IP DST_PORT SRC_IP SRC_PORT
558 */
559static void
560update_addresses(struct pkt *pkt, struct glob_arg *g)
561{
562	uint32_t a;
563	uint16_t p;
564	struct ip *ip = &pkt->ip;
565	struct udphdr *udp = &pkt->udp;
566
567    do {
568	p = ntohs(udp->uh_sport);
569	if (p < g->src_ip.port1) { /* just inc, no wrap */
570		udp->uh_sport = htons(p + 1);
571		break;
572	}
573	udp->uh_sport = htons(g->src_ip.port0);
574
575	a = ntohl(ip->ip_src.s_addr);
576	if (a < g->src_ip.end) { /* just inc, no wrap */
577		ip->ip_src.s_addr = htonl(a + 1);
578		break;
579	}
580	ip->ip_src.s_addr = htonl(g->src_ip.start);
581
582	udp->uh_sport = htons(g->src_ip.port0);
583	p = ntohs(udp->uh_dport);
584	if (p < g->dst_ip.port1) { /* just inc, no wrap */
585		udp->uh_dport = htons(p + 1);
586		break;
587	}
588	udp->uh_dport = htons(g->dst_ip.port0);
589
590	a = ntohl(ip->ip_dst.s_addr);
591	if (a < g->dst_ip.end) { /* just inc, no wrap */
592		ip->ip_dst.s_addr = htonl(a + 1);
593		break;
594	}
595	ip->ip_dst.s_addr = htonl(g->dst_ip.start);
596    } while (0);
597    // update checksum
598}
599
600/*
601 * initialize one packet and prepare for the next one.
602 * The copy could be done better instead of repeating it each time.
603 */
604static void
605initialize_packet(struct targ *targ)
606{
607	struct pkt *pkt = &targ->pkt;
608	struct ether_header *eh;
609	struct ip *ip;
610	struct udphdr *udp;
611	uint16_t paylen = targ->g->pkt_size - sizeof(*eh) - sizeof(struct ip);
612	const char *payload = targ->g->options & OPT_INDIRECT ?
613		indirect_payload : default_payload;
614	int i, l0 = strlen(payload);
615
616	/* create a nice NUL-terminated string */
617	for (i = 0; i < paylen; i += l0) {
618		if (l0 > paylen - i)
619			l0 = paylen - i; // last round
620		bcopy(payload, pkt->body + i, l0);
621	}
622	pkt->body[i-1] = '\0';
623	ip = &pkt->ip;
624
625	/* prepare the headers */
626        ip->ip_v = IPVERSION;
627        ip->ip_hl = 5;
628        ip->ip_id = 0;
629        ip->ip_tos = IPTOS_LOWDELAY;
630	ip->ip_len = htons(targ->g->pkt_size - sizeof(*eh));
631        ip->ip_id = 0;
632        ip->ip_off = htons(IP_DF); /* Don't fragment */
633        ip->ip_ttl = IPDEFTTL;
634	ip->ip_p = IPPROTO_UDP;
635	ip->ip_dst.s_addr = htonl(targ->g->dst_ip.start);
636	ip->ip_src.s_addr = htonl(targ->g->src_ip.start);
637	ip->ip_sum = wrapsum(checksum(ip, sizeof(*ip), 0));
638
639
640	udp = &pkt->udp;
641        udp->uh_sport = htons(targ->g->src_ip.port0);
642        udp->uh_dport = htons(targ->g->dst_ip.port0);
643	udp->uh_ulen = htons(paylen);
644	/* Magic: taken from sbin/dhclient/packet.c */
645	udp->uh_sum = wrapsum(checksum(udp, sizeof(*udp),
646                    checksum(pkt->body,
647                        paylen - sizeof(*udp),
648                        checksum(&ip->ip_src, 2 * sizeof(ip->ip_src),
649                            IPPROTO_UDP + (u_int32_t)ntohs(udp->uh_ulen)
650                        )
651                    )
652                ));
653
654	eh = &pkt->eh;
655	bcopy(&targ->g->src_mac.start, eh->ether_shost, 6);
656	bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6);
657	eh->ether_type = htons(ETHERTYPE_IP);
658
659	bzero(&pkt->vh, sizeof(pkt->vh));
660#ifdef TRASH_VHOST_HDR
661	/* set bogus content */
662	pkt->vh.fields[0] = 0xff;
663	pkt->vh.fields[1] = 0xff;
664	pkt->vh.fields[2] = 0xff;
665	pkt->vh.fields[3] = 0xff;
666	pkt->vh.fields[4] = 0xff;
667	pkt->vh.fields[5] = 0xff;
668#endif /* TRASH_VHOST_HDR */
669	// dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0);
670}
671
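/* configure the netmap port to use a virtio-net header of length g->virt_header (no-op if 0) */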
672static void
673set_vnet_hdr_len(struct targ *t)
674{
675	int err, l = t->g->virt_header;
676	struct nmreq req;
677
678	if (l == 0)
679		return;
680
681	memset(&req, 0, sizeof(req));
682	bcopy(t->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name));
683	req.nr_version = NETMAP_API;
684	req.nr_cmd = NETMAP_BDG_VNET_HDR;
685	req.nr_arg1 = l;
686	err = ioctl(t->fd, NIOCREGIF, &req);
687	if (err) {
688		D("Unable to set vnet header length %d", l);
689	}
690}
691
692
693/*
694 * create and enqueue a batch of packets on a ring.
695 * On the last one set NS_REPORT to tell the driver to generate
696 * an interrupt when done.
697 */
698static int
699send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
700		int size, struct glob_arg *g, u_int count, int options,
701		u_int nfrags)
702{
703	u_int n, sent, cur = ring->cur;
704	u_int fcnt;
705
706	n = nm_ring_space(ring);
707	if (n < count)
708		count = n;
709	if (count < nfrags) {
710		D("truncating packet, no room for frags %d %d",
711				count, nfrags);
712	}
713#if 0
714	if (options & (OPT_COPY | OPT_PREFETCH) ) {
715		for (sent = 0; sent < count; sent++) {
716			struct netmap_slot *slot = &ring->slot[cur];
717			char *p = NETMAP_BUF(ring, slot->buf_idx);
718
719			__builtin_prefetch(p);
720			cur = nm_ring_next(ring, cur);
721		}
722		cur = ring->cur;
723	}
724#endif
725	for (fcnt = nfrags, sent = 0; sent < count; sent++) {
726		struct netmap_slot *slot = &ring->slot[cur];
727		char *p = NETMAP_BUF(ring, slot->buf_idx);
728
729		slot->flags = 0;
730		if (options & OPT_INDIRECT) {
731			slot->flags |= NS_INDIRECT;
732			slot->ptr = (uint64_t)frame;
733		} else if (options & OPT_COPY) {
734			nm_pkt_copy(frame, p, size);
735			if (fcnt == nfrags)
736				update_addresses(pkt, g);
737		} else if (options & OPT_MEMCPY) {
738			memcpy(p, frame, size);
739			if (fcnt == nfrags)
740				update_addresses(pkt, g);
741		} else if (options & OPT_PREFETCH) {
742			__builtin_prefetch(p);
743		}
744		if (options & OPT_DUMP)
745			dump_payload(p, size, ring, cur);
746		slot->len = size;
747		if (--fcnt > 0)
748			slot->flags |= NS_MOREFRAG;
749		else
750			fcnt = nfrags;
751		if (sent == count - 1) {
752			slot->flags &= ~NS_MOREFRAG;
753			slot->flags |= NS_REPORT;
754		}
755		cur = nm_ring_next(ring, cur);
756	}
757	ring->head = ring->cur = cur;
758
759	return (sent);
760}
761
762/*
763 * Send a packet, and wait for a response.
764 * The payload (after UDP header, ofs 42) has a 4-byte sequence
765 * followed by a struct tstamp (sec and nsec, see above)
766 */
767#define	PAY_OFS	42	/* where in the pkt... */
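/* the pinger below stores a 32-bit sequence number at offset 42 and a struct tstamp at offset 46 */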
768
769static void *
770pinger_body(void *data)
771{
772	struct targ *targ = (struct targ *) data;
773	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
774	struct netmap_if *nifp = targ->nmd->nifp;
775	int i, rx = 0, n = targ->g->npackets;
776	void *frame;
777	int size;
778	uint32_t sent = 0;
779	struct timespec ts, now, last_print;
780	uint32_t count = 0, min = 1000000000, av = 0;
781
782	frame = &targ->pkt;
783	frame += sizeof(targ->pkt.vh) - targ->g->virt_header;
784	size = targ->g->pkt_size + targ->g->virt_header;
785
786
787	if (targ->g->nthreads > 1) {
788		D("can only ping with 1 thread");
789		return NULL;
790	}
791
792	clock_gettime(CLOCK_REALTIME_PRECISE, &last_print);
793	now = last_print;
794	while (n == 0 || (int)sent < n) {
795		struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
796		struct netmap_slot *slot;
797		char *p;
798	    for (i = 0; i < 1; i++) { /* XXX why the loop for 1 pkt ? */
799		slot = &ring->slot[ring->cur];
800		slot->len = size;
801		p = NETMAP_BUF(ring, slot->buf_idx);
802
803		if (nm_ring_empty(ring)) {
804			D("-- ouch, cannot send");
805		} else {
806			struct tstamp *tp;
807			nm_pkt_copy(frame, p, size);
808			clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
809			bcopy(&sent, p+42, sizeof(sent));
810			tp = (struct tstamp *)(p+46);
811			tp->sec = (uint32_t)ts.tv_sec;
812			tp->nsec = (uint32_t)ts.tv_nsec;
813			sent++;
814			ring->head = ring->cur = nm_ring_next(ring, ring->cur);
815		}
816	    }
817		/* should use a parameter to decide how often to send */
818		if (poll(&pfd, 1, 3000) <= 0) {
819			D("poll error/timeout on queue %d: %s", targ->me,
820				strerror(errno));
821			continue;
822		}
823		/* see what we got back */
824		for (i = targ->nmd->first_rx_ring;
825			i <= targ->nmd->last_rx_ring; i++) {
826			ring = NETMAP_RXRING(nifp, i);
827			while (!nm_ring_empty(ring)) {
828				uint32_t seq;
829				struct tstamp *tp;
830				slot = &ring->slot[ring->cur];
831				p = NETMAP_BUF(ring, slot->buf_idx);
832
833				clock_gettime(CLOCK_REALTIME_PRECISE, &now);
834				bcopy(p+42, &seq, sizeof(seq));
835				tp = (struct tstamp *)(p+46);
836				ts.tv_sec = (time_t)tp->sec;
837				ts.tv_nsec = (long)tp->nsec;
838				ts.tv_sec = now.tv_sec - ts.tv_sec;
839				ts.tv_nsec = now.tv_nsec - ts.tv_nsec;
840				if (ts.tv_nsec < 0) {
841					ts.tv_nsec += 1000000000;
842					ts.tv_sec--;
843				}
844				if (1) D("seq %d/%d delta %d.%09d", seq, sent,
845					(int)ts.tv_sec, (int)ts.tv_nsec);
846				if (ts.tv_nsec < (int)min)
847					min = ts.tv_nsec;
848				count ++;
849				av += ts.tv_nsec;
850				ring->head = ring->cur = nm_ring_next(ring, ring->cur);
851				rx++;
852			}
853		}
854		//D("tx %d rx %d", sent, rx);
855		//usleep(100000);
856		ts.tv_sec = now.tv_sec - last_print.tv_sec;
857		ts.tv_nsec = now.tv_nsec - last_print.tv_nsec;
858		if (ts.tv_nsec < 0) {
859			ts.tv_nsec += 1000000000;
860			ts.tv_sec--;
861		}
862		if (ts.tv_sec >= 1) {
863			D("count %d min %d av %d",
864				count, min, av/count);
865			count = 0;
866			av = 0;
867			min = 1000000000;
868			last_print = now;
869		}
870	}
871	return NULL;
872}
873
874
875/*
876 * reply to ping requests
877 */
878static void *
879ponger_body(void *data)
880{
881	struct targ *targ = (struct targ *) data;
882	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
883	struct netmap_if *nifp = targ->nmd->nifp;
884	struct netmap_ring *txring, *rxring;
885	int i, rx = 0, sent = 0, n = targ->g->npackets;
886
887	if (targ->g->nthreads > 1) {
888		D("can only reply ping with 1 thread");
889		return NULL;
890	}
891	D("understood ponger %d but don't know how to do it", n);
892	while (n == 0 || sent < n) {
893		uint32_t txcur, txavail;
894//#define BUSYWAIT
895#ifdef BUSYWAIT
896		ioctl(pfd.fd, NIOCRXSYNC, NULL);
897#else
898		if (poll(&pfd, 1, 1000) <= 0) {
899			D("poll error/timeout on queue %d: %s", targ->me,
900				strerror(errno));
901			continue;
902		}
903#endif
904		txring = NETMAP_TXRING(nifp, 0);
905		txcur = txring->cur;
906		txavail = nm_ring_space(txring);
907		/* see what we got back */
908		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
909			rxring = NETMAP_RXRING(nifp, i);
910			while (!nm_ring_empty(rxring)) {
911				uint16_t *spkt, *dpkt;
912				uint32_t cur = rxring->cur;
913				struct netmap_slot *slot = &rxring->slot[cur];
914				char *src, *dst;
915				src = NETMAP_BUF(rxring, slot->buf_idx);
916				//D("got pkt %p of size %d", src, slot->len);
917				rxring->head = rxring->cur = nm_ring_next(rxring, cur);
918				rx++;
919				if (txavail == 0)
920					continue;
921				dst = NETMAP_BUF(txring,
922				    txring->slot[txcur].buf_idx);
923				/* copy... */
924				dpkt = (uint16_t *)dst;
925				spkt = (uint16_t *)src;
926				nm_pkt_copy(src, dst, slot->len);
927				dpkt[0] = spkt[3];
928				dpkt[1] = spkt[4];
929				dpkt[2] = spkt[5];
930				dpkt[3] = spkt[0];
931				dpkt[4] = spkt[1];
932				dpkt[5] = spkt[2];
933				txring->slot[txcur].len = slot->len;
934				/* XXX swap src dst mac */
935				txcur = nm_ring_next(txring, txcur);
936				txavail--;
937				sent++;
938			}
939		}
940		txring->head = txring->cur = txcur;
941		targ->count = sent;
942#ifdef BUSYWAIT
943		ioctl(pfd.fd, NIOCTXSYNC, NULL);
944#endif
945		//D("tx %d rx %d", sent, rx);
946	}
947	return NULL;
948}
949
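/* small helpers to compare, convert and add/subtract timespec and timeval values */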
950static __inline int
951timespec_ge(const struct timespec *a, const struct timespec *b)
952{
953
954	if (a->tv_sec > b->tv_sec)
955		return (1);
956	if (a->tv_sec < b->tv_sec)
957		return (0);
958	if (a->tv_nsec >= b->tv_nsec)
959		return (1);
960	return (0);
961}
962
963static __inline struct timespec
964timeval2spec(const struct timeval *a)
965{
966	struct timespec ts = {
967		.tv_sec = a->tv_sec,
968		.tv_nsec = a->tv_usec * 1000
969	};
970	return ts;
971}
972
973static __inline struct timeval
974timespec2val(const struct timespec *a)
975{
976	struct timeval tv = {
977		.tv_sec = a->tv_sec,
978		.tv_usec = a->tv_nsec / 1000
979	};
980	return tv;
981}
982
983
984static __inline struct timespec
985timespec_add(struct timespec a, struct timespec b)
986{
987	struct timespec ret = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec };
988	if (ret.tv_nsec >= 1000000000) {
989		ret.tv_sec++;
990		ret.tv_nsec -= 1000000000;
991	}
992	return ret;
993}
994
995static __inline struct timespec
996timespec_sub(struct timespec a, struct timespec b)
997{
998	struct timespec ret = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec };
999	if (ret.tv_nsec < 0) {
1000		ret.tv_sec--;
1001		ret.tv_nsec += 1000000000;
1002	}
1003	return ret;
1004}
1005
1006
1007/*
1008 * wait until ts, either busy or sleeping if more than 1ms.
1009 * Return wakeup time.
1010 */
1011static struct timespec
1012wait_time(struct timespec ts)
1013{
1014	for (;;) {
1015		struct timespec w, cur;
1016		clock_gettime(CLOCK_REALTIME_PRECISE, &cur);
1017		w = timespec_sub(ts, cur);
1018		if (w.tv_sec < 0)
1019			return cur;
1020		else if (w.tv_sec > 0 || w.tv_nsec > 1000000)
1021			poll(NULL, 0, 1);
1022	}
1023}
1024
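/* body of a transmitter thread: generate packets and push them out, optionally rate limited */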
1025static void *
1026sender_body(void *data)
1027{
1028	struct targ *targ = (struct targ *) data;
1029	struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
1030	struct netmap_if *nifp;
1031	struct netmap_ring *txring;
1032	int i, n = targ->g->npackets / targ->g->nthreads;
1033	int64_t sent = 0;
1034	int options = targ->g->options | OPT_COPY;
1035	struct timespec nexttime = { 0, 0}; // XXX silence compiler
1036	int rate_limit = targ->g->tx_rate;
1037	struct pkt *pkt = &targ->pkt;
1038	void *frame;
1039	int size;
1040
1041	frame = pkt;
1042	frame += sizeof(pkt->vh) - targ->g->virt_header;
1043	size = targ->g->pkt_size + targ->g->virt_header;
1044
1045	D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
1046	if (setaffinity(targ->thread, targ->affinity))
1047		goto quit;
1048
1049	/* main loop.*/
1050	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1051	if (rate_limit) {
1052		targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
1053		targ->tic.tv_nsec = 0;
1054		wait_time(targ->tic);
1055		nexttime = targ->tic;
1056	}
1057        if (targ->g->dev_type == DEV_TAP) {
1058	    D("writing to file desc %d", targ->g->main_fd);
1059
1060	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1061		if (write(targ->g->main_fd, frame, size) != -1)
1062			sent++;
1063		update_addresses(pkt, targ->g);
1064		if (i > 10000) {
1065			targ->count = sent;
1066			i = 0;
1067		}
1068	    }
1069#ifndef NO_PCAP
1070    } else if (targ->g->dev_type == DEV_PCAP) {
1071	    pcap_t *p = targ->g->p;
1072
1073	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1074		if (pcap_inject(p, frame, size) != -1)
1075			sent++;
1076		update_addresses(pkt, targ->g);
1077		if (i > 10000) {
1078			targ->count = sent;
1079			i = 0;
1080		}
1081	    }
1082#endif /* NO_PCAP */
1083    } else {
1084	int tosend = 0;
1085	int frags = targ->g->frags;
1086
1087        nifp = targ->nmd->nifp;
1088	while (!targ->cancel && (n == 0 || sent < n)) {
1089
1090		if (rate_limit && tosend <= 0) {
1091			tosend = targ->g->burst;
1092			nexttime = timespec_add(nexttime, targ->g->tx_period);
1093			wait_time(nexttime);
1094		}
1095
1096		/*
1097		 * wait for available room in the send queue(s)
1098		 */
1099		if (poll(&pfd, 1, 2000) <= 0) {
1100			if (targ->cancel)
1101				break;
1102			D("poll error/timeout on queue %d: %s", targ->me,
1103				strerror(errno));
1104			// goto quit;
1105		}
1106		if (pfd.revents & POLLERR) {
1107			D("poll error");
1108			goto quit;
1109		}
1110		/*
1111		 * scan our queues and send on those with room
1112		 */
1113		if (options & OPT_COPY && sent > 100000 && !(targ->g->options & OPT_COPY) ) {
1114			D("drop copy");
1115			options &= ~OPT_COPY;
1116		}
1117		for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1118			int m, limit = rate_limit ?  tosend : targ->g->burst;
1119			if (n > 0 && n - sent < limit)
1120				limit = n - sent;
1121			txring = NETMAP_TXRING(nifp, i);
1122			if (nm_ring_empty(txring))
1123				continue;
1124			if (frags > 1)
1125				limit = ((limit + frags - 1) / frags) * frags;
1126
1127			m = send_packets(txring, pkt, frame, size, targ->g,
1128					 limit, options, frags);
1129			ND("limit %d tail %d frags %d m %d",
1130				limit, txring->tail, frags, m);
1131			sent += m;
1132			targ->count = sent;
1133			if (rate_limit) {
1134				tosend -= m;
1135				if (tosend <= 0)
1136					break;
1137			}
1138		}
1139	}
1140	/* flush any remaining packets */
1141	D("flush tail %d head %d on thread %p",
1142		txring->tail, txring->head,
1143		pthread_self());
1144	ioctl(pfd.fd, NIOCTXSYNC, NULL);
1145
1146	/* final part: wait for all the TX queues to be empty. */
1147	for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1148		txring = NETMAP_TXRING(nifp, i);
1149		while (nm_tx_pending(txring)) {
1150			RD(5, "pending tx tail %d head %d on ring %d",
1151				txring->tail, txring->head, i);
1152			ioctl(pfd.fd, NIOCTXSYNC, NULL);
1153			usleep(1); /* wait 1 tick */
1154		}
1155	}
1156    } /* end DEV_NETMAP */
1157
1158	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1159	targ->completed = 1;
1160	targ->count = sent;
1161
1162quit:
1163	/* reset the ``used`` flag. */
1164	targ->used = 0;
1165
1166	return (NULL);
1167}
1168
1169
1170#ifndef NO_PCAP
1171static void
1172receive_pcap(u_char *user, const struct pcap_pkthdr * h,
1173	const u_char * bytes)
1174{
1175	int *count = (int *)user;
1176	(void)h;	/* UNUSED */
1177	(void)bytes;	/* UNUSED */
1178	(*count)++;
1179}
1180#endif /* !NO_PCAP */
1181
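/* drain up to 'limit' packets from a receive ring, optionally dumping each one */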
1182static int
1183receive_packets(struct netmap_ring *ring, u_int limit, int dump)
1184{
1185	u_int cur, rx, n;
1186
1187	cur = ring->cur;
1188	n = nm_ring_space(ring);
1189	if (n < limit)
1190		limit = n;
1191	for (rx = 0; rx < limit; rx++) {
1192		struct netmap_slot *slot = &ring->slot[cur];
1193		char *p = NETMAP_BUF(ring, slot->buf_idx);
1194
1195		if (dump)
1196			dump_payload(p, slot->len, ring, cur);
1197
1198		cur = nm_ring_next(ring, cur);
1199	}
1200	ring->head = ring->cur = cur;
1201
1202	return (rx);
1203}
1204
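/* body of a receiver thread: count incoming packets, optionally dumping them */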
1205static void *
1206receiver_body(void *data)
1207{
1208	struct targ *targ = (struct targ *) data;
1209	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1210	struct netmap_if *nifp;
1211	struct netmap_ring *rxring;
1212	int i;
1213	uint64_t received = 0;
1214
1215	if (setaffinity(targ->thread, targ->affinity))
1216		goto quit;
1217
1218	D("reading from %s fd %d main_fd %d",
1219		targ->g->ifname, targ->fd, targ->g->main_fd);
1220	/* unbounded wait for the first packet. */
1221	for (;!targ->cancel;) {
1222		i = poll(&pfd, 1, 1000);
1223		if (i > 0 && !(pfd.revents & POLLERR))
1224			break;
1225		RD(1, "waiting for initial packets, poll returns %d %d",
1226			i, pfd.revents);
1227	}
1228	/* main loop, exit after 1s silence */
1229	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1230    if (targ->g->dev_type == DEV_TAP) {
1231	while (!targ->cancel) {
1232		char buf[MAX_BODYSIZE];
1233		/* XXX should we poll ? */
1234		if (read(targ->g->main_fd, buf, sizeof(buf)) > 0)
1235			targ->count++;
1236	}
1237#ifndef NO_PCAP
1238    } else if (targ->g->dev_type == DEV_PCAP) {
1239	while (!targ->cancel) {
1240		/* XXX should we poll ? */
1241		pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap,
1242			(u_char *)&targ->count);
1243	}
1244#endif /* !NO_PCAP */
1245    } else {
1246	int dump = targ->g->options & OPT_DUMP;
1247
1248        nifp = targ->nmd->nifp;
1249	while (!targ->cancel) {
1250		/* Once we started to receive packets, wait at most 1 second
1251		   before quitting. */
1252		if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
1253			clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1254			targ->toc.tv_sec -= 1; /* Subtract timeout time. */
1255			goto out;
1256		}
1257
1258		if (pfd.revents & POLLERR) {
1259			D("poll err");
1260			goto quit;
1261		}
1262
1263		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
1264			int m;
1265
1266			rxring = NETMAP_RXRING(nifp, i);
1267			if (nm_ring_empty(rxring))
1268				continue;
1269
1270			m = receive_packets(rxring, targ->g->burst, dump);
1271			received += m;
1272		}
1273		targ->count = received;
1274	}
1275    }
1276
1277	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1278
1279out:
1280	targ->completed = 1;
1281	targ->count = received;
1282
1283quit:
1284	/* reset the ``used`` flag. */
1285	targ->used = 0;
1286
1287	return (NULL);
1288}
1289
1290/* very crude code to print a number in normalized form.
1291 * Caller has to make sure that the buffer is large enough.
1292 */
1293static const char *
1294norm(char *buf, double val)
1295{
1296	char *units[] = { "", "K", "M", "G", "T" };
1297	u_int i;
1298
1299	for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *) - 1; i++)
1300		val /= 1000;
1301	sprintf(buf, "%.2f %s", val, units[i]);
1302	return buf;
1303}
1304
1305static void
1306tx_output(uint64_t sent, int size, double delta)
1307{
1308	double bw, raw_bw, pps;
1309	char b1[40], b2[80], b3[80];
1310
1311	printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n",
1312	       (unsigned long long)sent, size, delta);
1313	if (delta == 0)
1314		delta = 1e-6;
1315	if (size < 60)		/* correct for min packet size */
1316		size = 60;
1317	pps = sent / delta;
1318	bw = (8.0 * size * sent) / delta;
1319	/* raw packets have 4 bytes CRC + 20 bytes framing */
1320	raw_bw = (8.0 * (size + 24) * sent) / delta;
1321
1322	printf("Speed: %spps Bandwidth: %sbps (raw %sbps)\n",
1323		norm(b1, pps), norm(b2, bw), norm(b3, raw_bw) );
1324}
1325
1326
1327static void
1328rx_output(uint64_t received, double delta)
1329{
1330	double pps;
1331	char b1[40];
1332
1333	printf("Received %llu packets, in %.2f seconds.\n",
1334		(unsigned long long) received, delta);
1335
1336	if (delta == 0)
1337		delta = 1e-6;
1338	pps = received / delta;
1339	printf("Speed: %spps\n", norm(b1, pps));
1340}
1341
1342static void
1343usage(void)
1344{
1345	const char *cmd = "pkt-gen";
1346	fprintf(stderr,
1347		"Usage:\n"
1348		"%s arguments\n"
1349		"\t-i interface		interface name\n"
1350		"\t-f function		tx rx ping pong\n"
1351		"\t-n count		number of iterations (can be 0)\n"
1352		"\t-t pkts_to_send		also forces tx mode\n"
1353		"\t-r pkts_to_receive	also forces rx mode\n"
1354		"\t-l pkt_size		in bytes excluding CRC\n"
1355		"\t-d dst_ip[:port[-dst_ip:port]]   single or range\n"
1356		"\t-s src_ip[:port[-src_ip:port]]   single or range\n"
1357		"\t-D dst-mac\n"
1358		"\t-S src-mac\n"
1359		"\t-a cpu_id		use setaffinity\n"
1360		"\t-b burst size		testing, mostly\n"
1361		"\t-c cores		cores to use\n"
1362		"\t-p threads		processes/threads to use\n"
1363		"\t-T report_ms		milliseconds between reports\n"
1364		"\t-P			use libpcap instead of netmap\n"
1365		"\t-w wait_for_link_time	in seconds\n"
1366		"\t-R rate		in packets per second\n"
1367		"\t-X			dump payload\n"
1368		"\t-H len		add empty virtio-net-header with size 'len'\n"
1369		"",
1370		cmd);
1371
1372	exit(0);
1373}
1374
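/* create the worker threads; with netmap and multiple threads, each one opens its own ring (NR_REG_ONE_NIC) */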
1375static void
1376start_threads(struct glob_arg *g)
1377{
1378	int i;
1379
1380	targs = calloc(g->nthreads, sizeof(*targs));
1381	/*
1382	 * Now create the desired number of threads, each one
1383	 * using a single descriptor.
1384 	 */
1385	for (i = 0; i < g->nthreads; i++) {
1386		struct targ *t = &targs[i];
1387
1388		bzero(t, sizeof(*t));
1389		t->fd = -1; /* default, with pcap */
1390		t->g = g;
1391
1392	    if (g->dev_type == DEV_NETMAP) {
1393		struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */
1394		uint64_t nmd_flags = 0;
1395		nmd.self = &nmd;
1396
1397		if (g->nthreads > 1) {
1398			if (nmd.req.nr_flags != NR_REG_ALL_NIC) {
1399				D("invalid nthreads mode %d", nmd.req.nr_flags);
1400				continue;
1401			}
1402			nmd.req.nr_flags = NR_REG_ONE_NIC;
1403			nmd.req.nr_ringid = i;
1404		}
1405		/* Only touch one of the rings (rx is already ok) */
1406		if (g->td_body == receiver_body)
1407			nmd_flags |= NETMAP_NO_TX_POLL;
1408
1409		/* register interface. Override ifname and ringid etc. */
1410		if (g->options & OPT_MONITOR_TX)
1411			nmd.req.nr_flags |= NR_MONITOR_TX;
1412		if (g->options & OPT_MONITOR_RX)
1413			nmd.req.nr_flags |= NR_MONITOR_RX;
1414
1415		t->nmd = nm_open(t->g->ifname, NULL, nmd_flags |
1416			NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd);
1417		if (t->nmd == NULL) {
1418			D("Unable to open %s: %s",
1419				t->g->ifname, strerror(errno));
1420			continue;
1421		}
1422		t->fd = t->nmd->fd;
1423		set_vnet_hdr_len(t);
1424
1425	    } else {
1426		targs[i].fd = g->main_fd;
1427	    }
1428		t->used = 1;
1429		t->me = i;
1430		if (g->affinity >= 0) {
1431			if (g->affinity < g->cpus)
1432				t->affinity = g->affinity;
1433			else
1434				t->affinity = i % g->cpus;
1435		} else {
1436			t->affinity = -1;
1437		}
1438		/* default, init packets */
1439		initialize_packet(t);
1440
1441		if (pthread_create(&t->thread, NULL, g->td_body, t) != 0) {
1442			D("Unable to create thread %d: %s", i, strerror(errno));
1443			t->used = 0;
1444		}
1445	}
1446}
1447
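/* main loop: print the aggregate rate every report_interval, then join the threads and report totals */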
1448static void
1449main_thread(struct glob_arg *g)
1450{
1451	int i;
1452
1453	uint64_t prev = 0;
1454	uint64_t count = 0;
1455	double delta_t;
1456	struct timeval tic, toc;
1457
1458	gettimeofday(&toc, NULL);
1459	for (;;) {
1460		struct timeval now, delta;
1461		uint64_t pps, usec, my_count, npkts;
1462		int done = 0;
1463
1464		delta.tv_sec = g->report_interval/1000;
1465		delta.tv_usec = (g->report_interval%1000)*1000;
1466		select(0, NULL, NULL, NULL, &delta);
1467		gettimeofday(&now, NULL);
1468		timersub(&now, &toc, &toc);
1469		my_count = 0;
1470		for (i = 0; i < g->nthreads; i++) {
1471			my_count += targs[i].count;
1472			if (targs[i].used == 0)
1473				done++;
1474		}
1475		usec = toc.tv_sec* 1000000 + toc.tv_usec;
1476		if (usec < 10000)
1477			continue;
1478		npkts = my_count - prev;
1479		pps = (npkts*1000000 + usec/2) / usec;
1480		D("%llu pps (%llu pkts in %llu usec)",
1481			(unsigned long long)pps,
1482			(unsigned long long)npkts,
1483			(unsigned long long)usec);
1484		prev = my_count;
1485		toc = now;
1486		if (done == g->nthreads)
1487			break;
1488	}
1489
1490	timerclear(&tic);
1491	timerclear(&toc);
1492	for (i = 0; i < g->nthreads; i++) {
1493		struct timespec t_tic, t_toc;
1494		/*
1495		 * Join active threads, unregister interfaces and close
1496		 * file descriptors.
1497		 */
1498		if (targs[i].used)
1499			pthread_join(targs[i].thread, NULL);
1500		close(targs[i].fd);
1501
1502		if (targs[i].completed == 0)
1503			D("ouch, thread %d exited with error", i);
1504
1505		/*
1506		 * Collect threads output and extract information about
1507		 * how long it took to send all the packets.
1508		 */
1509		count += targs[i].count;
1510		t_tic = timeval2spec(&tic);
1511		t_toc = timeval2spec(&toc);
1512		if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic))
1513			tic = timespec2val(&targs[i].tic);
1514		if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc))
1515			toc = timespec2val(&targs[i].toc);
1516	}
1517
1518	/* print output. */
1519	timersub(&toc, &tic, &toc);
1520	delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
1521	if (g->td_body == sender_body)
1522		tx_output(count, g->pkt_size, delta_t);
1523	else
1524		rx_output(count, delta_t);
1525
1526	if (g->dev_type == DEV_NETMAP) {
1527		munmap(g->nmd->mem, g->nmd->req.nr_memsize);
1528		close(g->main_fd);
1529	}
1530}
1531
1532
1533struct sf {
1534	char *key;
1535	void *f;
1536};
1537
1538static struct sf func[] = {
1539	{ "tx",	sender_body },
1540	{ "rx",	receiver_body },
1541	{ "ping",	pinger_body },
1542	{ "pong",	ponger_body },
1543	{ NULL, NULL }
1544};
1545
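/* open (and, on linux, create) a tap device; return its file descriptor */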
1546static int
1547tap_alloc(char *dev)
1548{
1549	struct ifreq ifr;
1550	int fd, err;
1551	char *clonedev = TAP_CLONEDEV;
1552
1553	(void)err;
1554	(void)dev;
1555	/* Arguments taken by the function:
1556	 *
1557	 * char *dev: the name of an interface (or '\0'). MUST have enough
1558	 *   space to hold the interface name if '\0' is passed
1559	 * int flags: interface flags (eg, IFF_TUN etc.)
1560	 */
1561
1562#ifdef __FreeBSD__
1563	if (dev[3]) { /* tapSomething */
1564		static char buf[128];
1565		snprintf(buf, sizeof(buf), "/dev/%s", dev);
1566		clonedev = buf;
1567	}
1568#endif
1569	/* open the device */
1570	if( (fd = open(clonedev, O_RDWR)) < 0 ) {
1571		return fd;
1572	}
1573	D("%s open successful", clonedev);
1574
1575	/* preparation of the struct ifr, of type "struct ifreq" */
1576	memset(&ifr, 0, sizeof(ifr));
1577
1578#ifdef linux
1579	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
1580
1581	if (*dev) {
1582		/* if a device name was specified, put it in the structure; otherwise,
1583		* the kernel will try to allocate the "next" device of the
1584		* specified type */
1585		strncpy(ifr.ifr_name, dev, IFNAMSIZ);
1586	}
1587
1588	/* try to create the device */
1589	if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) {
1590		D("failed to do a TUNSETIFF: %s", strerror(errno));
1591		close(fd);
1592		return err;
1593	}
1594
1595	/* if the operation was successful, write back the name of the
1596	* interface to the variable "dev", so the caller can know
1597	* it. Note that the caller MUST reserve space in *dev (see calling
1598	* code below) */
1599	strcpy(dev, ifr.ifr_name);
1600	D("new name is %s", dev);
1601#endif /* linux */
1602
1603        /* this is the special file descriptor that the caller will use to talk
1604         * with the virtual interface */
1605        return fd;
1606}
1607
1608int
1609main(int arc, char **argv)
1610{
1611	int i;
1612
1613	struct glob_arg g;
1614
1615	int ch;
1616	int wait_link = 2;
1617	int devqueues = 1;	/* how many device queues */
1618
1619	bzero(&g, sizeof(g));
1620
1621	g.main_fd = -1;
1622	g.td_body = receiver_body;
1623	g.report_interval = 1000;	/* report interval */
1624	g.affinity = -1;
1625	/* ip addresses can also be a range x.x.x.x-x.x.x.y */
1626	g.src_ip.name = "10.0.0.1";
1627	g.dst_ip.name = "10.1.0.1";
1628	g.dst_mac.name = "ff:ff:ff:ff:ff:ff";
1629	g.src_mac.name = NULL;
1630	g.pkt_size = 60;
1631	g.burst = 512;		// default
1632	g.nthreads = 1;
1633	g.cpus = 1;
1634	g.forever = 1;
1635	g.tx_rate = 0;
1636	g.frags = 1;
1637	g.nmr_config = "";
1638	g.virt_header = 0;
1639
1640	while ( (ch = getopt(arc, argv,
1641			"a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:m:")) != -1) {
1642		struct sf *fn;
1643
1644		switch(ch) {
1645		default:
1646			D("bad option %c %s", ch, optarg);
1647			usage();
1648			break;
1649
1650		case 'n':
1651			g.npackets = atoi(optarg);
1652			break;
1653
1654		case 'F':
1655			i = atoi(optarg);
1656			if (i < 1 || i > 63) {
1657				D("invalid frags %d [1..63], ignore", i);
1658				break;
1659			}
1660			g.frags = i;
1661			break;
1662
1663		case 'f':
1664			for (fn = func; fn->key; fn++) {
1665				if (!strcmp(fn->key, optarg))
1666					break;
1667			}
1668			if (fn->key)
1669				g.td_body = fn->f;
1670			else
1671				D("unrecognised function %s", optarg);
1672			break;
1673
1674		case 'o':	/* data generation options */
1675			g.options = atoi(optarg);
1676			break;
1677
1678		case 'a':       /* force affinity */
1679			g.affinity = atoi(optarg);
1680			break;
1681
1682		case 'i':	/* interface */
1683			/* a prefix of tap: netmap: or pcap: forces the mode.
1684			 * otherwise we guess
1685			 */
1686			D("interface is %s", optarg);
1687			if (strlen(optarg) > MAX_IFNAMELEN - 8) {
1688				D("ifname too long %s", optarg);
1689				break;
1690			}
1691			strcpy(g.ifname, optarg);
1692			if (!strcmp(optarg, "null")) {
1693				g.dev_type = DEV_NETMAP;
1694				g.dummy_send = 1;
1695			} else if (!strncmp(optarg, "tap:", 4)) {
1696				g.dev_type = DEV_TAP;
1697				strcpy(g.ifname, optarg + 4);
1698			} else if (!strncmp(optarg, "pcap:", 5)) {
1699				g.dev_type = DEV_PCAP;
1700				strcpy(g.ifname, optarg + 5);
1701			} else if (!strncmp(optarg, "netmap:", 7) ||
1702				   !strncmp(optarg, "vale", 4)) {
1703				g.dev_type = DEV_NETMAP;
1704			} else if (!strncmp(optarg, "tap", 3)) {
1705				g.dev_type = DEV_TAP;
1706			} else { /* prepend netmap: */
1707				g.dev_type = DEV_NETMAP;
1708				sprintf(g.ifname, "netmap:%s", optarg);
1709			}
1710			break;
1711
1712		case 'I':
1713			g.options |= OPT_INDIRECT;	/* XXX use indirect buffer */
1714			break;
1715
1716		case 'l':	/* pkt_size */
1717			g.pkt_size = atoi(optarg);
1718			break;
1719
1720		case 'd':
1721			g.dst_ip.name = optarg;
1722			break;
1723
1724		case 's':
1725			g.src_ip.name = optarg;
1726			break;
1727
1728		case 'T':	/* report interval */
1729			g.report_interval = atoi(optarg);
1730			break;
1731
1732		case 'w':
1733			wait_link = atoi(optarg);
1734			break;
1735
1736		case 'W': /* XXX changed default */
1737			g.forever = 0; /* exit rx after 1s of silence (running forever is the default) */
1738			break;
1739
1740		case 'b':	/* burst */
1741			g.burst = atoi(optarg);
1742			break;
1743		case 'c':
1744			g.cpus = atoi(optarg);
1745			break;
1746		case 'p':
1747			g.nthreads = atoi(optarg);
1748			break;
1749
1750		case 'D': /* destination mac */
1751			g.dst_mac.name = optarg;
1752			break;
1753
1754		case 'S': /* source mac */
1755			g.src_mac.name = optarg;
1756			break;
1757		case 'v':
1758			verbose++;
1759			break;
1760		case 'R':
1761			g.tx_rate = atoi(optarg);
1762			break;
1763		case 'X':
1764			g.options |= OPT_DUMP;
1765			break;
1766		case 'C':
1767			g.nmr_config = strdup(optarg);
1768			break;
1769		case 'H':
1770			g.virt_header = atoi(optarg);
1771			break;
1772		case 'e': /* extra bufs */
1773			g.extra_bufs = atoi(optarg);
1774			break;
1775		case 'm':
1776			if (strcmp(optarg, "tx") == 0) {
1777				g.options |= OPT_MONITOR_TX;
1778			} else if (strcmp(optarg, "rx") == 0) {
1779				g.options |= OPT_MONITOR_RX;
1780			} else {
1781				D("unrecognized monitor mode %s", optarg);
1782			}
1783			break;
1784		}
1785	}
1786
1787	if (g.ifname[0] == '\0') {
1788		D("missing ifname");
1789		usage();
1790	}
1791
1792	i = system_ncpus();
1793	if (g.cpus < 0 || g.cpus > i) {
1794		D("%d cpus is too high, have only %d cpus", g.cpus, i);
1795		usage();
1796	}
1797	if (g.cpus == 0)
1798		g.cpus = i;
1799
1800	if (g.pkt_size < 16 || g.pkt_size > MAX_PKTSIZE) {
1801		D("bad pktsize %d [16..%d]\n", g.pkt_size, MAX_PKTSIZE);
1802		usage();
1803	}
1804
1805	if (g.src_mac.name == NULL) {
1806		static char mybuf[20] = "00:00:00:00:00:00";
1807		/* retrieve source mac address. */
1808		if (source_hwaddr(g.ifname, mybuf) == -1) {
1809			D("Unable to retrieve source mac");
1810			// continue, fail later
1811		}
1812		g.src_mac.name = mybuf;
1813	}
1814	/* extract address ranges */
1815	extract_ip_range(&g.src_ip);
1816	extract_ip_range(&g.dst_ip);
1817	extract_mac_range(&g.src_mac);
1818	extract_mac_range(&g.dst_mac);
1819
1820	if (g.src_ip.start != g.src_ip.end ||
1821	    g.src_ip.port0 != g.src_ip.port1 ||
1822	    g.dst_ip.start != g.dst_ip.end ||
1823	    g.dst_ip.port0 != g.dst_ip.port1)
1824		g.options |= OPT_COPY;
1825
1826	if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1
1827			&& g.virt_header != VIRT_HDR_2) {
1828		D("bad virtio-net-header length");
1829		usage();
1830	}
1831
1832    if (g.dev_type == DEV_TAP) {
1833	D("want to use tap %s", g.ifname);
1834	g.main_fd = tap_alloc(g.ifname);
1835	if (g.main_fd < 0) {
1836		D("cannot open tap %s", g.ifname);
1837		usage();
1838	}
1839#ifndef NO_PCAP
1840    } else if (g.dev_type == DEV_PCAP) {
1841	char pcap_errbuf[PCAP_ERRBUF_SIZE];
1842
1843	pcap_errbuf[0] = '\0'; // init the buffer
1844	g.p = pcap_open_live(g.ifname, 256 /* XXX */, 1, 100, pcap_errbuf);
1845	if (g.p == NULL) {
1846		D("cannot open pcap on %s", g.ifname);
1847		usage();
1848	}
1849	g.main_fd = pcap_fileno(g.p);
1850	D("using pcap on %s fileno %d", g.ifname, g.main_fd);
1851#endif /* !NO_PCAP */
1852    } else if (g.dummy_send) { /* but DEV_NETMAP */
1853	D("using a dummy send routine");
1854    } else {
1855	struct nmreq base_nmd;
1856
1857	bzero(&base_nmd, sizeof(base_nmd));
1858
1859	parse_nmr_config(g.nmr_config, &base_nmd);
1860	if (g.extra_bufs) {
1861		base_nmd.nr_arg3 = g.extra_bufs;
1862	}
1863
1864	/*
1865	 * Open the netmap device using nm_open().
1866	 * Registering the interface detaches it from the host
1867	 * protocol stack and may cause a reset of the card,
1868	 * which in turn may take some time for the PHY to
1869	 * reconfigure. We do the open here to have time to reset.
1870	 */
1871	g.nmd = nm_open(g.ifname, &base_nmd, 0, NULL);
1872	if (g.nmd == NULL) {
1873		D("Unable to open %s: %s", g.ifname, strerror(errno));
1874		goto out;
1875	}
1876	g.main_fd = g.nmd->fd;
1877	D("mapped %dKB at %p", g.nmd->req.nr_memsize>>10, g.nmd->mem);
1878
1879	/* get num of queues in tx or rx */
1880	if (g.td_body == sender_body)
1881		devqueues = g.nmd->req.nr_tx_rings;
1882	else
1883		devqueues = g.nmd->req.nr_rx_rings;
1884
1885	/* validate provided nthreads. */
1886	if (g.nthreads < 1 || g.nthreads > devqueues) {
1887		D("bad nthreads %d, have %d queues", g.nthreads, devqueues);
1888		// continue, fail later
1889	}
1890
1891	if (verbose) {
1892		struct netmap_if *nifp = g.nmd->nifp;
1893		struct nmreq *req = &g.nmd->req;
1894
1895		D("nifp at offset %d, %d tx %d rx region %d",
1896		    req->nr_offset, req->nr_tx_rings, req->nr_rx_rings,
1897		    req->nr_arg2);
1898		for (i = 0; i <= req->nr_tx_rings; i++) {
1899			struct netmap_ring *ring = NETMAP_TXRING(nifp, i);
1900			D("   TX%d at 0x%lx slots %d", i,
1901			    (char *)ring - (char *)nifp, ring->num_slots);
1902		}
1903		for (i = 0; i <= req->nr_rx_rings; i++) {
1904			struct netmap_ring *ring = NETMAP_RXRING(nifp, i);
1905			D("   RX%d at 0x%lx slots %d", i,
1906			    (char *)ring - (char *)nifp, ring->num_slots);
1907		}
1908	}
1909
1910	/* Print some debug information. */
1911	fprintf(stdout,
1912		"%s %s: %d queues, %d threads and %d cpus.\n",
1913		(g.td_body == sender_body) ? "Sending on" : "Receiving from",
1914		g.ifname,
1915		devqueues,
1916		g.nthreads,
1917		g.cpus);
1918	if (g.td_body == sender_body) {
1919		fprintf(stdout, "%s -> %s (%s -> %s)\n",
1920			g.src_ip.name, g.dst_ip.name,
1921			g.src_mac.name, g.dst_mac.name);
1922	}
1923
1924out:
1925	/* Exit if something went wrong. */
1926	if (g.main_fd < 0) {
1927		D("aborting");
1928		usage();
1929	}
1930    }
1931
1932
1933	if (g.options) {
1934		D("--- SPECIAL OPTIONS:%s%s%s%s%s\n",
1935			g.options & OPT_PREFETCH ? " prefetch" : "",
1936			g.options & OPT_ACCESS ? " access" : "",
1937			g.options & OPT_MEMCPY ? " memcpy" : "",
1938			g.options & OPT_INDIRECT ? " indirect" : "",
1939			g.options & OPT_COPY ? " copy" : "");
1940	}
1941
1942	g.tx_period.tv_sec = g.tx_period.tv_nsec = 0;
1943	if (g.tx_rate > 0) {
1944		/* try to have at least something every second,
1945		 * reducing the burst size to a few milliseconds worth of data
1946		 * (but no less than one full set of fragments)
1947	 	 */
1948		uint64_t x;
1949		int lim = (g.tx_rate)/300;
1950		if (g.burst > lim)
1951			g.burst = lim;
1952		if (g.burst < g.frags)
1953			g.burst = g.frags;
1954		x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate;
1955		g.tx_period.tv_nsec = x;
1956		g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000;
1957		g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000;
1958	}
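	/* e.g. -R 1000000 with the default burst of 512 gives a tx_period of 512000 ns between bursts */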
1959	if (g.td_body == sender_body)
1960	    D("Sending %d packets every  %ld.%09ld s",
1961			g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec);
1962	/* Wait for PHY reset. */
1963	D("Wait %d secs for phy reset", wait_link);
1964	sleep(wait_link);
1965	D("Ready...");
1966
1967	/* Install ^C handler. */
1968	global_nthreads = g.nthreads;
1969	signal(SIGINT, sigint_h);
1970
1971	start_threads(&g);
1972	main_thread(&g);
1973	return 0;
1974}
1975
1976/* end of file */
1977