netmap_user.h revision 341477
1/*
2 * Copyright (C) 2011-2016 Universita` di Pisa
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 *   1. Redistributions of source code must retain the above copyright
10 *      notice, this list of conditions and the following disclaimer.
11 *   2. Redistributions in binary form must reproduce the above copyright
12 *      notice, this list of conditions and the following disclaimer in the
13 *      documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28/*
29 * $FreeBSD: stable/11/sys/net/netmap_user.h 341477 2018-12-04 17:40:56Z vmaffione $
30 *
31 * Functions and macros to manipulate netmap structures and packets
32 * in userspace. See netmap(4) for more information.
33 *
34 * The address of the struct netmap_if, say nifp, is computed from the
 * value returned from ioctl(.., NIOCREGIF, ...) and the mmap region:
 *	ioctl(fd, NIOCREGIF, &req);
 *	mem = mmap(0, ... );
 *	nifp = NETMAP_IF(mem, req.nr_offset);
39 *		(so simple, we could just do it manually)
40 *
41 * From there:
42 *	struct netmap_ring *NETMAP_TXRING(nifp, index)
43 *	struct netmap_ring *NETMAP_RXRING(nifp, index)
44 *		we can access ring->cur, ring->head, ring->tail, etc.
45 *
46 *	ring->slot[i] gives us the i-th slot (we can access
47 *		directly len, flags, buf_idx)
48 *
49 *	char *buf = NETMAP_BUF(ring, x) returns a pointer to
50 *		the buffer numbered x
51 *
52 * All ring indexes (head, cur, tail) should always move forward.
53 * To compute the next index in a circular ring you can use
54 *	i = nm_ring_next(ring, i);
55 *
 * To ease porting apps from pcap to netmap we supply a few functions
 * that can be called to open, close, read and write on netmap in a way
 * similar to libpcap. Note that the read/write functions depend on
59 * an ioctl()/select()/poll() being issued to refill rings or push
60 * packets out.
61 *
62 * In order to use these, include #define NETMAP_WITH_LIBS
63 * in the source file that invokes these functions.
64 */
65
66#ifndef _NET_NETMAP_USER_H_
67#define _NET_NETMAP_USER_H_
68
69#define NETMAP_DEVICE_NAME "/dev/netmap"
70
71#ifdef __CYGWIN__
72/*
73 * we can compile userspace apps with either cygwin or msvc,
74 * and we use _WIN32 to identify windows specific code
75 */
76#ifndef _WIN32
77#define _WIN32
78#endif	/* _WIN32 */
79
80#endif	/* __CYGWIN__ */
81
82#ifdef _WIN32
83#undef NETMAP_DEVICE_NAME
84#define NETMAP_DEVICE_NAME "/proc/sys/DosDevices/Global/netmap"
85#include <windows.h>
86#include <WinDef.h>
87#include <sys/cygwin.h>
88#endif /* _WIN32 */
89
90#include <stdint.h>
91#include <sys/socket.h>		/* apple needs sockaddr */
92#include <net/if.h>		/* IFNAMSIZ */
93#include <ctype.h>
94
/*
 * Branch prediction hints, defined only if the includer has not already
 * provided a 'likely' macro. The !! normalizes the condition to 0 or 1
 * before it is handed to __builtin_expect().
 */
#ifndef likely
#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)
#endif /* likely and unlikely */
99
100#include <net/netmap.h>
101
/*
 * Helper macro: add a byte offset to a base pointer and cast the
 * result to the requested pointer type.
 */
#define _NETMAP_OFFSET(type, ptr, offset) \
	((type)(void *)((char *)(ptr) + (offset)))

/* The struct netmap_if located _ofs bytes after _base. */
#define NETMAP_IF(_base, _ofs)	_NETMAP_OFFSET(struct netmap_if *, _base, _ofs)

/*
 * TX ring number 'index': its byte offset from nifp is read from
 * ring_ofs[index]. The index is parenthesized so that any expression
 * (e.g. a conditional) can be passed safely.
 */
#define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \
	nifp, (nifp)->ring_ofs[(index)] )

/*
 * RX ring number 'index': same as above, but the offset is read from
 * ring_ofs[] after skipping the first ni_tx_rings + 1 entries.
 */
#define NETMAP_RXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *,	\
	nifp, (nifp)->ring_ofs[(index) + (nifp)->ni_tx_rings + 1] )

/*
 * Address of buffer number 'index', computed from the ring address,
 * the ring's buf_ofs and the buffer size. The index is widened to
 * size_t before multiplying, so the product is not limited to the
 * width of the operands' own types.
 */
#define NETMAP_BUF(ring, index)				\
	((char *)(ring) + (ring)->buf_ofs + ((size_t)(index)*(ring)->nr_buf_size))

/* Inverse of NETMAP_BUF(): buffer number for the buffer address 'buf'. */
#define NETMAP_BUF_IDX(ring, buf)			\
	( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \
		(ring)->nr_buf_size )
120
121
122static inline uint32_t
123nm_ring_next(struct netmap_ring *r, uint32_t i)
124{
125	return ( unlikely(i + 1 == r->num_slots) ? 0 : i + 1);
126}
127
128
129/*
130 * Return 1 if we have pending transmissions in the tx ring.
131 * When everything is complete ring->head = ring->tail + 1 (modulo ring size)
132 */
133static inline int
134nm_tx_pending(struct netmap_ring *r)
135{
136	return nm_ring_next(r, r->tail) != r->head;
137}
138
139
140static inline uint32_t
141nm_ring_space(struct netmap_ring *ring)
142{
143        int ret = ring->tail - ring->cur;
144        if (ret < 0)
145                ret += ring->num_slots;
146        return ret;
147}
148
149
150#ifdef NETMAP_WITH_LIBS
151/*
152 * Support for simple I/O libraries.
153 * Include other system headers required for compiling this.
154 */
155
156#ifndef HAVE_NETMAP_WITH_LIBS
157#define HAVE_NETMAP_WITH_LIBS
158
159#include <stdio.h>
160#include <sys/time.h>
161#include <sys/mman.h>
162#include <string.h>	/* memset */
163#include <sys/ioctl.h>
164#include <sys/errno.h>	/* EINVAL */
165#include <fcntl.h>	/* O_RDWR */
166#include <unistd.h>	/* close() */
167#include <signal.h>
168#include <stdlib.h>
169
/*
 * Debug macros. The whole group is skipped when the includer has
 * already defined ND.
 */
#ifndef ND /* debug macros */
/* debug support */
/* ND(): same call shape as D() but expands to an empty statement. */
#define ND(_fmt, ...) do {} while(0)
/*
 * D(): print a message on stderr, prefixed by a timestamp (seconds
 * modulo 1000, then microseconds), the calling function name and the
 * line number; a newline is appended. _fmt must be a string literal
 * because it is concatenated with the prefix format.
 */
#define D(_fmt, ...)						\
	do {							\
		struct timeval _t0;				\
		gettimeofday(&_t0, NULL);			\
		fprintf(stderr, "%03d.%06d %s [%d] " _fmt "\n",	\
		    (int)(_t0.tv_sec % 1000), (int)_t0.tv_usec,	\
		    __FUNCTION__, __LINE__, ##__VA_ARGS__);	\
        } while (0)

/*
 * Rate limited version of "D", lps indicates how many per second.
 * Each expansion keeps its own static second/counter pair, so the
 * limit applies per call site; the counter restarts whenever the
 * wall-clock second changes.
 */
#define RD(lps, format, ...)                                    \
    do {                                                        \
        static int __t0, __cnt;                                 \
        struct timeval __xxts;                                  \
        gettimeofday(&__xxts, NULL);                            \
        if (__t0 != __xxts.tv_sec) {                            \
            __t0 = __xxts.tv_sec;                               \
            __cnt = 0;                                          \
        }                                                       \
        if (__cnt++ < lps) {                                    \
            D(format, ##__VA_ARGS__);                           \
        }                                                       \
    } while (0)
#endif
197
/*
 * Per-packet header handed to the nm_dispatch() callback and filled
 * in (ts, caplen, len only) by nm_nextpkt().
 */
struct nm_pkthdr {	/* first part is the same as pcap_pkthdr */
	struct timeval	ts;	/* copied from the ring's ts field */
	uint32_t	caplen;	/* set equal to len by this library */
	uint32_t	len;	/* length taken from the slot */

	uint64_t flags;	/* NM_MORE_PKTS etc */
/* set by nm_dispatch() on every callback except the last one of a call */
#define NM_MORE_PKTS	1
	struct nm_desc *d;	/* descriptor the packet was read from */
	struct netmap_slot *slot;	/* ring slot describing the packet */
	uint8_t *buf;	/* address of the packet buffer */
};
209
/*
 * Statistics block laid out like pcap_stat. None of the functions
 * visible in this header update these counters.
 */
struct nm_stat {	/* same as pcap_stat	*/
	u_int	ps_recv;
	u_int	ps_drop;
	u_int	ps_ifdrop;
	/* NOTE(review): the rest of this header tests _WIN32, not WIN32 */
#ifdef WIN32 /* XXX or _WIN32 ? */
	u_int	bs_capt;
#endif /* WIN32 */
};
218
219#define NM_ERRBUF_SIZE	512
220
/*
 * Descriptor returned by nm_open() and released by nm_close().
 * The const-qualified members are written once, by nm_mmap(), through
 * casts; everything else is ordinary mutable state.
 */
struct nm_desc {
	struct nm_desc *self; /* point to self if netmap. */
	int fd;		/* file descriptor of the open netmap device */
	void *mem;	/* start of the shared memory region */
	uint32_t memsize;	/* size of the region at mem */
	int done_mmap;	/* set if mem is the result of mmap */
	struct netmap_if * const nifp;	/* netmap_if inside mem */
	/* ring ranges bound by nm_open(), plus the ring to try next */
	uint16_t first_tx_ring, last_tx_ring, cur_tx_ring;
	uint16_t first_rx_ring, last_rx_ring, cur_rx_ring;
	struct nmreq req;	/* also contains the nr_name = ifname */
	struct nm_pkthdr hdr;	/* header passed to nm_dispatch() callbacks */

	/*
	 * The memory contains netmap_if, rings and then buffers.
	 * Given a pointer (e.g. to nm_inject) we can compare with
	 * mem/buf_start/buf_end to tell if it is a buffer or
	 * some other descriptor in our region.
	 * We also store a pointer to some ring as it helps in the
	 * translation from buffer indexes to addresses.
	 */
	struct netmap_ring * const some_ring;
	void * const buf_start;	/* address of buffer 0 of some_ring */
	void * const buf_end;	/* mem + memsize */
	/* parameters from pcap_open_live */
	int snaplen;
	int promisc;
	int to_ms;
	char *errbuf;

	/* save flags so we can restore them on close */
	uint32_t if_flags;
        uint32_t if_reqcap;
        uint32_t if_curcap;

	struct nm_stat st;
	char msg[NM_ERRBUF_SIZE];
};
258
/*
 * when the descriptor is open correctly, d->self == d
 * Eventually we should also use some magic number.
 *
 * P2NMD()          cast any pointer to struct nm_desc *
 * IS_NETMAP_DESC() true for a non-NULL pointer whose self field points
 *                  back to the pointer itself
 * NETMAP_FD()      the fd field of the descriptor (no validity check)
 */
#define P2NMD(p)		((struct nm_desc *)(p))
#define IS_NETMAP_DESC(d)	((d) && P2NMD(d)->self == P2NMD(d))
#define NETMAP_FD(d)		(P2NMD(d)->fd)
266
267
/*
 * Copy l bytes from _src to _dst.
 *
 * Lengths that are a multiple of 64 and below 1024 are copied as
 * 64-bit words, eight words (64 bytes) per round, which is often
 * faster than a generic copy. Any other length is handed to memcpy().
 * The caller must guarantee that both buffers have enough room.
 */
static inline void
nm_pkt_copy(const void *_src, void *_dst, int l)
{
	const uint64_t *from = (const uint64_t *)_src;
	uint64_t *to = (uint64_t *)_dst;
	int word;

	/* large or odd-sized requests take the generic path */
	if (unlikely(l >= 1024 || l % 64)) {
		memcpy(to, from, l);
		return;
	}
	while (likely(l > 0)) {
		for (word = 0; word < 8; word++)
			*to++ = *from++;
		l -= 64;
	}
}
295
296
297/*
298 * The callback, invoked on each received packet. Same as libpcap
299 */
300typedef void (*nm_cb_t)(u_char *, const struct nm_pkthdr *, const u_char *d);
301
302/*
303 *--- the pcap-like API ---
304 *
305 * nm_open() opens a file descriptor, binds to a port and maps memory.
306 *
307 * ifname	(netmap:foo or vale:foo) is the port name
 *		a suffix can indicate the following:
309 *		^		bind the host (sw) ring pair
310 *		*		bind host and NIC ring pairs
311 *		-NN		bind individual NIC ring pair
312 *		{NN		bind master side of pipe NN
313 *		}NN		bind slave side of pipe NN
314 *		a suffix starting with / and the following flags,
315 *		in any order:
316 *		x		exclusive access
317 *		z		zero copy monitor (both tx and rx)
318 *		t		monitor tx side (copy monitor)
319 *		r		monitor rx side (copy monitor)
320 *		R		bind only RX ring(s)
321 *		T		bind only TX ring(s)
322 *
323 * req		provides the initial values of nmreq before parsing ifname.
324 *		Remember that the ifname parsing will override the ring
325 *		number in nm_ringid, and part of nm_flags;
326 * flags	special functions, normally 0
327 *		indicates which fields of *arg are significant
328 * arg		special functions, normally NULL
329 *		if passed a netmap_desc with mem != NULL,
330 *		use that memory instead of mmap.
331 */
332
333static struct nm_desc *nm_open(const char *ifname, const struct nmreq *req,
334	uint64_t flags, const struct nm_desc *arg);
335
336/*
337 * nm_open can import some fields from the parent descriptor.
338 * These flags control which ones.
339 * Also in flags you can specify NETMAP_NO_TX_POLL and NETMAP_DO_RX_POLL,
340 * which set the initial value for these flags.
341 * Note that the 16 low bits of the flags are reserved for data
342 * that may go into the nmreq.
343 */
/* Flag bits for the 'flags' argument of nm_open(). */
enum {
	NM_OPEN_NO_MMAP =	0x040000, /* reuse mmap from parent */
	NM_OPEN_IFNAME =	0x080000, /* nr_name, nr_ringid, nr_flags */
	NM_OPEN_ARG1 =		0x100000, /* take req.nr_arg1 from parent */
	NM_OPEN_ARG2 =		0x200000, /* take req.nr_arg2 from parent */
	NM_OPEN_ARG3 =		0x400000, /* take req.nr_arg3 from parent */
	NM_OPEN_RING_CFG =	0x800000, /* tx|rx rings|slots */
};
352
353
354/*
355 * nm_close()	closes and restores the port to its previous state
356 */
357
358static int nm_close(struct nm_desc *);
359
360/*
361 * nm_mmap()    do mmap or inherit from parent if the nr_arg2
362 *              (memory block) matches.
363 */
364
365static int nm_mmap(struct nm_desc *, const struct nm_desc *);
366
367/*
368 * nm_inject() is the same as pcap_inject()
369 * nm_dispatch() is the same as pcap_dispatch()
370 * nm_nextpkt() is the same as pcap_next()
371 */
372
373static int nm_inject(struct nm_desc *, const void *, size_t);
374static int nm_dispatch(struct nm_desc *, int, nm_cb_t, u_char *);
375static u_char *nm_nextpkt(struct nm_desc *, struct nm_pkthdr *);
376
377#ifdef _WIN32
378
379intptr_t _get_osfhandle(int); /* defined in io.h in windows */
380
/*
 * In windows we do not have yet native poll support, so we keep track
 * of file descriptors associated to netmap ports to emulate poll on
 * them and fall back on regular poll on other file descriptors.
 *
 * Each node of this singly linked list pairs one file descriptor with
 * the HANDLE obtained for it when the record was inserted.
 */
struct win_netmap_fd_list {
	struct win_netmap_fd_list *next;	/* next record, NULL at the end */
	int win_netmap_fd;			/* descriptor used as lookup key */
	HANDLE win_netmap_handle;		/* matching windows HANDLE */
};
391
392/*
393 * list head containing all the netmap opened fd and their
394 * windows HANDLE counterparts
395 */
396static struct win_netmap_fd_list *win_netmap_fd_list_head;
397
398static void
399win_insert_fd_record(int fd)
400{
401	struct win_netmap_fd_list *curr;
402
403	for (curr = win_netmap_fd_list_head; curr; curr = curr->next) {
404		if (fd == curr->win_netmap_fd) {
405			return;
406		}
407	}
408	curr = calloc(1, sizeof(*curr));
409	curr->next = win_netmap_fd_list_head;
410	curr->win_netmap_fd = fd;
411	curr->win_netmap_handle = IntToPtr(_get_osfhandle(fd));
412	win_netmap_fd_list_head = curr;
413}
414
415void
416win_remove_fd_record(int fd)
417{
418	struct win_netmap_fd_list *curr = win_netmap_fd_list_head;
419	struct win_netmap_fd_list *prev = NULL;
420	for (; curr ; prev = curr, curr = curr->next) {
421		if (fd != curr->win_netmap_fd)
422			continue;
423		/* found the entry */
424		if (prev == NULL) { /* we are freeing the first entry */
425			win_netmap_fd_list_head = curr->next;
426		} else {
427			prev->next = curr->next;
428		}
429		free(curr);
430		break;
431	}
432}
433
434
435HANDLE
436win_get_netmap_handle(int fd)
437{
438	struct win_netmap_fd_list *curr;
439
440	for (curr = win_netmap_fd_list_head; curr; curr = curr->next) {
441		if (fd == curr->win_netmap_fd) {
442			return curr->win_netmap_handle;
443		}
444	}
445	return NULL;
446}
447
448/*
449 * we need to wrap ioctl and mmap, at least for the netmap file descriptors
450 */
451
/*
 * use this function only from netmap_user.h internal functions
 * same as ioctl, returns 0 on success and -1 on error
 *
 * h        HANDLE of the netmap device
 * ctlCode  one of NETMAP_POLL, NETMAP_MMAP, NIOCTXSYNC, NIOCRXSYNC or
 *          NIOCREGIF; NIOCCONFIG and any other code are rejected with -1
 * arg      buffer used for both input and output, except for
 *          NETMAP_MMAP which passes no input
 */
static int
win_nm_ioctl_internal(HANDLE h, int32_t ctlCode, void *arg)
{
	DWORD bReturn = 0, szIn, szOut;
	BOOL ioctlReturnStatus;
	void *inParam = arg, *outParam = arg;

	/* pick the input/output sizes that go with each request */
	switch (ctlCode) {
	case NETMAP_POLL:
		szIn = sizeof(POLL_REQUEST_DATA);
		szOut = sizeof(POLL_REQUEST_DATA);
		break;
	case NETMAP_MMAP:
		szIn = 0;
		szOut = sizeof(void*);
		inParam = NULL; /* nothing on input */
		break;
	case NIOCTXSYNC:
	case NIOCRXSYNC:
		/* no data is exchanged for the sync requests */
		szIn = 0;
		szOut = 0;
		break;
	case NIOCREGIF:
		szIn = sizeof(struct nmreq);
		szOut = sizeof(struct nmreq);
		break;
	case NIOCCONFIG:
		D("unsupported NIOCCONFIG!");
		return -1;

	default: /* a regular ioctl */
		D("invalid ioctl %x on netmap fd", ctlCode);
		return -1;
	}

	ioctlReturnStatus = DeviceIoControl(h,
				ctlCode, inParam, szIn,
				outParam, szOut,
				&bReturn, NULL);
	// XXX note windows returns 0 on error or async call, 1 on success
	// we could call GetLastError() to figure out what happened
	return ioctlReturnStatus ? 0 : -1;
}
499
500/*
501 * this function is what must be called from user-space programs
502 * same as ioctl, returns 0 on success and -1 on error
503 */
504static int
505win_nm_ioctl(int fd, int32_t ctlCode, void *arg)
506{
507	HANDLE h = win_get_netmap_handle(fd);
508
509	if (h == NULL) {
510		return ioctl(fd, ctlCode, arg);
511	} else {
512		return win_nm_ioctl_internal(h, ctlCode, arg);
513	}
514}
515
516#define ioctl win_nm_ioctl /* from now on, within this file ... */
517
518/*
519 * We cannot use the native mmap on windows
520 * The only parameter used is "fd", the other ones are just declared to
521 * make this signature comparable to the FreeBSD/Linux one
522 */
523static void *
524win32_mmap_emulated(void *addr, size_t length, int prot, int flags, int fd, int32_t offset)
525{
526	HANDLE h = win_get_netmap_handle(fd);
527
528	if (h == NULL) {
529		return mmap(addr, length, prot, flags, fd, offset);
530	} else {
531		MEMORY_ENTRY ret;
532
533		return win_nm_ioctl_internal(h, NETMAP_MMAP, &ret) ?
534			NULL : ret.pUsermodeVirtualAddress;
535	}
536}
537
538#define mmap win32_mmap_emulated
539
540#include <sys/poll.h> /* XXX needed to use the structure pollfd */
541
542static int
543win_nm_poll(struct pollfd *fds, int nfds, int timeout)
544{
545	HANDLE h;
546
547	if (nfds != 1 || fds == NULL || (h = win_get_netmap_handle(fds->fd)) == NULL) {;
548		return poll(fds, nfds, timeout);
549	} else {
550		POLL_REQUEST_DATA prd;
551
552		prd.timeout = timeout;
553		prd.events = fds->events;
554
555		win_nm_ioctl_internal(h, NETMAP_POLL, &prd);
556		if ((prd.revents == POLLERR) || (prd.revents == STATUS_TIMEOUT)) {
557			return -1;
558		}
559		return 1;
560	}
561}
562
563#define poll win_nm_poll
564
565static int
566win_nm_open(char* pathname, int flags)
567{
568
569	if (strcmp(pathname, NETMAP_DEVICE_NAME) == 0) {
570		int fd = open(NETMAP_DEVICE_NAME, O_RDWR);
571		if (fd < 0) {
572			return -1;
573		}
574
575		win_insert_fd_record(fd);
576		return fd;
577	} else {
578		return open(pathname, flags);
579	}
580}
581
582#define open win_nm_open
583
/*
 * close() replacement: closes fd and drops its record from the list
 * of netmap descriptors, if there is one.
 * Returns the result of the underlying close(), so that a failure is
 * reported to the caller; fd == -1 is treated as a no-op returning 0.
 */
static int
win_nm_close(int fd)
{
	int ret = 0;

	if (fd != -1) {
		ret = close(fd);
		if (win_get_netmap_handle(fd) != NULL) {
			win_remove_fd_record(fd);
		}
	}
	return ret;
}
595
596#define close win_nm_close
597
598#endif /* _WIN32 */
599
/*
 * Check that every character in the range [s, e) is alphanumeric or
 * an underscore.
 * Returns 1 if so (an empty range is accepted), 0 otherwise.
 */
static int
nm_is_identifier(const char *s, const char *e)
{
	for (; s != e; s++) {
		/*
		 * isalnum() is only defined for values representable as
		 * unsigned char (or EOF): convert first, so that bytes
		 * with the high bit set are not passed as negative ints.
		 */
		if (!isalnum((unsigned char)*s) && *s != '_') {
			return 0;
		}
	}

	return 1;
}
611
612#define MAXERRMSG 80
613static int
614nm_parse(const char *ifname, struct nm_desc *d, char *err)
615{
616	int is_vale;
617	const char *port = NULL;
618	const char *vpname = NULL;
619	u_int namelen;
620	uint32_t nr_ringid = 0, nr_flags;
621	char errmsg[MAXERRMSG] = "";
622	long num;
623	uint16_t nr_arg2 = 0;
624	enum { P_START, P_RNGSFXOK, P_GETNUM, P_FLAGS, P_FLAGSOK, P_MEMID } p_state;
625
626	errno = 0;
627
628	is_vale = (ifname[0] == 'v');
629	if (is_vale) {
630		port = index(ifname, ':');
631		if (port == NULL) {
632			snprintf(errmsg, MAXERRMSG,
633				 "missing ':' in vale name");
634			goto fail;
635		}
636
637		if (!nm_is_identifier(ifname + 4, port)) {
638			snprintf(errmsg, MAXERRMSG, "invalid bridge name");
639			goto fail;
640		}
641
642		vpname = ++port;
643	} else {
644		ifname += 7;
645		port = ifname;
646	}
647
648	/* scan for a separator */
649	for (; *port && !index("-*^{}/@", *port); port++)
650		;
651
652	if (is_vale && !nm_is_identifier(vpname, port)) {
653		snprintf(errmsg, MAXERRMSG, "invalid bridge port name");
654		goto fail;
655	}
656
657	namelen = port - ifname;
658	if (namelen >= sizeof(d->req.nr_name)) {
659		snprintf(errmsg, MAXERRMSG, "name too long");
660		goto fail;
661	}
662	memcpy(d->req.nr_name, ifname, namelen);
663	d->req.nr_name[namelen] = '\0';
664
665	p_state = P_START;
666	nr_flags = NR_REG_ALL_NIC; /* default for no suffix */
667	while (*port) {
668		switch (p_state) {
669		case P_START:
670			switch (*port) {
671			case '^': /* only SW ring */
672				nr_flags = NR_REG_SW;
673				p_state = P_RNGSFXOK;
674				break;
675			case '*': /* NIC and SW */
676				nr_flags = NR_REG_NIC_SW;
677				p_state = P_RNGSFXOK;
678				break;
679			case '-': /* one NIC ring pair */
680				nr_flags = NR_REG_ONE_NIC;
681				p_state = P_GETNUM;
682				break;
683			case '{': /* pipe (master endpoint) */
684				nr_flags = NR_REG_PIPE_MASTER;
685				p_state = P_GETNUM;
686				break;
687			case '}': /* pipe (slave endoint) */
688				nr_flags = NR_REG_PIPE_SLAVE;
689				p_state = P_GETNUM;
690				break;
691			case '/': /* start of flags */
692				p_state = P_FLAGS;
693				break;
694			case '@': /* start of memid */
695				p_state = P_MEMID;
696				break;
697			default:
698				snprintf(errmsg, MAXERRMSG, "unknown modifier: '%c'", *port);
699				goto fail;
700			}
701			port++;
702			break;
703		case P_RNGSFXOK:
704			switch (*port) {
705			case '/':
706				p_state = P_FLAGS;
707				break;
708			case '@':
709				p_state = P_MEMID;
710				break;
711			default:
712				snprintf(errmsg, MAXERRMSG, "unexpected character: '%c'", *port);
713				goto fail;
714			}
715			port++;
716			break;
717		case P_GETNUM:
718			num = strtol(port, (char **)&port, 10);
719			if (num < 0 || num >= NETMAP_RING_MASK) {
720				snprintf(errmsg, MAXERRMSG, "'%ld' out of range [0, %d)",
721						num, NETMAP_RING_MASK);
722				goto fail;
723			}
724			nr_ringid = num & NETMAP_RING_MASK;
725			p_state = P_RNGSFXOK;
726			break;
727		case P_FLAGS:
728		case P_FLAGSOK:
729			if (*port == '@') {
730				port++;
731				p_state = P_MEMID;
732				break;
733			}
734			switch (*port) {
735			case 'x':
736				nr_flags |= NR_EXCLUSIVE;
737				break;
738			case 'z':
739				nr_flags |= NR_ZCOPY_MON;
740				break;
741			case 't':
742				nr_flags |= NR_MONITOR_TX;
743				break;
744			case 'r':
745				nr_flags |= NR_MONITOR_RX;
746				break;
747			case 'R':
748				nr_flags |= NR_RX_RINGS_ONLY;
749				break;
750			case 'T':
751				nr_flags |= NR_TX_RINGS_ONLY;
752				break;
753			default:
754				snprintf(errmsg, MAXERRMSG, "unrecognized flag: '%c'", *port);
755				goto fail;
756			}
757			port++;
758			p_state = P_FLAGSOK;
759			break;
760		case P_MEMID:
761			if (nr_arg2 != 0) {
762				snprintf(errmsg, MAXERRMSG, "double setting of memid");
763				goto fail;
764			}
765			num = strtol(port, (char **)&port, 10);
766			if (num <= 0) {
767				snprintf(errmsg, MAXERRMSG, "invalid memid %ld, must be >0", num);
768				goto fail;
769			}
770			nr_arg2 = num;
771			p_state = P_RNGSFXOK;
772			break;
773		}
774	}
775	if (p_state != P_START && p_state != P_RNGSFXOK && p_state != P_FLAGSOK) {
776		snprintf(errmsg, MAXERRMSG, "unexpected end of port name");
777		goto fail;
778	}
779	ND("flags: %s %s %s %s",
780			(nr_flags & NR_EXCLUSIVE) ? "EXCLUSIVE" : "",
781			(nr_flags & NR_ZCOPY_MON) ? "ZCOPY_MON" : "",
782			(nr_flags & NR_MONITOR_TX) ? "MONITOR_TX" : "",
783			(nr_flags & NR_MONITOR_RX) ? "MONITOR_RX" : "");
784
785	d->req.nr_flags |= nr_flags;
786	d->req.nr_ringid |= nr_ringid;
787	d->req.nr_arg2 = nr_arg2;
788
789	d->self = d;
790
791	return 0;
792fail:
793	if (!errno)
794		errno = EINVAL;
795	if (err)
796		strncpy(err, errmsg, MAXERRMSG);
797	return -1;
798}
799
800/*
801 * Try to open, return descriptor if successful, NULL otherwise.
802 * An invalid netmap name will return errno = 0;
803 * You can pass a pointer to a pre-filled nm_desc to add special
804 * parameters. Flags is used as follows
805 * NM_OPEN_NO_MMAP	use the memory from arg, only XXX avoid mmap
806 *			if the nr_arg2 (memory block) matches.
807 * NM_OPEN_ARG1		use req.nr_arg1 from arg
808 * NM_OPEN_ARG2		use req.nr_arg2 from arg
809 * NM_OPEN_RING_CFG	user ring config from arg
810 */
811static struct nm_desc *
812nm_open(const char *ifname, const struct nmreq *req,
813	uint64_t new_flags, const struct nm_desc *arg)
814{
815	struct nm_desc *d = NULL;
816	const struct nm_desc *parent = arg;
817	char errmsg[MAXERRMSG] = "";
818	uint32_t nr_reg;
819
820	if (strncmp(ifname, "netmap:", 7) &&
821			strncmp(ifname, NM_BDG_NAME, strlen(NM_BDG_NAME))) {
822		errno = 0; /* name not recognised, not an error */
823		return NULL;
824	}
825
826	d = (struct nm_desc *)calloc(1, sizeof(*d));
827	if (d == NULL) {
828		snprintf(errmsg, MAXERRMSG, "nm_desc alloc failure");
829		errno = ENOMEM;
830		return NULL;
831	}
832	d->self = d;	/* set this early so nm_close() works */
833	d->fd = open(NETMAP_DEVICE_NAME, O_RDWR);
834	if (d->fd < 0) {
835		snprintf(errmsg, MAXERRMSG, "cannot open /dev/netmap: %s", strerror(errno));
836		goto fail;
837	}
838
839	if (req)
840		d->req = *req;
841
842	if (!(new_flags & NM_OPEN_IFNAME)) {
843		if (nm_parse(ifname, d, errmsg) < 0)
844			goto fail;
845	}
846
847	d->req.nr_version = NETMAP_API;
848	d->req.nr_ringid &= NETMAP_RING_MASK;
849
850	/* optionally import info from parent */
851	if (IS_NETMAP_DESC(parent) && new_flags) {
852		if (new_flags & NM_OPEN_ARG1)
853			D("overriding ARG1 %d", parent->req.nr_arg1);
854		d->req.nr_arg1 = new_flags & NM_OPEN_ARG1 ?
855			parent->req.nr_arg1 : 4;
856		if (new_flags & NM_OPEN_ARG2) {
857			D("overriding ARG2 %d", parent->req.nr_arg2);
858			d->req.nr_arg2 =  parent->req.nr_arg2;
859		}
860		if (new_flags & NM_OPEN_ARG3)
861			D("overriding ARG3 %d", parent->req.nr_arg3);
862		d->req.nr_arg3 = new_flags & NM_OPEN_ARG3 ?
863			parent->req.nr_arg3 : 0;
864		if (new_flags & NM_OPEN_RING_CFG) {
865			D("overriding RING_CFG");
866			d->req.nr_tx_slots = parent->req.nr_tx_slots;
867			d->req.nr_rx_slots = parent->req.nr_rx_slots;
868			d->req.nr_tx_rings = parent->req.nr_tx_rings;
869			d->req.nr_rx_rings = parent->req.nr_rx_rings;
870		}
871		if (new_flags & NM_OPEN_IFNAME) {
872			D("overriding ifname %s ringid 0x%x flags 0x%x",
873				parent->req.nr_name, parent->req.nr_ringid,
874				parent->req.nr_flags);
875			memcpy(d->req.nr_name, parent->req.nr_name,
876				sizeof(d->req.nr_name));
877			d->req.nr_ringid = parent->req.nr_ringid;
878			d->req.nr_flags = parent->req.nr_flags;
879		}
880	}
881	/* add the *XPOLL flags */
882	d->req.nr_ringid |= new_flags & (NETMAP_NO_TX_POLL | NETMAP_DO_RX_POLL);
883
884	if (ioctl(d->fd, NIOCREGIF, &d->req)) {
885		snprintf(errmsg, MAXERRMSG, "NIOCREGIF failed: %s", strerror(errno));
886		goto fail;
887	}
888
889	nr_reg = d->req.nr_flags & NR_REG_MASK;
890
891	if (nr_reg == NR_REG_SW) { /* host stack */
892		d->first_tx_ring = d->last_tx_ring = d->req.nr_tx_rings;
893		d->first_rx_ring = d->last_rx_ring = d->req.nr_rx_rings;
894	} else if (nr_reg ==  NR_REG_ALL_NIC) { /* only nic */
895		d->first_tx_ring = 0;
896		d->first_rx_ring = 0;
897		d->last_tx_ring = d->req.nr_tx_rings - 1;
898		d->last_rx_ring = d->req.nr_rx_rings - 1;
899	} else if (nr_reg ==  NR_REG_NIC_SW) {
900		d->first_tx_ring = 0;
901		d->first_rx_ring = 0;
902		d->last_tx_ring = d->req.nr_tx_rings;
903		d->last_rx_ring = d->req.nr_rx_rings;
904	} else if (nr_reg == NR_REG_ONE_NIC) {
905		/* XXX check validity */
906		d->first_tx_ring = d->last_tx_ring =
907		d->first_rx_ring = d->last_rx_ring = d->req.nr_ringid & NETMAP_RING_MASK;
908	} else { /* pipes */
909		d->first_tx_ring = d->last_tx_ring = 0;
910		d->first_rx_ring = d->last_rx_ring = 0;
911	}
912
913        /* if parent is defined, do nm_mmap() even if NM_OPEN_NO_MMAP is set */
914	if ((!(new_flags & NM_OPEN_NO_MMAP) || parent) && nm_mmap(d, parent)) {
915	        snprintf(errmsg, MAXERRMSG, "mmap failed: %s", strerror(errno));
916		goto fail;
917	}
918
919
920#ifdef DEBUG_NETMAP_USER
921    { /* debugging code */
922	int i;
923
924	D("%s tx %d .. %d %d rx %d .. %d %d", ifname,
925		d->first_tx_ring, d->last_tx_ring, d->req.nr_tx_rings,
926                d->first_rx_ring, d->last_rx_ring, d->req.nr_rx_rings);
927	for (i = 0; i <= d->req.nr_tx_rings; i++) {
928		struct netmap_ring *r = NETMAP_TXRING(d->nifp, i);
929		D("TX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail);
930	}
931	for (i = 0; i <= d->req.nr_rx_rings; i++) {
932		struct netmap_ring *r = NETMAP_RXRING(d->nifp, i);
933		D("RX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail);
934	}
935    }
936#endif /* debugging */
937
938	d->cur_tx_ring = d->first_tx_ring;
939	d->cur_rx_ring = d->first_rx_ring;
940	return d;
941
942fail:
943	nm_close(d);
944	if (errmsg[0])
945		D("%s %s", errmsg, ifname);
946	if (errno == 0)
947		errno = EINVAL;
948	return NULL;
949}
950
951
952static int
953nm_close(struct nm_desc *d)
954{
955	/*
956	 * ugly trick to avoid unused warnings
957	 */
958	static void *__xxzt[] __attribute__ ((unused))  =
959		{ (void *)nm_open, (void *)nm_inject,
960		  (void *)nm_dispatch, (void *)nm_nextpkt } ;
961
962	if (d == NULL || d->self != d)
963		return EINVAL;
964	if (d->done_mmap && d->mem)
965		munmap(d->mem, d->memsize);
966	if (d->fd != -1) {
967		close(d->fd);
968	}
969
970	bzero(d, sizeof(*d));
971	free(d);
972	return 0;
973}
974
975
976static int
977nm_mmap(struct nm_desc *d, const struct nm_desc *parent)
978{
979	//XXX TODO: check if mmap is already done
980
981	if (IS_NETMAP_DESC(parent) && parent->mem &&
982	    parent->req.nr_arg2 == d->req.nr_arg2) {
983		/* do not mmap, inherit from parent */
984		D("do not mmap, inherit from parent");
985		d->memsize = parent->memsize;
986		d->mem = parent->mem;
987	} else {
988		/* XXX TODO: check if memsize is too large (or there is overflow) */
989		d->memsize = d->req.nr_memsize;
990		d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED,
991				d->fd, 0);
992		if (d->mem == MAP_FAILED) {
993			goto fail;
994		}
995		d->done_mmap = 1;
996	}
997	{
998		struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset);
999		struct netmap_ring *r = NETMAP_RXRING(nifp, d->first_rx_ring);
1000		if ((void *)r == (void *)nifp) {
1001			/* the descriptor is open for TX only */
1002			r = NETMAP_TXRING(nifp, d->first_tx_ring);
1003		}
1004
1005		*(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp;
1006		*(struct netmap_ring **)(uintptr_t)&d->some_ring = r;
1007		*(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0);
1008		*(void **)(uintptr_t)&d->buf_end =
1009			(char *)d->mem + d->memsize;
1010	}
1011
1012	return 0;
1013
1014fail:
1015	return EINVAL;
1016}
1017
/*
 * Same prototype as pcap_inject(), only need to cast.
 *
 * Copy the size bytes at buf into the first tx ring, starting from
 * d->cur_tx_ring and visiting each bound ring at most once, that has
 * enough free slots. A packet larger than one buffer is split over
 * consecutive slots, all but the last one marked with NS_MOREFRAG.
 * Returns size on success, 0 if no ring could take the packet.
 */
static int
nm_inject(struct nm_desc *d, const void *buf, size_t size)
{
	u_int c, n = d->last_tx_ring - d->first_tx_ring + 1,
		ri = d->cur_tx_ring;

	for (c = 0; c < n ; c++, ri++) {
		/* compute current ring to use */
		struct netmap_ring *ring;
		uint32_t i, j, idx;
		size_t rem;

		/* wrap around past the last bound ring */
		if (ri > d->last_tx_ring)
			ri = d->first_tx_ring;
		ring = NETMAP_TXRING(d->nifp, ri);
		rem = size;
		j = ring->cur;
		/*
		 * Dry run: walk j forward one slot per full buffer, to
		 * find the slot that would hold the last fragment.
		 */
		while (rem > ring->nr_buf_size && j != ring->tail) {
			rem -= ring->nr_buf_size;
			j = nm_ring_next(ring, j);
		}
		/* ran into tail with data left: try the next ring */
		if (j == ring->tail && rem > 0)
			continue;
		i = ring->cur;
		/* fill the slots before j with one full buffer each */
		while (i != j) {
			idx = ring->slot[i].buf_idx;
			ring->slot[i].len = ring->nr_buf_size;
			ring->slot[i].flags = NS_MOREFRAG;
			nm_pkt_copy(buf, NETMAP_BUF(ring, idx), ring->nr_buf_size);
			i = nm_ring_next(ring, i);
			buf = (char *)buf + ring->nr_buf_size;
		}
		/* last (or only) fragment: the remaining rem bytes */
		idx = ring->slot[i].buf_idx;
		ring->slot[i].len = rem;
		ring->slot[i].flags = 0;
		nm_pkt_copy(buf, NETMAP_BUF(ring, idx), rem);
		/* advance head and cur past the slots just filled */
		ring->head = ring->cur = nm_ring_next(ring, i);
		/* remember the ring, the next call starts from it */
		d->cur_tx_ring = ri;
		return size;
	}
	return 0; /* fail */
}
1063
1064
/*
 * Same prototype as pcap_dispatch(), only need to cast.
 *
 * Take up to cnt packets (no limit if cnt is 0 or -1) from the bound
 * rx rings, starting from d->cur_rx_ring and visiting each ring at
 * most once, and invoke cb(arg, &d->hdr, buf) on each of them.
 * The callback for a packet is delayed until the next packet has been
 * found, or until the scan is over: this way every callback but the
 * last one sees NM_MORE_PKTS set in hdr.flags.
 * Returns the number of packets taken from the rings.
 */
static int
nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg)
{
	int n = d->last_rx_ring - d->first_rx_ring + 1;
	int c, got = 0, ri = d->cur_rx_ring;
	d->hdr.buf = NULL;	/* no packet waiting for its callback */
	d->hdr.flags = NM_MORE_PKTS;
	d->hdr.d = d;

	if (cnt == 0)
		cnt = -1;
	/* cnt == -1 means infinite, but rings have a finite amount
	 * of buffers and the int is large enough that we never wrap,
	 * so we can omit checking for -1
	 */
	for (c=0; c < n && cnt != got; c++, ri++) {
		/* compute current ring to use */
		struct netmap_ring *ring;

		/* wrap around past the last bound ring */
		if (ri > d->last_rx_ring)
			ri = d->first_rx_ring;
		ring = NETMAP_RXRING(d->nifp, ri);
		for ( ; !nm_ring_empty(ring) && cnt != got; got++) {
			u_int idx, i;
			if (d->hdr.buf) { /* from previous round */
				cb(arg, &d->hdr, d->hdr.buf);
			}
			i = ring->cur;
			idx = ring->slot[i].buf_idx;
			/* d->cur_rx_ring doesn't change inside this loop, but
			 * set it here, so it reflects d->hdr.buf's ring */
			d->cur_rx_ring = ri;
			/* describe the packet, its callback runs later */
			d->hdr.slot = &ring->slot[i];
			d->hdr.buf = (u_char *)NETMAP_BUF(ring, idx);
			// __builtin_prefetch(buf);
			d->hdr.len = d->hdr.caplen = ring->slot[i].len;
			d->hdr.ts = ring->ts;
			/* head and cur move past the slot right away */
			ring->head = ring->cur = nm_ring_next(ring, i);
		}
	}
	/* last packet: its callback sees NM_MORE_PKTS cleared */
	if (d->hdr.buf) { /* from previous round */
		d->hdr.flags = 0;
		cb(arg, &d->hdr, d->hdr.buf);
	}
	return got;
}
1114
1115static u_char *
1116nm_nextpkt(struct nm_desc *d, struct nm_pkthdr *hdr)
1117{
1118	int ri = d->cur_rx_ring;
1119
1120	do {
1121		/* compute current ring to use */
1122		struct netmap_ring *ring = NETMAP_RXRING(d->nifp, ri);
1123		if (!nm_ring_empty(ring)) {
1124			u_int i = ring->cur;
1125			u_int idx = ring->slot[i].buf_idx;
1126			u_char *buf = (u_char *)NETMAP_BUF(ring, idx);
1127
1128			// __builtin_prefetch(buf);
1129			hdr->ts = ring->ts;
1130			hdr->len = hdr->caplen = ring->slot[i].len;
1131			ring->cur = nm_ring_next(ring, i);
1132			/* we could postpone advancing head if we want
1133			 * to hold the buffer. This can be supported in
1134			 * the future.
1135			 */
1136			ring->head = ring->cur;
1137			d->cur_rx_ring = ri;
1138			return buf;
1139		}
1140		ri++;
1141		if (ri > d->last_rx_ring)
1142			ri = d->first_rx_ring;
1143	} while (ri != d->cur_rx_ring);
1144	return NULL; /* nothing found */
1145}
1146
1147#endif /* !HAVE_NETMAP_WITH_LIBS */
1148
1149#endif /* NETMAP_WITH_LIBS */
1150
1151#endif /* _NET_NETMAP_USER_H_ */
1152