/* netmap.h revision 257529 */
/*
 * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the
 *      distribution.
 *
 *   3. Neither the name of the authors nor the names of their contributors
 *      may be used to endorse or promote products derived from this
 *      software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * $FreeBSD: head/sys/net/netmap.h 257529 2013-11-01 21:21:14Z luigi $
 *
 * Definitions of constants and the structures used by the netmap
 * framework, for the part visible to both kernel and userspace.
 * Detailed info on netmap is available with "man netmap" or at
 *
 *	http://info.iet.unipi.it/~luigi/netmap/
 *
 * This API is also used to communicate with the VALE software switch.
 */

#ifndef _NET_NETMAP_H_
#define _NET_NETMAP_H_

/*
 * --- Netmap data structures ---
 *
 * The userspace data structures used by netmap are shown below.
 * They are allocated by the kernel and mmap()ed by userspace threads.
 * Pointers are implemented as memory offsets or indexes,
 * so that they can be easily dereferenced in kernel and userspace.

   KERNEL (opaque, obviously)

  ====================================================================
                                         |
   USERSPACE                             |      struct netmap_ring
                                         +---->+--------------+
                                             / | cur          |
   struct netmap_if (nifp, 1 per fd)        /  | avail        |
    +---------------+                      /   | buf_ofs      |
    | ni_tx_rings   |                     /    +==============+
    | ni_rx_rings   |                    /     | buf_idx, len | slot[0]
    |               |                   /      | flags, ptr   |
    |               |                  /       +--------------+
    +===============+                 /        | buf_idx, len | slot[1]
    | txring_ofs[0] | (rel.to nifp)--'         | flags, ptr   |
    | txring_ofs[1] |                          +--------------+
  (ni_tx_rings+1 entries)                     (num_slots entries)
    | txring_ofs[t] |                          | buf_idx, len | slot[n-1]
    +---------------+                          | flags, ptr   |
    | rxring_ofs[0] |                          +--------------+
    | rxring_ofs[1] |
  (ni_rx_rings+1 entries)
    | rxring_ofs[r] |
    +---------------+

 * For each "interface" (NIC, host stack, VALE switch port) attached to a
 * file descriptor, the mmap()ed region contains a (logically readonly)
 * struct netmap_if pointing to struct netmap_ring's.
 * There is one netmap_ring per physical NIC ring, plus one tx/rx ring
 * pair attached to the host stack (this pair is unused for VALE ports).
 *
 * All physical/host stack ports share the same memory region,
 * so that zero-copy can be implemented between them.
 * VALE switch ports instead have separate memory regions.
 *
 * The netmap_ring is the userspace-visible replica of the NIC ring.
 * Each slot has the index of a buffer (MTU-sized and residing in the
 * mmapped region), its length and some flags. An extra 64-bit pointer
 * is provided for user-supplied buffers in the tx path.
 *
 * In user space, the buffer address is computed as
 *	(char *)ring + buf_ofs + index*NETMAP_BUF_SIZE
 */

/*
 * struct netmap_slot is a buffer descriptor
 *
 * buf_idx	the index of the buffer associated with the slot.
 * len		the length of the payload
 * flags	control operation on the slot, as defined below
 *
 * NS_BUF_CHANGED	must be set whenever userspace wants
 *		to change buf_idx (it might be necessary to
 *		reprogram the NIC)
 *
 * NS_REPORT	must be set if we want the NIC to generate an interrupt
 *		when this slot is used. Leaving it at 0 improves
 *		performance.
 *
 * NS_FORWARD	if set on a receive ring, and the device is in
 *		transparent mode, buffers released with the flag set
 *		will be forwarded to the 'other' side (host stack
 *		or NIC, respectively) on the next select() or ioctl()
 *
 * NS_NO_LEARN	on a VALE switch, do not 'learn' the source port for
 *		this packet.
 *
 * NS_INDIRECT	(tx rings only) data is in a userspace buffer pointed
 *		to by the ptr field in the slot.
 *
 * NS_MOREFRAG	Part of a multi-segment frame. The last (or only)
 *		segment must not have this flag.
 *		Only supported on VALE ports.
 *
 * NS_PORT_MASK	the high 8 bits of the flags, if not zero, indicate the
 *		destination port for the VALE switch, overriding
 *		the lookup table.
 */

/*
 * Buffer descriptor for one slot of a netmap ring.
 * This definition is visible to both kernel and userspace.
 */
struct netmap_slot {
	uint32_t buf_idx;	/* index of the buffer bound to this slot */
	uint16_t len;		/* length of the payload */
	uint16_t flags;		/* NS_* control bits, defined below */
#define	NS_BUF_CHANGED	0x0001	/* buf_idx changed */
#define	NS_REPORT	0x0002	/* ask the hardware to report results
				 * e.g. by generating an interrupt
				 */
#define	NS_FORWARD	0x0004	/* pass packet to the other endpoint
				 * (host stack or device)
				 */
#define	NS_NO_LEARN	0x0008	/* VALE: do not learn the source port
				 * for this packet
				 */
#define	NS_INDIRECT	0x0010	/* tx rings only: data is in the
				 * userspace buffer pointed to by ptr
				 */
#define	NS_MOREFRAG	0x0020	/* part of a multi-segment frame; the
				 * last (or only) segment must not
				 * carry this flag
				 */
				/*
				 * The high 8 bits of flags, if not zero,
				 * select the destination port on a VALE
				 * switch, overriding the lookup table.
				 */
#define	NS_PORT_SHIFT	8
#define	NS_PORT_MASK	(0xff << NS_PORT_SHIFT)
				/*
				 * in rx rings, the high 8 bits
				 * are the number of fragments.
				 */
#define	NS_RFRAGS(_slot)	(((_slot)->flags >> 8) & 0xff)
	uint64_t	ptr;	/* pointer for indirect buffers */
};

/*
 * struct netmap_ring
 *
 * Netmap representation of a TX or RX ring (also known as "queue").
 * This is a queue implemented as a fixed-size circular array.
 * At the software level, two fields are important: avail and cur.
 *
 * In TX rings:
 *
 *	avail	tells how many slots are available for transmission.
 *		It is updated by the kernel in each netmap system call.
 *		It MUST BE decremented by the user when it
 *		adds a new packet to send.
 *
 *	cur	indicates the slot to use for the next packet
 *		to send (i.e. the "tail" of the queue).
 *		It MUST BE incremented by the user before
 *		netmap system calls to reflect the number of newly
 *		sent packets.
 *		It is checked by the kernel on netmap system calls
 *		(normally unmodified by the kernel unless invalid).
 *
 * In RX rings:
 *
 *	avail	is the number of packets available (possibly 0).
 *		It is updated by the kernel in each netmap system call.
 *		It MUST BE decremented by the user when it
 *		consumes a packet.
 *
 *	cur	indicates the first slot that contains a packet not
 *		yet processed (the "head" of the queue).
 *		It MUST BE incremented by the user when it consumes
 *		a packet.
 *
 *	reserved	indicates the number of buffers before 'cur'
 *		that the user has not released yet. Normally 0,
 *		it MUST BE incremented by the user when it
 *		does not return the buffer immediately, and decremented
 *		when the buffer is finally freed.
 *
 *
 * DATA OWNERSHIP/LOCKING:
 *	The netmap_ring, all slots, and buffers in the range
 *	[cur-reserved, cur+avail[ are owned by the user program,
 *	and the kernel only touches them in the same thread context
 *	during a system call.
 *	Other buffers are reserved for use by the NIC's DMA engines.
 *
 * FLAGS
 *	NR_TIMESTAMP	updates the 'ts' field on each syscall. This is
 *			a global timestamp for all packets.
 *	NR_RX_TSTMP	if set, the last 64 bytes in each buffer will
 *			contain a timestamp for the frame supplied by
 *			the hardware (if supported)
 *	NR_FORWARD	if set, the NS_FORWARD flag in each slot of the
 *			RX ring is checked, and if set the packet is
 *			passed to the other side (host stack or device,
 *			respectively). This permits bpf-like behaviour
 *			or transparency for selected packets.
 */
219struct netmap_ring {
220	/*
221	 * buf_ofs is meant to be used through macros.
222	 * It contains the offset of the buffer region from this
223	 * descriptor.
224	 */
225	const ssize_t	buf_ofs;
226	const uint32_t	num_slots;	/* number of slots in the ring. */
227	uint32_t	avail;		/* number of usable slots */
228	uint32_t        cur;		/* 'current' r/w position */
229	uint32_t	reserved;	/* not refilled before current */
230
231	const uint16_t	nr_buf_size;
232	uint16_t	flags;
233#define	NR_TIMESTAMP	0x0002		/* set timestamp on *sync() */
234#define	NR_FORWARD	0x0004		/* enable NS_FORWARD for ring */
235#define	NR_RX_TSTMP	0x0008		/* set rx timestamp in slots */
236
237	struct timeval	ts;		/* time of last *sync() */
238
239	/* the slots follow. This struct has variable size */
240	struct netmap_slot slot[0];	/* array of slots. */
241};


/*
 * Netmap representation of an interface and its queue(s).
 * This is initialized by the kernel when binding a file
 * descriptor to a port, and should be considered as readonly
 * by user programs. The kernel never uses it.
 *
 * There is one netmap_if for each file descriptor on which we want
 * to select/poll.
 * select/poll operates on one or all pairs depending on the value of
 * nmr_queueid passed on the ioctl.
 */
/*
 * Per-file-descriptor description of an interface: its name, a few
 * properties, and the offsets of its rings.
 */
struct netmap_if {
	char		ni_name[IFNAMSIZ]; /* name of the interface. */
	const uint32_t	ni_version;	/* API version, currently unused */
	const uint32_t	ni_flags;	/* properties */
#define	NI_PRIV_MEM	0x1		/* private memory region */

	const uint32_t	ni_rx_rings;	/* number of rx rings */
	const uint32_t	ni_tx_rings;	/* number of tx rings */
	/*
	 * The following array contains the offset of each netmap ring
	 * from this structure. The first ni_tx_rings+1 entries refer
	 * to the tx rings, the next ni_rx_rings+1 refer to the rx rings
	 * (the last entry in each block refers to the host stack rings).
	 * The area is filled up by the kernel on NIOCREGIF,
	 * and then only read by userspace code.
	 */
	const ssize_t	ring_ofs[0];
};

#ifndef NIOCREGIF
/*
 * ioctl names and related fields
 *
 * NIOCGINFO takes a struct ifreq, the interface name is the input,
 *	the outputs are number of queues and number of descriptors
 *	for each queue (useful to set number of threads etc.).
 *	The info returned is only advisory and may change before
 *	the interface is bound to a file descriptor.
 *
 * NIOCREGIF takes an interface name within a struct ifreq,
 *	and activates netmap mode on the interface (if possible).
 *
 *   nr_name	is the name of the interface
 *
 *   nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings
 *	indicate the configuration of the port on return.
 *
 *	On input, non-zero values for nr_tx_rings, nr_tx_slots and the
 *	rx counterparts may be used to reconfigure the port according
 *	to the requested values, but this is not guaranteed.
 *	The actual values are returned on completion of the ioctl().
 *
 *   nr_ringid
 *	indicates how rings should be bound to the file descriptors.
 *	The default (0) means all physical rings of a NIC are bound.
 *	NETMAP_HW_RING plus a ring number lets you bind just
 *	a single ring pair.
 *	NETMAP_SW_RING binds only the host tx/rx rings
 *	NETMAP_NO_TX_POLL prevents select()/poll() from pushing
 *	out packets on the tx ring unless POLLOUT is specified.
 *
 *	NETMAP_PRIV_MEM is a return value used to indicate that
 *	this ring is in a private memory region hence buffer
 *	swapping cannot be used
 *
 *   nr_cmd	is used to configure NICs attached to a VALE switch,
 *	or to dump the configuration of a VALE switch.
 *
 *	nr_cmd = NETMAP_BDG_ATTACH and nr_name = vale*:ifname
 *	attaches the NIC to the switch, with nr_ringid specifying
 *	which rings to use
 *
 *	nr_cmd = NETMAP_BDG_DETACH and nr_name = vale*:ifname
 *	disconnects a previously attached NIC
 *
 *	nr_cmd = NETMAP_BDG_LIST is used to list the configuration
 *	of VALE switches, with additional arguments.
 *
 * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
 *	whose identity is set in NIOCREGIF through nr_ringid
 *
 * NETMAP_API is the API version.
 */

/*
 * struct nmreq overlays a struct ifreq
 */
/*
 * Argument of the NIOCGINFO and NIOCREGIF ioctls.
 * The ioctl size is derived from this struct, so its size must match
 * what is actually passed; spare2 leaves room for other versions.
 */
struct nmreq {
	char		nr_name[IFNAMSIZ]; /* name of the interface */
	uint32_t	nr_version;	/* API version */
#define	NETMAP_API	5		/* current version */
	uint32_t	nr_offset;	/* nifp offset in the shared region */
	uint32_t	nr_memsize;	/* size of the shared region */
	uint32_t	nr_tx_slots;	/* slots in tx rings */
	uint32_t	nr_rx_slots;	/* slots in rx rings */
	uint16_t	nr_tx_rings;	/* number of tx rings */
	uint16_t	nr_rx_rings;	/* number of rx rings */
	uint16_t	nr_ringid;	/* ring(s) we care about */
#define NETMAP_PRIV_MEM	0x8000		/* rings use private memory */
#define NETMAP_HW_RING	0x4000		/* low bits indicate one hw ring */
#define NETMAP_SW_RING	0x2000		/* process the sw ring */
#define NETMAP_NO_TX_POLL	0x1000	/* no automatic txsync on poll */
#define NETMAP_RING_MASK 0xfff		/* the ring number */
	uint16_t	nr_cmd;		/* VALE switch command, see below */
#define NETMAP_BDG_ATTACH	1	/* attach the NIC */
#define NETMAP_BDG_DETACH	2	/* detach the NIC */
#define NETMAP_BDG_LOOKUP_REG	3	/* register lookup function */
#define NETMAP_BDG_LIST		4	/* get bridge's info */
	uint16_t	nr_arg1;
#define NETMAP_BDG_HOST		1	/* attach the host stack on ATTACH */
	uint16_t	nr_arg2;
	uint32_t	spare2[3];	/* spares, for compatibility */
};

/*
 * FreeBSD uses the size value embedded in the _IOWR to determine
 * how much to copy in/out. So we need it to match the actual
 * data structure we pass. We put some spares in the structure
 * to ease compatibility with other versions.
 */
/*
 * netmap ioctl request codes, all in ioctl group 'i'.
 * NIOCGINFO and NIOCREGIF carry a struct nmreq (_IOWR);
 * the other requests take no argument (_IO).
 */
#define NIOCGINFO	_IOWR('i', 145, struct nmreq) /* return IF info */
#define NIOCREGIF	_IOWR('i', 146, struct nmreq) /* interface register */
#define NIOCUNREGIF	_IO('i', 147) /* deprecated. Was interface unregister */
#define NIOCTXSYNC	_IO('i', 148) /* sync tx queues */
#define NIOCRXSYNC	_IO('i', 149) /* sync rx queues */
#endif /* !NIOCREGIF */

#endif /* _NET_NETMAP_H_ */