if_ntb.c revision 304358
1/*-
2 * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
3 * Copyright (C) 2013 Intel Corporation
4 * Copyright (C) 2015 EMC Corporation
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
/*
 * The Non-Transparent Bridge (NTB) is a device that allows you to connect
 * two or more systems using PCI-e links, providing remote memory access.
 *
 * This module contains a driver for a simulated Ethernet device, using the
 * underlying NTB Transport device.
 *
 * NOTE: Much of the code in this module is shared with Linux. Any patches may
 * be picked up and redistributed in Linux with a dual GPL/BSD license.
 */
39
40#include <sys/cdefs.h>
41__FBSDID("$FreeBSD: stable/11/sys/dev/ntb/if_ntb/if_ntb.c 304358 2016-08-18 09:26:21Z mav $");
42
43#include <sys/param.h>
44#include <sys/kernel.h>
45#include <sys/systm.h>
46#include <sys/buf_ring.h>
47#include <sys/bus.h>
48#include <sys/limits.h>
49#include <sys/module.h>
50#include <sys/socket.h>
51#include <sys/sockio.h>
52#include <sys/sysctl.h>
53#include <sys/taskqueue.h>
54
55#include <net/if.h>
56#include <net/if_media.h>
57#include <net/if_types.h>
58#include <net/if_media.h>
59#include <net/if_var.h>
60#include <net/bpf.h>
61#include <net/ethernet.h>
62
63#include <machine/bus.h>
64
65#include "../ntb_transport.h"
66
/* KTR trace class used by the CTR*() trace points in this driver. */
#define	KTR_NTB			KTR_SPARE3

/* The one media word this interface registers and reports as active. */
#define	NTB_MEDIATYPE		(IFM_ETHER | IFM_AUTO | IFM_FDX)

/* Hardware-assist bits switched together with IFCAP_TXCSUM. */
#define	NTB_CSUM_FEATURES	(CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP)

/* Hardware-assist bits switched together with IFCAP_TXCSUM_IPV6. */
#define	NTB_CSUM_FEATURES6	(CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6)

/*
 * Checksum flags stamped on received IPv4/IPv6 frames when the matching
 * receive checksum capability is enabled.
 */
#define	NTB_CSUM_SET		(CSUM_DATA_VALID | CSUM_DATA_VALID_IPV6 | \
				    CSUM_PSEUDO_HDR | \
				    CSUM_IP_CHECKED | CSUM_IP_VALID | \
				    CSUM_SCTP_VALID)
76
77static SYSCTL_NODE(_hw, OID_AUTO, if_ntb, CTLFLAG_RW, 0, "if_ntb");
78
79static unsigned g_if_ntb_num_queues = 1;
80SYSCTL_UINT(_hw_if_ntb, OID_AUTO, num_queues, CTLFLAG_RWTUN,
81    &g_if_ntb_num_queues, 0, "Number of queues per interface");
82
83struct ntb_net_queue {
84	struct ntb_net_ctx	*sc;
85	if_t			 ifp;
86	struct ntb_transport_qp *qp;
87	struct buf_ring		*br;
88	struct task		 tx_task;
89	struct taskqueue	*tx_tq;
90	struct mtx		 tx_lock;
91	struct callout		 queue_full;
92};
93
94struct ntb_net_ctx {
95	if_t			 ifp;
96	struct ifmedia		 media;
97	u_char			 eaddr[ETHER_ADDR_LEN];
98	int			 num_queues;
99	struct ntb_net_queue	*queues;
100	int			 mtu;
101};
102
103static int ntb_net_probe(device_t dev);
104static int ntb_net_attach(device_t dev);
105static int ntb_net_detach(device_t dev);
106static void ntb_net_init(void *arg);
107static int ntb_ifmedia_upd(struct ifnet *);
108static void ntb_ifmedia_sts(struct ifnet *, struct ifmediareq *);
109static int ntb_ioctl(if_t ifp, u_long command, caddr_t data);
110static int ntb_transmit(if_t ifp, struct mbuf *m);
111static void ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data,
112    void *data, int len);
113static void ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data,
114    void *data, int len);
115static void ntb_net_event_handler(void *data, enum ntb_link_event status);
116static void ntb_handle_tx(void *arg, int pending);
117static void ntb_qp_full(void *arg);
118static void ntb_qflush(if_t ifp);
119static void create_random_local_eui48(u_char *eaddr);
120
121static int
122ntb_net_probe(device_t dev)
123{
124
125	device_set_desc(dev, "NTB Network Interface");
126	return (0);
127}
128
129static int
130ntb_net_attach(device_t dev)
131{
132	struct ntb_net_ctx *sc = device_get_softc(dev);
133	struct ntb_net_queue *q;
134	if_t ifp;
135	struct ntb_queue_handlers handlers = { ntb_net_rx_handler,
136	    ntb_net_tx_handler, ntb_net_event_handler };
137	int i;
138
139	ifp = sc->ifp = if_gethandle(IFT_ETHER);
140	if (ifp == NULL) {
141		printf("ntb: Cannot allocate ifnet structure\n");
142		return (ENOMEM);
143	}
144	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
145	if_setdev(ifp, dev);
146
147	sc->num_queues = g_if_ntb_num_queues;
148	sc->queues = malloc(sc->num_queues * sizeof(struct ntb_net_queue),
149	    M_DEVBUF, M_WAITOK | M_ZERO);
150	sc->mtu = INT_MAX;
151	for (i = 0; i < sc->num_queues; i++) {
152		q = &sc->queues[i];
153		q->sc = sc;
154		q->ifp = ifp;
155		q->qp = ntb_transport_create_queue(q,
156		    device_get_parent(dev), &handlers);
157		if (q->qp == NULL)
158			break;
159		sc->mtu = imin(sc->mtu, ntb_transport_max_size(q->qp));
160		mtx_init(&q->tx_lock, "ntb tx", NULL, MTX_DEF);
161		q->br = buf_ring_alloc(4096, M_DEVBUF, M_WAITOK, &q->tx_lock);
162		TASK_INIT(&q->tx_task, 0, ntb_handle_tx, q);
163		q->tx_tq = taskqueue_create_fast("ntb_txq", M_NOWAIT,
164		    taskqueue_thread_enqueue, &q->tx_tq);
165		taskqueue_start_threads(&q->tx_tq, 1, PI_NET, "%s txq%d",
166		    device_get_nameunit(dev), i);
167		callout_init(&q->queue_full, 1);
168	}
169	sc->num_queues = i;
170
171	if_setinitfn(ifp, ntb_net_init);
172	if_setsoftc(ifp, sc);
173	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
174	if_setioctlfn(ifp, ntb_ioctl);
175	if_settransmitfn(ifp, ntb_transmit);
176	if_setqflushfn(ifp, ntb_qflush);
177	create_random_local_eui48(sc->eaddr);
178	ether_ifattach(ifp, sc->eaddr);
179	if_setcapabilities(ifp, IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 |
180	    IFCAP_JUMBO_MTU | IFCAP_LINKSTATE);
181	if_setcapenable(ifp, IFCAP_JUMBO_MTU | IFCAP_LINKSTATE);
182	if_setmtu(ifp, sc->mtu - ETHER_HDR_LEN);
183
184	ifmedia_init(&sc->media, IFM_IMASK, ntb_ifmedia_upd,
185	    ntb_ifmedia_sts);
186	ifmedia_add(&sc->media, NTB_MEDIATYPE, 0, NULL);
187	ifmedia_set(&sc->media, NTB_MEDIATYPE);
188
189	for (i = 0; i < sc->num_queues; i++)
190		ntb_transport_link_up(sc->queues[i].qp);
191	return (0);
192}
193
194static int
195ntb_net_detach(device_t dev)
196{
197	struct ntb_net_ctx *sc = device_get_softc(dev);
198	struct ntb_net_queue *q;
199	int i;
200
201	for (i = 0; i < sc->num_queues; i++)
202		ntb_transport_link_down(sc->queues[i].qp);
203	ether_ifdetach(sc->ifp);
204	if_free(sc->ifp);
205	ifmedia_removeall(&sc->media);
206	for (i = 0; i < sc->num_queues; i++) {
207		q = &sc->queues[i];
208		ntb_transport_free_queue(q->qp);
209		buf_ring_free(q->br, M_DEVBUF);
210		callout_drain(&q->queue_full);
211		taskqueue_drain_all(q->tx_tq);
212		mtx_destroy(&q->tx_lock);
213	}
214	free(sc->queues, M_DEVBUF);
215	return (0);
216}
217
218/* Network device interface */
219
220static void
221ntb_net_init(void *arg)
222{
223	struct ntb_net_ctx *sc = arg;
224	if_t ifp = sc->ifp;
225
226	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
227	if_link_state_change(ifp, ntb_transport_link_query(sc->queues[0].qp) ?
228	    LINK_STATE_UP : LINK_STATE_DOWN);
229}
230
231static int
232ntb_ioctl(if_t ifp, u_long command, caddr_t data)
233{
234	struct ntb_net_ctx *sc = if_getsoftc(ifp);
235	struct ifreq *ifr = (struct ifreq *)data;
236	int error = 0;
237
238	switch (command) {
239	case SIOCSIFMTU:
240	    {
241		if (ifr->ifr_mtu > sc->mtu - ETHER_HDR_LEN) {
242			error = EINVAL;
243			break;
244		}
245
246		if_setmtu(ifp, ifr->ifr_mtu);
247		break;
248	    }
249
250	case SIOCSIFMEDIA:
251	case SIOCGIFMEDIA:
252		error = ifmedia_ioctl(ifp, ifr, &sc->media, command);
253		break;
254
255	case SIOCSIFCAP:
256		if (ifr->ifr_reqcap & IFCAP_RXCSUM)
257			if_setcapenablebit(ifp, IFCAP_RXCSUM, 0);
258		else
259			if_setcapenablebit(ifp, 0, IFCAP_RXCSUM);
260		if (ifr->ifr_reqcap & IFCAP_TXCSUM) {
261			if_setcapenablebit(ifp, IFCAP_TXCSUM, 0);
262			if_sethwassistbits(ifp, NTB_CSUM_FEATURES, 0);
263		} else {
264			if_setcapenablebit(ifp, 0, IFCAP_TXCSUM);
265			if_sethwassistbits(ifp, 0, NTB_CSUM_FEATURES);
266		}
267		if (ifr->ifr_reqcap & IFCAP_RXCSUM_IPV6)
268			if_setcapenablebit(ifp, IFCAP_RXCSUM_IPV6, 0);
269		else
270			if_setcapenablebit(ifp, 0, IFCAP_RXCSUM_IPV6);
271		if (ifr->ifr_reqcap & IFCAP_TXCSUM_IPV6) {
272			if_setcapenablebit(ifp, IFCAP_TXCSUM_IPV6, 0);
273			if_sethwassistbits(ifp, NTB_CSUM_FEATURES6, 0);
274		} else {
275			if_setcapenablebit(ifp, 0, IFCAP_TXCSUM_IPV6);
276			if_sethwassistbits(ifp, 0, NTB_CSUM_FEATURES6);
277		}
278		break;
279
280	default:
281		error = ether_ioctl(ifp, command, data);
282		break;
283	}
284
285	return (error);
286}
287
288static int
289ntb_ifmedia_upd(struct ifnet *ifp)
290{
291	struct ntb_net_ctx *sc = if_getsoftc(ifp);
292	struct ifmedia *ifm = &sc->media;
293
294	if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
295		return (EINVAL);
296
297	return (0);
298}
299
300static void
301ntb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
302{
303	struct ntb_net_ctx *sc = if_getsoftc(ifp);
304
305	ifmr->ifm_status = IFM_AVALID;
306	ifmr->ifm_active = NTB_MEDIATYPE;
307	if (ntb_transport_link_query(sc->queues[0].qp))
308		ifmr->ifm_status |= IFM_ACTIVE;
309}
310
311static void
312ntb_transmit_locked(struct ntb_net_queue *q)
313{
314	if_t ifp = q->ifp;
315	struct mbuf *m;
316	int rc, len;
317	short mflags;
318
319	CTR0(KTR_NTB, "TX: ntb_transmit_locked");
320	while ((m = drbr_peek(ifp, q->br)) != NULL) {
321		CTR1(KTR_NTB, "TX: start mbuf %p", m);
322		if_etherbpfmtap(ifp, m);
323		len = m->m_pkthdr.len;
324		mflags = m->m_flags;
325		rc = ntb_transport_tx_enqueue(q->qp, m, m, len);
326		if (rc != 0) {
327			CTR2(KTR_NTB, "TX: could not tx mbuf %p: %d", m, rc);
328			if (rc == EAGAIN) {
329				drbr_putback(ifp, q->br, m);
330				callout_reset_sbt(&q->queue_full,
331				    SBT_1MS / 4, SBT_1MS / 4,
332				    ntb_qp_full, q, 0);
333			} else {
334				m_freem(m);
335				drbr_advance(ifp, q->br);
336				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
337			}
338			break;
339		}
340		drbr_advance(ifp, q->br);
341		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
342		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
343		if (mflags & M_MCAST)
344			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
345	}
346}
347
348static int
349ntb_transmit(if_t ifp, struct mbuf *m)
350{
351	struct ntb_net_ctx *sc = if_getsoftc(ifp);
352	struct ntb_net_queue *q;
353	int error, i;
354
355	CTR0(KTR_NTB, "TX: ntb_transmit");
356	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
357		i = m->m_pkthdr.flowid % sc->num_queues;
358	else
359		i = curcpu % sc->num_queues;
360	q = &sc->queues[i];
361
362	error = drbr_enqueue(ifp, q->br, m);
363	if (error)
364		return (error);
365
366	if (mtx_trylock(&q->tx_lock)) {
367		ntb_transmit_locked(q);
368		mtx_unlock(&q->tx_lock);
369	} else
370		taskqueue_enqueue(q->tx_tq, &q->tx_task);
371	return (0);
372}
373
374static void
375ntb_handle_tx(void *arg, int pending)
376{
377	struct ntb_net_queue *q = arg;
378
379	mtx_lock(&q->tx_lock);
380	ntb_transmit_locked(q);
381	mtx_unlock(&q->tx_lock);
382}
383
384static void
385ntb_qp_full(void *arg)
386{
387	struct ntb_net_queue *q = arg;
388
389	CTR0(KTR_NTB, "TX: qp_full callout");
390	if (ntb_transport_tx_free_entry(q->qp) > 0)
391		taskqueue_enqueue(q->tx_tq, &q->tx_task);
392	else
393		callout_schedule_sbt(&q->queue_full,
394		    SBT_1MS / 4, SBT_1MS / 4, 0);
395}
396
397static void
398ntb_qflush(if_t ifp)
399{
400	struct ntb_net_ctx *sc = if_getsoftc(ifp);
401	struct ntb_net_queue *q;
402	struct mbuf *m;
403	int i;
404
405	for (i = 0; i < sc->num_queues; i++) {
406		q = &sc->queues[i];
407		mtx_lock(&q->tx_lock);
408		while ((m = buf_ring_dequeue_sc(q->br)) != NULL)
409			m_freem(m);
410		mtx_unlock(&q->tx_lock);
411	}
412	if_qflush(ifp);
413}
414
415/* Network Device Callbacks */
416static void
417ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data,
418    int len)
419{
420
421	m_freem(data);
422	CTR1(KTR_NTB, "TX: tx_handler freeing mbuf %p", data);
423}
424
425static void
426ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data,
427    int len)
428{
429	struct ntb_net_queue *q = qp_data;
430	struct ntb_net_ctx *sc = q->sc;
431	struct mbuf *m = data;
432	if_t ifp = q->ifp;
433	uint16_t proto;
434
435	CTR1(KTR_NTB, "RX: rx handler (%d)", len);
436	if (len < 0) {
437		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
438		return;
439	}
440
441	m->m_pkthdr.rcvif = ifp;
442	if (sc->num_queues > 1) {
443		m->m_pkthdr.flowid = q - sc->queues;
444		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
445	}
446	if (if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
447		m_copydata(m, 12, 2, (void *)&proto);
448		switch (ntohs(proto)) {
449		case ETHERTYPE_IP:
450			if (if_getcapenable(ifp) & IFCAP_RXCSUM) {
451				m->m_pkthdr.csum_data = 0xffff;
452				m->m_pkthdr.csum_flags = NTB_CSUM_SET;
453			}
454			break;
455		case ETHERTYPE_IPV6:
456			if (if_getcapenable(ifp) & IFCAP_RXCSUM_IPV6) {
457				m->m_pkthdr.csum_data = 0xffff;
458				m->m_pkthdr.csum_flags = NTB_CSUM_SET;
459			}
460			break;
461		}
462	}
463	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
464	if_input(ifp, m);
465}
466
467static void
468ntb_net_event_handler(void *data, enum ntb_link_event status)
469{
470	struct ntb_net_queue *q = data;
471	int new_state;
472
473	switch (status) {
474	case NTB_LINK_DOWN:
475		new_state = LINK_STATE_DOWN;
476		break;
477	case NTB_LINK_UP:
478		new_state = LINK_STATE_UP;
479		break;
480	default:
481		new_state = LINK_STATE_UNKNOWN;
482		break;
483	}
484	if_link_state_change(q->ifp, new_state);
485}
486
487/* Helper functions */
488/* TODO: This too should really be part of the kernel */
/*
 * Flag bits for the first octet of an EUI-48 address.  The expansions are
 * parenthesized so they evaluate correctly inside larger expressions.
 */
#define	EUI48_MULTICAST			(1 << 0)
#define	EUI48_LOCALLY_ADMINISTERED	(1 << 1)
491static void
492create_random_local_eui48(u_char *eaddr)
493{
494	static uint8_t counter = 0;
495	uint32_t seed = ticks;
496
497	eaddr[0] = EUI48_LOCALLY_ADMINISTERED;
498	memcpy(&eaddr[1], &seed, sizeof(uint32_t));
499	eaddr[5] = counter++;
500}
501
502static device_method_t ntb_net_methods[] = {
503	/* Device interface */
504	DEVMETHOD(device_probe,     ntb_net_probe),
505	DEVMETHOD(device_attach,    ntb_net_attach),
506	DEVMETHOD(device_detach,    ntb_net_detach),
507	DEVMETHOD_END
508};
509
510devclass_t ntb_net_devclass;
511static DEFINE_CLASS_0(ntb, ntb_net_driver, ntb_net_methods,
512    sizeof(struct ntb_net_ctx));
513DRIVER_MODULE(if_ntb, ntb_transport, ntb_net_driver, ntb_net_devclass,
514    NULL, NULL);
515MODULE_DEPEND(if_ntb, ntb_transport, 1, 1, 1);
516MODULE_VERSION(if_ntb, 1);
517