1/*-
2 * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26#include "opt_inet.h"
27#include "opt_inet6.h"
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD$");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/devctl.h>
35#include <sys/eventhandler.h>
36#include <sys/kernel.h>
37#include <sys/mbuf.h>
38#include <sys/module.h>
39#include <sys/socket.h>
40#include <sys/sysctl.h>
41
42#include <net/bpf.h>
43#include <net/ethernet.h>
44#include <net/infiniband.h>
45#include <net/if.h>
46#include <net/if_var.h>
47#include <net/if_dl.h>
48#include <net/if_media.h>
49#include <net/if_lagg.h>
50#include <net/if_llatbl.h>
51#include <net/if_types.h>
52#include <net/netisr.h>
53#include <net/route.h>
54#include <netinet/if_ether.h>
55#include <netinet/in.h>
56#include <netinet/ip6.h>
57#include <netinet6/in6_var.h>
58#include <netinet6/nd6.h>
59
60#include <security/mac/mac_framework.h>
61
/*
 * if_lagg(4) support.
 *
 * Hook invoked by infiniband_input() for frames received on an interface
 * whose type is IFT_INFINIBANDLAG.  It is not assigned anywhere in this
 * file; infiniband_input() asserts that it is non-NULL before calling it.
 * The hook returns the mbuf to continue processing, or NULL when the
 * frame has been consumed.
 */
struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *);
64
65#ifdef INET
/*
 * Map an IPv4 multicast group address to an IPoIB link-layer multicast
 * address, following the MGID layout of RFC 4391, section 4.
 *
 * addr      - IPv4 group address in network byte order.
 * broadcast - the interface broadcast link-layer address; the low nibble
 *             of byte 5 (scope) and bytes 8-9 are copied from it.
 * buf       - output buffer; bytes 0 through 19 are written.
 *
 * Only the low 28 bits of the group address are part of the mapping;
 * the fixed upper nibble of the class D address is masked off so the
 * remaining bits of the MGID stay zero, as the RFC requires.
 */
static inline void
infiniband_ipv4_multicast_map(uint32_t addr,
    const uint8_t *broadcast, uint8_t *buf)
{
	uint8_t scope;

	addr = ntohl(addr);
	scope = broadcast[5] & 0xF;

	buf[0] = 0;
	buf[1] = 0xff;
	buf[2] = 0xff;
	buf[3] = 0xff;
	buf[4] = 0xff;
	buf[5] = 0x10 | scope;	/* scope nibble taken from broadcast address */
	buf[6] = 0x40;		/* IPv4 signature, upper byte */
	buf[7] = 0x1b;		/* IPv4 signature, lower byte */
	buf[8] = broadcast[8];
	buf[9] = broadcast[9];
	buf[10] = 0;
	buf[11] = 0;
	buf[12] = 0;
	buf[13] = 0;
	buf[14] = 0;
	buf[15] = 0;
	/* Low 28 bits of the group address; top nibble must remain zero. */
	buf[16] = (addr >> 24) & 0x0f;
	buf[17] = (addr >> 16) & 0xff;
	buf[18] = (addr >> 8) & 0xff;
	buf[19] = addr & 0xff;
}
96#endif
97
98#ifdef INET6
/*
 * Build the link-layer multicast address for an IPv6 group address.
 *
 * The first ten output bytes are a fixed prefix combined with the scope
 * nibble (byte 5) and bytes 8-9 of the interface broadcast address; the
 * remaining ten bytes are copied from bytes 6..15 of the IPv6 address.
 * Bytes 0 through 19 of buf are written.
 */
static inline void
infiniband_ipv6_multicast_map(const struct in6_addr *addr,
    const uint8_t *broadcast, uint8_t *buf)
{
	const uint8_t scope_nibble = broadcast[5] & 0xF;
	uint8_t *out = buf;

	*out++ = 0;
	*out++ = 0xff;
	*out++ = 0xff;
	*out++ = 0xff;
	*out++ = 0xff;
	*out++ = 0x10 | scope_nibble;
	*out++ = 0x60;
	*out++ = 0x1b;
	*out++ = broadcast[8];
	*out++ = broadcast[9];

	/* out now points at buf[10]: append the low 80 bits of the address. */
	memcpy(out, &addr->s6_addr[6], 10);
}
119#endif
120
121/*
122 * This is for clients that have an infiniband_header in the mbuf.
123 */
124void
125infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb)
126{
127	struct infiniband_header *ibh;
128	struct ether_header eh;
129
130	if (mb->m_len < sizeof(*ibh))
131		return;
132
133	ibh = mtod(mb, struct infiniband_header *);
134	eh.ether_type = ibh->ib_protocol;
135	memset(eh.ether_shost, 0, ETHER_ADDR_LEN);
136	memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN);
137	mb->m_data += sizeof(*ibh);
138	mb->m_len -= sizeof(*ibh);
139	mb->m_pkthdr.len -= sizeof(*ibh);
140	bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
141	mb->m_data -= sizeof(*ibh);
142	mb->m_len += sizeof(*ibh);
143	mb->m_pkthdr.len += sizeof(*ibh);
144}
145
146static void
147update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
148{
149	int csum_flags = 0;
150
151	if (src->m_pkthdr.csum_flags & CSUM_IP)
152		csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID);
153	if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
154		csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
155	if (src->m_pkthdr.csum_flags & CSUM_SCTP)
156		csum_flags |= CSUM_SCTP_VALID;
157	dst->m_pkthdr.csum_flags |= csum_flags;
158	if (csum_flags & CSUM_DATA_VALID)
159		dst->m_pkthdr.csum_data = 0xffff;
160}
161
162/*
163 * Handle link-layer encapsulation requests.
164 */
165static int
166infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req)
167{
168	struct infiniband_header *ih;
169	struct arphdr *ah;
170	uint16_t etype;
171	const uint8_t *lladdr;
172
173	if (req->rtype != IFENCAP_LL)
174		return (EOPNOTSUPP);
175
176	if (req->bufsize < INFINIBAND_HDR_LEN)
177		return (ENOMEM);
178
179	ih = (struct infiniband_header *)req->buf;
180	lladdr = req->lladdr;
181	req->lladdr_off = 0;
182
183	switch (req->family) {
184	case AF_INET:
185		etype = htons(ETHERTYPE_IP);
186		break;
187	case AF_INET6:
188		etype = htons(ETHERTYPE_IPV6);
189		break;
190	case AF_ARP:
191		ah = (struct arphdr *)req->hdata;
192		ah->ar_hrd = htons(ARPHRD_INFINIBAND);
193
194		switch (ntohs(ah->ar_op)) {
195		case ARPOP_REVREQUEST:
196		case ARPOP_REVREPLY:
197			etype = htons(ETHERTYPE_REVARP);
198			break;
199		case ARPOP_REQUEST:
200		case ARPOP_REPLY:
201		default:
202			etype = htons(ETHERTYPE_ARP);
203			break;
204		}
205
206		if (req->flags & IFENCAP_FLAG_BROADCAST)
207			lladdr = ifp->if_broadcastaddr;
208		break;
209	default:
210		return (EAFNOSUPPORT);
211	}
212
213	ih->ib_protocol = etype;
214	ih->ib_reserved = 0;
215	memcpy(ih->ib_hwaddr, lladdr, INFINIBAND_ADDR_LEN);
216	req->bufsize = sizeof(struct infiniband_header);
217
218	return (0);
219}
220
221static int
222infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m,
223    const struct sockaddr *dst, struct route *ro, uint8_t *phdr,
224    uint32_t *pflags, struct llentry **plle)
225{
226	struct infiniband_header *ih;
227	uint32_t lleflags = 0;
228	int error = 0;
229
230	if (plle)
231		*plle = NULL;
232	ih = (struct infiniband_header *)phdr;
233
234	switch (dst->sa_family) {
235#ifdef INET
236	case AF_INET:
237		if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) {
238			error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle);
239		} else {
240			if (m->m_flags & M_BCAST) {
241				memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr,
242				    INFINIBAND_ADDR_LEN);
243			} else {
244				infiniband_ipv4_multicast_map(
245				    ((const struct sockaddr_in *)dst)->sin_addr.s_addr,
246				    ifp->if_broadcastaddr, ih->ib_hwaddr);
247			}
248			ih->ib_protocol = htons(ETHERTYPE_IP);
249			ih->ib_reserved = 0;
250		}
251		break;
252#endif
253#ifdef INET6
254	case AF_INET6:
255		if ((m->m_flags & M_MCAST) == 0) {
256			error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags, plle);
257		} else {
258			infiniband_ipv6_multicast_map(
259			    &((const struct sockaddr_in6 *)dst)->sin6_addr,
260			    ifp->if_broadcastaddr, ih->ib_hwaddr);
261			ih->ib_protocol = htons(ETHERTYPE_IPV6);
262			ih->ib_reserved = 0;
263		}
264		break;
265#endif
266	default:
267		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
268		if (m != NULL)
269			m_freem(m);
270		return (EAFNOSUPPORT);
271	}
272
273	if (error == EHOSTDOWN) {
274		if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
275			error = EHOSTUNREACH;
276	}
277
278	if (error != 0)
279		return (error);
280
281	*pflags = RT_MAY_LOOP;
282	if (lleflags & LLE_IFADDR)
283		*pflags |= RT_L2_ME;
284
285	return (0);
286}
287
288/*
289 * Infiniband output routine.
290 */
291static int
292infiniband_output(struct ifnet *ifp, struct mbuf *m,
293    const struct sockaddr *dst, struct route *ro)
294{
295	uint8_t linkhdr[INFINIBAND_HDR_LEN];
296	uint8_t *phdr;
297	struct llentry *lle = NULL;
298	struct infiniband_header *ih;
299	int error = 0;
300	int hlen;	/* link layer header length */
301	uint32_t pflags;
302	bool addref;
303
304	NET_EPOCH_ASSERT();
305
306	addref = false;
307	phdr = NULL;
308	pflags = 0;
309	if (ro != NULL) {
310		/* XXX BPF uses ro_prepend */
311		if (ro->ro_prepend != NULL) {
312			phdr = ro->ro_prepend;
313			hlen = ro->ro_plen;
314		} else if (!(m->m_flags & (M_BCAST | M_MCAST))) {
315			if ((ro->ro_flags & RT_LLE_CACHE) != 0) {
316				lle = ro->ro_lle;
317				if (lle != NULL &&
318				    (lle->la_flags & LLE_VALID) == 0) {
319					LLE_FREE(lle);
320					lle = NULL;	/* redundant */
321					ro->ro_lle = NULL;
322				}
323				if (lle == NULL) {
324					/* if we lookup, keep cache */
325					addref = 1;
326				} else
327					/*
328					 * Notify LLE code that
329					 * the entry was used
330					 * by datapath.
331					 */
332					llentry_mark_used(lle);
333			}
334			if (lle != NULL) {
335				phdr = lle->r_linkdata;
336				hlen = lle->r_hdrlen;
337				pflags = lle->r_flags;
338			}
339		}
340	}
341
342#ifdef MAC
343	error = mac_ifnet_check_transmit(ifp, m);
344	if (error)
345		goto bad;
346#endif
347
348	M_PROFILE(m);
349	if (ifp->if_flags & IFF_MONITOR) {
350		error = ENETDOWN;
351		goto bad;
352	}
353	if (!((ifp->if_flags & IFF_UP) &&
354	    (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
355		error = ENETDOWN;
356		goto bad;
357	}
358
359	if (phdr == NULL) {
360		/* No prepend data supplied. Try to calculate ourselves. */
361		phdr = linkhdr;
362		hlen = INFINIBAND_HDR_LEN;
363		error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags,
364		    addref ? &lle : NULL);
365		if (addref && lle != NULL)
366			ro->ro_lle = lle;
367		if (error != 0)
368			return (error == EWOULDBLOCK ? 0 : error);
369	}
370
371	if ((pflags & RT_L2_ME) != 0) {
372		update_mbuf_csumflags(m, m);
373		return (if_simloop(ifp, m, dst->sa_family, 0));
374	}
375
376	/*
377	 * Add local infiniband header. If no space in first mbuf,
378	 * allocate another.
379	 */
380	M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT);
381	if (m == NULL) {
382		error = ENOBUFS;
383		goto bad;
384	}
385	if ((pflags & RT_HAS_HEADER) == 0) {
386		ih = mtod(m, struct infiniband_header *);
387		memcpy(ih, phdr, hlen);
388	}
389
390	/*
391	 * Queue message on interface, update output statistics if
392	 * successful, and start output if interface not yet active.
393	 */
394	return (ifp->if_transmit(ifp, m));
395bad:
396	if (m != NULL)
397		m_freem(m);
398	return (error);
399}
400
401/*
402 * Process a received Infiniband packet.
403 */
404static void
405infiniband_input(struct ifnet *ifp, struct mbuf *m)
406{
407	struct infiniband_header *ibh;
408	struct epoch_tracker et;
409	int isr;
410
411	CURVNET_SET_QUIET(ifp->if_vnet);
412
413	if ((ifp->if_flags & IFF_UP) == 0) {
414		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
415		m_freem(m);
416		goto done;
417	}
418
419	ibh = mtod(m, struct infiniband_header *);
420
421	/*
422	 * Reset layer specific mbuf flags to avoid confusing upper
423	 * layers:
424	 */
425	m->m_flags &= ~M_VLANTAG;
426	m_clrprotoflags(m);
427
428	if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) {
429		if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr,
430		    ifp->if_addrlen) == 0)
431			m->m_flags |= M_BCAST;
432		else
433			m->m_flags |= M_MCAST;
434		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
435	}
436
437	/* Let BPF have it before we strip the header. */
438	INFINIBAND_BPF_MTAP(ifp, m);
439
440	/* Allow monitor mode to claim this frame, after stats are updated. */
441	if (ifp->if_flags & IFF_MONITOR) {
442		m_freem(m);
443		goto done;
444	}
445
446	/* Direct packet to correct FIB based on interface config. */
447	M_SETFIB(m, ifp->if_fib);
448
449	/* Handle input from a lagg<N> port */
450	if (ifp->if_type == IFT_INFINIBANDLAG) {
451		KASSERT(lagg_input_infiniband_p != NULL,
452		    ("%s: if_lagg not loaded!", __func__));
453		m = (*lagg_input_infiniband_p)(ifp, m);
454		if (__predict_false(m == NULL))
455			goto done;
456		ifp = m->m_pkthdr.rcvif;
457	}
458
459	/*
460	 * Dispatch frame to upper layer.
461	 */
462	switch (ibh->ib_protocol) {
463#ifdef INET
464	case htons(ETHERTYPE_IP):
465		isr = NETISR_IP;
466		break;
467
468	case htons(ETHERTYPE_ARP):
469		if (ifp->if_flags & IFF_NOARP) {
470			/* Discard packet if ARP is disabled on interface */
471			m_freem(m);
472			goto done;
473		}
474		isr = NETISR_ARP;
475		break;
476#endif
477#ifdef INET6
478	case htons(ETHERTYPE_IPV6):
479		isr = NETISR_IPV6;
480		break;
481#endif
482	default:
483		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
484		m_freem(m);
485		goto done;
486	}
487
488	/* Strip off the Infiniband header. */
489	m_adj(m, INFINIBAND_HDR_LEN);
490
491#ifdef MAC
492	/*
493	 * Tag the mbuf with an appropriate MAC label before any other
494	 * consumers can get to it.
495	 */
496	mac_ifnet_create_mbuf(ifp, m);
497#endif
498	/* Allow monitor mode to claim this frame, after stats are updated. */
499	NET_EPOCH_ENTER(et);
500	netisr_dispatch(isr, m);
501	NET_EPOCH_EXIT(et);
502done:
503	CURVNET_RESTORE();
504}
505
506static int
507infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
508    struct sockaddr *sa)
509{
510	struct sockaddr_dl *sdl;
511#ifdef INET
512	struct sockaddr_in *sin;
513#endif
514#ifdef INET6
515	struct sockaddr_in6 *sin6;
516#endif
517	uint8_t *e_addr;
518
519	switch (sa->sa_family) {
520	case AF_LINK:
521		/*
522		 * No mapping needed. Just check that it's a valid MC address.
523		 */
524		sdl = (struct sockaddr_dl *)sa;
525		e_addr = LLADDR(sdl);
526		if (!INFINIBAND_IS_MULTICAST(e_addr))
527			return (EADDRNOTAVAIL);
528		*llsa = NULL;
529		return 0;
530
531#ifdef INET
532	case AF_INET:
533		sin = (struct sockaddr_in *)sa;
534		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
535			return (EADDRNOTAVAIL);
536		sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
537		sdl->sdl_alen = INFINIBAND_ADDR_LEN;
538		e_addr = LLADDR(sdl);
539		infiniband_ipv4_multicast_map(
540		    sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr);
541		*llsa = (struct sockaddr *)sdl;
542		return (0);
543#endif
544#ifdef INET6
545	case AF_INET6:
546		sin6 = (struct sockaddr_in6 *)sa;
547		/*
548		 * An IP6 address of 0 means listen to all of the
549		 * multicast address used for IP6. This has no meaning
550		 * in infiniband.
551		 */
552		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
553			return (EADDRNOTAVAIL);
554		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
555			return (EADDRNOTAVAIL);
556		sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
557		sdl->sdl_alen = INFINIBAND_ADDR_LEN;
558		e_addr = LLADDR(sdl);
559		infiniband_ipv6_multicast_map(
560		    &sin6->sin6_addr, ifp->if_broadcastaddr, e_addr);
561		*llsa = (struct sockaddr *)sdl;
562		return (0);
563#endif
564	default:
565		return (EAFNOSUPPORT);
566	}
567}
568
569void
570infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb)
571{
572	struct sockaddr_dl *sdl;
573	struct ifaddr *ifa;
574	int i;
575
576	ifp->if_addrlen = INFINIBAND_ADDR_LEN;
577	ifp->if_hdrlen = INFINIBAND_HDR_LEN;
578	ifp->if_mtu = INFINIBAND_MTU;
579	if_attach(ifp);
580	ifp->if_output = infiniband_output;
581	ifp->if_input = infiniband_input;
582	ifp->if_resolvemulti = infiniband_resolvemulti;
583	ifp->if_requestencap = infiniband_requestencap;
584
585	if (ifp->if_baudrate == 0)
586		ifp->if_baudrate = IF_Gbps(10); /* default value */
587	if (llb != NULL)
588		ifp->if_broadcastaddr = llb;
589
590	ifa = ifp->if_addr;
591	KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
592	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
593	sdl->sdl_type = IFT_INFINIBAND;
594	sdl->sdl_alen = ifp->if_addrlen;
595
596	if (lla != NULL) {
597		memcpy(LLADDR(sdl), lla, ifp->if_addrlen);
598
599		if (ifp->if_hw_addr != NULL)
600			memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen);
601	} else {
602		lla = LLADDR(sdl);
603	}
604
605	/* Attach ethernet compatible network device */
606	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
607
608	/* Announce Infiniband MAC address if non-zero. */
609	for (i = 0; i < ifp->if_addrlen; i++)
610		if (lla[i] != 0)
611			break;
612	if (i != ifp->if_addrlen)
613		if_printf(ifp, "Infiniband address: %20D\n", lla, ":");
614
615	/* Add necessary bits are setup; announce it now. */
616	EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp);
617
618	if (IS_DEFAULT_VNET(curvnet))
619		devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL);
620}
621
/*
 * Perform common duties while detaching an Infiniband interface.
 *
 * Undoes the attachments made by infiniband_ifattach(): the BPF
 * attachment is removed first, then the interface itself is detached.
 */
void
infiniband_ifdetach(struct ifnet *ifp)
{
	bpfdetach(ifp);
	if_detach(ifp);
}
631
632static int
633infiniband_modevent(module_t mod, int type, void *data)
634{
635	switch (type) {
636	case MOD_LOAD:
637	case MOD_UNLOAD:
638		return (0);
639	default:
640		return (EOPNOTSUPP);
641	}
642}
643
/*
 * Module glue: registers this file as the "if_infiniband" module with
 * infiniband_modevent() as its event handler, and declares module
 * version 1.
 */
static moduledata_t infiniband_mod = {
	.name = "if_infiniband",
	.evhand = &infiniband_modevent,
};

DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
MODULE_VERSION(if_infiniband, 1);
651