ipoib_main.c revision 331769
1/*
2 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
5 *
6 * This software is available to you under a choice of one of two
7 * licenses.  You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
11 *
12 *     Redistribution and use in source and binary forms, with or
13 *     without modification, are permitted provided that the following
14 *     conditions are met:
15 *
16 *      - Redistributions of source code must retain the above
17 *        copyright notice, this list of conditions and the following
18 *        disclaimer.
19 *
20 *      - Redistributions in binary form must reproduce the above
21 *        copyright notice, this list of conditions and the following
22 *        disclaimer in the documentation and/or other materials
23 *        provided with the distribution.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 * SOFTWARE.
33 */
34
35#include "ipoib.h"
36
/*
 * Forward declaration: installed as the interface's if_resolvemulti hook
 * by ipoib_intf_alloc().
 */
37static	int ipoib_resolvemulti(struct ifnet *, struct sockaddr **,
38		struct sockaddr *);
39
40
41#include <linux/module.h>
42
43#include <linux/slab.h>
44#include <linux/kernel.h>
45#include <linux/vmalloc.h>
46
47#include <linux/if_arp.h>	/* For ARPHRD_xxx */
48#include <linux/if_vlan.h>
49#include <net/ip.h>
50#include <net/ipv6.h>
51
/* Module identification strings. */
52MODULE_AUTHOR("Roland Dreier");
53MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
54MODULE_LICENSE("Dual BSD/GPL");
55
/*
 * Send/receive queue depths.  ipoib_dev_init() sizes the TX and RX rings
 * from these values, and ipoib_init_module() rounds and clamps them before
 * use.
 */
56int ipoib_sendq_size = IPOIB_TX_RING_SIZE;
57int ipoib_recvq_size = IPOIB_RX_RING_SIZE;
58
/* Both queue sizes are exposed as module parameters with mode 0444. */
59module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
60MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
61module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
62MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
63
/* Debug tracing level; only present when debugging support is configured. */
64#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
65int ipoib_debug_level = 1;
66
67module_param_named(debug_level, ipoib_debug_level, int, 0644);
68MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
69#endif
70
/*
 * Cursor used by the ipoib_path_iter_*() debug helpers to walk a device's
 * path tree.  "path" is a by-value copy of the entry most recently returned
 * by ipoib_path_iter_next().
 */
71struct ipoib_path_iter {
72	struct ipoib_dev_priv *priv;
73	struct ipoib_path  path;
74};
75
/*
 * Template link-layer broadcast address.  ipoib_priv_alloc() copies it into
 * priv->broadcastaddr; bytes 8 and 9 are subsequently overwritten with the
 * interface's P_Key by ipoib_add_port() and ipoib_config_vlan().
 */
76static const u8 ipv4_bcast_addr[] = {
77	0x00, 0xff, 0xff, 0xff,
78	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
79	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
80};
81
/* Driver-wide workqueue; created in ipoib_init_module(). */
82struct workqueue_struct *ipoib_workqueue;
83
/* SA client handle passed to ib_sa_path_rec_get() by path_rec_start(). */
84struct ib_sa_client ipoib_sa_client;
85
/* Forward declarations for the ib_client callbacks and ifnet hooks. */
86static void ipoib_add_one(struct ib_device *device);
87static void ipoib_remove_one(struct ib_device *device, void *client_data);
88static void ipoib_start(struct ifnet *dev);
89static int ipoib_output(struct ifnet *ifp, struct mbuf *m,
90	    const struct sockaddr *dst, struct route *ro);
91static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data);
92static void ipoib_input(struct ifnet *ifp, struct mbuf *m);
93
/*
 * Hand an outgoing mbuf (which starts with an ipoib_header) to BPF via
 * ipoib_mtap_mb(), but only when bpf_peers_present() reports a listener on
 * the interface.
 */
94#define	IPOIB_MTAP(_ifp, _m)					\
95do {								\
96	if (bpf_peers_present((_ifp)->if_bpf)) {		\
97		M_ASSERTVALID(_m);				\
98		ipoib_mtap_mb((_ifp), (_m));			\
99	}							\
100} while (0)
101
102static struct unrhdr *ipoib_unrhdr;
103
104static void
105ipoib_unrhdr_init(void *arg)
106{
107
108	ipoib_unrhdr = new_unrhdr(0, 65535, NULL);
109}
110SYSINIT(ipoib_unrhdr_init, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_init, NULL);
111
112static void
113ipoib_unrhdr_uninit(void *arg)
114{
115
116	if (ipoib_unrhdr != NULL) {
117		struct unrhdr *hdr;
118
119		hdr = ipoib_unrhdr;
120		ipoib_unrhdr = NULL;
121
122		delete_unrhdr(hdr);
123	}
124}
125SYSUNINIT(ipoib_unrhdr_uninit, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_uninit, NULL);
126
127/*
128 * This is for clients that have an ipoib_header in the mbuf.
129 */
130static void
131ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb)
132{
133	struct ipoib_header *ih;
134	struct ether_header eh;
135
136	ih = mtod(mb, struct ipoib_header *);
137	eh.ether_type = ih->proto;
138	bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN);
139	bzero(&eh.ether_shost, ETHER_ADDR_LEN);
140	mb->m_data += sizeof(struct ipoib_header);
141	mb->m_len -= sizeof(struct ipoib_header);
142	bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
143	mb->m_data -= sizeof(struct ipoib_header);
144	mb->m_len += sizeof(struct ipoib_header);
145}
146
147void
148ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto)
149{
150	struct ether_header eh;
151
152	eh.ether_type = proto;
153	bzero(&eh.ether_shost, ETHER_ADDR_LEN);
154	bzero(&eh.ether_dhost, ETHER_ADDR_LEN);
155	bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
156}
157
/*
 * IB client descriptor: ipoib_add_one()/ipoib_remove_one() are the
 * per-device add and remove callbacks.  Also used as the key for
 * ib_set_client_data() in ipoib_add_one().
 */
158static struct ib_client ipoib_client = {
159	.name   = "ipoib",
160	.add    = ipoib_add_one,
161	.remove = ipoib_remove_one
162};
163
164int
165ipoib_open(struct ipoib_dev_priv *priv)
166{
167	struct ifnet *dev = priv->dev;
168
169	ipoib_dbg(priv, "bringing up interface\n");
170
171	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
172
173	if (ipoib_pkey_dev_delay_open(priv))
174		return 0;
175
176	if (ipoib_ib_dev_open(priv))
177		goto err_disable;
178
179	if (ipoib_ib_dev_up(priv))
180		goto err_stop;
181
182	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
183		struct ipoib_dev_priv *cpriv;
184
185		/* Bring up any child interfaces too */
186		mutex_lock(&priv->vlan_mutex);
187		list_for_each_entry(cpriv, &priv->child_intfs, list)
188			if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
189				ipoib_open(cpriv);
190		mutex_unlock(&priv->vlan_mutex);
191	}
192	dev->if_drv_flags |= IFF_DRV_RUNNING;
193	dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
194
195	return 0;
196
197err_stop:
198	ipoib_ib_dev_stop(priv, 1);
199
200err_disable:
201	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
202
203	return -EINVAL;
204}
205
206static void
207ipoib_init(void *arg)
208{
209	struct ifnet *dev;
210	struct ipoib_dev_priv *priv;
211
212	priv = arg;
213	dev = priv->dev;
214	if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
215		ipoib_open(priv);
216	queue_work(ipoib_workqueue, &priv->flush_light);
217}
218
219
220static int
221ipoib_stop(struct ipoib_dev_priv *priv)
222{
223	struct ifnet *dev = priv->dev;
224
225	ipoib_dbg(priv, "stopping interface\n");
226
227	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
228
229	dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
230
231	ipoib_ib_dev_down(priv, 0);
232	ipoib_ib_dev_stop(priv, 0);
233
234	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
235		struct ipoib_dev_priv *cpriv;
236
237		/* Bring down any child interfaces too */
238		mutex_lock(&priv->vlan_mutex);
239		list_for_each_entry(cpriv, &priv->child_intfs, list)
240			if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0)
241				ipoib_stop(cpriv);
242		mutex_unlock(&priv->vlan_mutex);
243	}
244
245	return 0;
246}
247
248int
249ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu)
250{
251	struct ifnet *dev = priv->dev;
252
253	/* dev->if_mtu > 2K ==> connected mode */
254	if (ipoib_cm_admin_enabled(priv)) {
255		if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)))
256			return -EINVAL;
257
258		if (new_mtu > priv->mcast_mtu)
259			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
260				   priv->mcast_mtu);
261
262		dev->if_mtu = new_mtu;
263		return 0;
264	}
265
266	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
267		return -EINVAL;
268
269	priv->admin_mtu = new_mtu;
270
271	dev->if_mtu = min(priv->mcast_mtu, priv->admin_mtu);
272
273	queue_work(ipoib_workqueue, &priv->flush_light);
274
275	return 0;
276}
277
278static int
279ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
280{
281	struct ipoib_dev_priv *priv = ifp->if_softc;
282	struct ifaddr *ifa = (struct ifaddr *) data;
283	struct ifreq *ifr = (struct ifreq *) data;
284	int error = 0;
285
286	/* check if detaching */
287	if (priv == NULL || priv->gone != 0)
288		return (ENXIO);
289
290	switch (command) {
291	case SIOCSIFFLAGS:
292		if (ifp->if_flags & IFF_UP) {
293			if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
294				error = -ipoib_open(priv);
295		} else
296			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
297				ipoib_stop(priv);
298		break;
299	case SIOCADDMULTI:
300	case SIOCDELMULTI:
301		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
302			queue_work(ipoib_workqueue, &priv->restart_task);
303		break;
304	case SIOCSIFADDR:
305		ifp->if_flags |= IFF_UP;
306
307		switch (ifa->ifa_addr->sa_family) {
308#ifdef INET
309		case AF_INET:
310			ifp->if_init(ifp->if_softc);	/* before arpwhohas */
311			arp_ifinit(ifp, ifa);
312			break;
313#endif
314		default:
315			ifp->if_init(ifp->if_softc);
316			break;
317		}
318		break;
319
320	case SIOCGIFADDR:
321		{
322			struct sockaddr *sa;
323
324			sa = (struct sockaddr *) & ifr->ifr_data;
325			bcopy(IF_LLADDR(ifp),
326			      (caddr_t) sa->sa_data, INFINIBAND_ALEN);
327		}
328		break;
329
330	case SIOCSIFMTU:
331		/*
332		 * Set the interface MTU.
333		 */
334		error = -ipoib_change_mtu(priv, ifr->ifr_mtu);
335		break;
336	default:
337		error = EINVAL;
338		break;
339	}
340	return (error);
341}
342
343
344static struct ipoib_path *
345__path_find(struct ipoib_dev_priv *priv, void *gid)
346{
347	struct rb_node *n = priv->path_tree.rb_node;
348	struct ipoib_path *path;
349	int ret;
350
351	while (n) {
352		path = rb_entry(n, struct ipoib_path, rb_node);
353
354		ret = memcmp(gid, path->pathrec.dgid.raw,
355			     sizeof (union ib_gid));
356
357		if (ret < 0)
358			n = n->rb_left;
359		else if (ret > 0)
360			n = n->rb_right;
361		else
362			return path;
363	}
364
365	return NULL;
366}
367
368static int
369__path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path)
370{
371	struct rb_node **n = &priv->path_tree.rb_node;
372	struct rb_node *pn = NULL;
373	struct ipoib_path *tpath;
374	int ret;
375
376	while (*n) {
377		pn = *n;
378		tpath = rb_entry(pn, struct ipoib_path, rb_node);
379
380		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
381			     sizeof (union ib_gid));
382		if (ret < 0)
383			n = &pn->rb_left;
384		else if (ret > 0)
385			n = &pn->rb_right;
386		else
387			return -EEXIST;
388	}
389
390	rb_link_node(&path->rb_node, pn, n);
391	rb_insert_color(&path->rb_node, &priv->path_tree);
392
393	list_add_tail(&path->list, &priv->path_list);
394
395	return 0;
396}
397
398void
399ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path)
400{
401
402	_IF_DRAIN(&path->queue);
403
404	if (path->ah)
405		ipoib_put_ah(path->ah);
406	if (ipoib_cm_get(path))
407		ipoib_cm_destroy_tx(ipoib_cm_get(path));
408
409	kfree(path);
410}
411
412#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
413
414struct ipoib_path_iter *
415ipoib_path_iter_init(struct ipoib_dev_priv *priv)
416{
417	struct ipoib_path_iter *iter;
418
419	iter = kmalloc(sizeof *iter, GFP_KERNEL);
420	if (!iter)
421		return NULL;
422
423	iter->priv = priv;
424	memset(iter->path.pathrec.dgid.raw, 0, 16);
425
426	if (ipoib_path_iter_next(iter)) {
427		kfree(iter);
428		return NULL;
429	}
430
431	return iter;
432}
433
434int
435ipoib_path_iter_next(struct ipoib_path_iter *iter)
436{
437	struct ipoib_dev_priv *priv = iter->priv;
438	struct rb_node *n;
439	struct ipoib_path *path;
440	int ret = 1;
441
442	spin_lock_irq(&priv->lock);
443
444	n = rb_first(&priv->path_tree);
445
446	while (n) {
447		path = rb_entry(n, struct ipoib_path, rb_node);
448
449		if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
450			   sizeof (union ib_gid)) < 0) {
451			iter->path = *path;
452			ret = 0;
453			break;
454		}
455
456		n = rb_next(n);
457	}
458
459	spin_unlock_irq(&priv->lock);
460
461	return ret;
462}
463
464void
465ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path)
466{
467	*path = iter->path;
468}
469
470#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
471
472void
473ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv)
474{
475	struct ipoib_path *path, *tp;
476
477	spin_lock_irq(&priv->lock);
478
479	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
480		ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n",
481			be16_to_cpu(path->pathrec.dlid),
482			path->pathrec.dgid.raw, ":");
483		path->valid =  0;
484	}
485
486	spin_unlock_irq(&priv->lock);
487}
488
489void
490ipoib_flush_paths(struct ipoib_dev_priv *priv)
491{
492	struct ipoib_path *path, *tp;
493	LIST_HEAD(remove_list);
494	unsigned long flags;
495
496	spin_lock_irqsave(&priv->lock, flags);
497
498	list_splice_init(&priv->path_list, &remove_list);
499
500	list_for_each_entry(path, &remove_list, list)
501		rb_erase(&path->rb_node, &priv->path_tree);
502
503	list_for_each_entry_safe(path, tp, &remove_list, list) {
504		if (path->query)
505			ib_sa_cancel_query(path->query_id, path->query);
506		spin_unlock_irqrestore(&priv->lock, flags);
507		wait_for_completion(&path->done);
508		ipoib_path_free(priv, path);
509		spin_lock_irqsave(&priv->lock, flags);
510	}
511
512	spin_unlock_irqrestore(&priv->lock, flags);
513}
514
515static void
516path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr)
517{
518	struct ipoib_path *path = path_ptr;
519	struct ipoib_dev_priv *priv = path->priv;
520	struct ifnet *dev = priv->dev;
521	struct ipoib_ah *ah = NULL;
522	struct ipoib_ah *old_ah = NULL;
523	struct ifqueue mbqueue;
524	struct mbuf *mb;
525	unsigned long flags;
526
527	if (!status)
528		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n",
529			  be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":");
530	else
531		ipoib_dbg(priv, "PathRec status %d for GID %16D\n",
532			  status, path->pathrec.dgid.raw, ":");
533
534	bzero(&mbqueue, sizeof(mbqueue));
535
536	if (!status) {
537		struct ib_ah_attr av;
538
539		if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
540			ah = ipoib_create_ah(priv, priv->pd, &av);
541	}
542
543	spin_lock_irqsave(&priv->lock, flags);
544
545	if (ah) {
546		path->pathrec = *pathrec;
547
548		old_ah   = path->ah;
549		path->ah = ah;
550
551		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
552			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
553
554		for (;;) {
555			_IF_DEQUEUE(&path->queue, mb);
556			if (mb == NULL)
557				break;
558			_IF_ENQUEUE(&mbqueue, mb);
559		}
560
561#ifdef CONFIG_INFINIBAND_IPOIB_CM
562		if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path))
563			ipoib_cm_set(path, ipoib_cm_create_tx(priv, path));
564#endif
565
566		path->valid = 1;
567	}
568
569	path->query = NULL;
570	complete(&path->done);
571
572	spin_unlock_irqrestore(&priv->lock, flags);
573
574	if (old_ah)
575		ipoib_put_ah(old_ah);
576
577	for (;;) {
578		_IF_DEQUEUE(&mbqueue, mb);
579		if (mb == NULL)
580			break;
581		mb->m_pkthdr.rcvif = dev;
582		if (dev->if_transmit(dev, mb))
583			ipoib_warn(priv, "dev_queue_xmit failed "
584				   "to requeue packet\n");
585	}
586}
587
588static struct ipoib_path *
589path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr)
590{
591	struct ipoib_path *path;
592
593	if (!priv->broadcast)
594		return NULL;
595
596	path = kzalloc(sizeof *path, GFP_ATOMIC);
597	if (!path)
598		return NULL;
599
600	path->priv = priv;
601
602	bzero(&path->queue, sizeof(path->queue));
603
604#ifdef CONFIG_INFINIBAND_IPOIB_CM
605	memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN);
606#endif
607	memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid));
608	path->pathrec.sgid	    = priv->local_gid;
609	path->pathrec.pkey	    = cpu_to_be16(priv->pkey);
610	path->pathrec.numb_path     = 1;
611	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
612
613	return path;
614}
615
616static int
617path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path)
618{
619	struct ifnet *dev = priv->dev;
620
621	ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU;
622	struct ib_sa_path_rec p_rec;
623
624	p_rec = path->pathrec;
625	p_rec.mtu_selector = IB_SA_GT;
626
627	switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) {
628	case 512:
629		p_rec.mtu = IB_MTU_256;
630		break;
631	case 1024:
632		p_rec.mtu = IB_MTU_512;
633		break;
634	case 2048:
635		p_rec.mtu = IB_MTU_1024;
636		break;
637	case 4096:
638		p_rec.mtu = IB_MTU_2048;
639		break;
640	default:
641		/* Wildcard everything */
642		comp_mask = 0;
643		p_rec.mtu = 0;
644		p_rec.mtu_selector = 0;
645	}
646
647	ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n",
648		  p_rec.dgid.raw, ":",
649		  comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0);
650
651	init_completion(&path->done);
652
653	path->query_id =
654		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
655				   &p_rec, comp_mask		|
656				   IB_SA_PATH_REC_DGID		|
657				   IB_SA_PATH_REC_SGID		|
658				   IB_SA_PATH_REC_NUMB_PATH	|
659				   IB_SA_PATH_REC_TRAFFIC_CLASS |
660				   IB_SA_PATH_REC_PKEY,
661				   1000, GFP_ATOMIC,
662				   path_rec_completion,
663				   path, &path->query);
664	if (path->query_id < 0) {
665		ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
666		path->query = NULL;
667		complete(&path->done);
668		return path->query_id;
669	}
670
671	return 0;
672}
673
674static void
675ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh)
676{
677	struct ipoib_path *path;
678
679	path = __path_find(priv, eh->hwaddr + 4);
680	if (!path || !path->valid) {
681		int new_path = 0;
682
683		if (!path) {
684			path = path_rec_create(priv, eh->hwaddr);
685			new_path = 1;
686		}
687		if (path) {
688			if (_IF_QLEN(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE)
689				_IF_ENQUEUE(&path->queue, mb);
690			else {
691				if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);
692				m_freem(mb);
693			}
694
695			if (!path->query && path_rec_start(priv, path)) {
696				spin_unlock_irqrestore(&priv->lock, flags);
697				if (new_path)
698					ipoib_path_free(priv, path);
699				return;
700			} else
701				__path_add(priv, path);
702		} else {
703			if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);
704			m_freem(mb);
705		}
706
707		return;
708	}
709
710	if (ipoib_cm_get(path) && ipoib_cm_up(path)) {
711		ipoib_cm_send(priv, mb, ipoib_cm_get(path));
712	} else if (path->ah) {
713		ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr));
714	} else if ((path->query || !path_rec_start(priv, path)) &&
715		    path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) {
716		_IF_ENQUEUE(&path->queue, mb);
717	} else {
718		if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);
719		m_freem(mb);
720	}
721}
722
723static int
724ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb)
725{
726	struct ipoib_header *eh;
727
728	eh = mtod(mb, struct ipoib_header *);
729	if (IPOIB_IS_MULTICAST(eh->hwaddr)) {
730		/* Add in the P_Key for multicast*/
731		eh->hwaddr[8] = (priv->pkey >> 8) & 0xff;
732		eh->hwaddr[9] = priv->pkey & 0xff;
733
734		ipoib_mcast_send(priv, eh->hwaddr + 4, mb);
735	} else
736		ipoib_unicast_send(mb, priv, eh);
737
738	return 0;
739}
740
741
742static void
743_ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv)
744{
745	struct mbuf *mb;
746
747	if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
748	    IFF_DRV_RUNNING)
749		return;
750
751	spin_lock(&priv->lock);
752	while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) &&
753	    (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
754		IFQ_DRV_DEQUEUE(&dev->if_snd, mb);
755		if (mb == NULL)
756			break;
757		IPOIB_MTAP(dev, mb);
758		ipoib_send_one(priv, mb);
759	}
760	spin_unlock(&priv->lock);
761}
762
763static void
764ipoib_start(struct ifnet *dev)
765{
766	_ipoib_start(dev, dev->if_softc);
767}
768
769static void
770ipoib_vlan_start(struct ifnet *dev)
771{
772	struct ipoib_dev_priv *priv;
773	struct mbuf *mb;
774
775	priv = VLAN_COOKIE(dev);
776	if (priv != NULL)
777		return _ipoib_start(dev, priv);
778	while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) {
779		IFQ_DRV_DEQUEUE(&dev->if_snd, mb);
780		if (mb == NULL)
781			break;
782		m_freem(mb);
783		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
784	}
785}
786
787int
788ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)
789{
790
791	/* Allocate RX/TX "rings" to hold queued mbs */
792	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
793				GFP_KERNEL);
794	if (!priv->rx_ring) {
795		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
796		       ca->name, ipoib_recvq_size);
797		goto out;
798	}
799
800	priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL);
801	if (!priv->tx_ring) {
802		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
803		       ca->name, ipoib_sendq_size);
804		goto out_rx_ring_cleanup;
805	}
806	memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring);
807
808	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
809
810	if (ipoib_ib_dev_init(priv, ca, port))
811		goto out_tx_ring_cleanup;
812
813	return 0;
814
815out_tx_ring_cleanup:
816	kfree(priv->tx_ring);
817
818out_rx_ring_cleanup:
819	kfree(priv->rx_ring);
820
821out:
822	return -ENOMEM;
823}
824
825static void
826ipoib_detach(struct ipoib_dev_priv *priv)
827{
828	struct ifnet *dev;
829
830	dev = priv->dev;
831	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
832		priv->gone = 1;
833		bpfdetach(dev);
834		if_detach(dev);
835		if_free(dev);
836		free_unr(ipoib_unrhdr, priv->unit);
837	} else
838		VLAN_SETCOOKIE(priv->dev, NULL);
839
840	free(priv, M_TEMP);
841}
842
843void
844ipoib_dev_cleanup(struct ipoib_dev_priv *priv)
845{
846	struct ipoib_dev_priv *cpriv, *tcpriv;
847
848	/* Delete any child interfaces first */
849	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
850		ipoib_dev_cleanup(cpriv);
851		ipoib_detach(cpriv);
852	}
853
854	ipoib_ib_dev_cleanup(priv);
855
856	kfree(priv->rx_ring);
857	kfree(priv->tx_ring);
858
859	priv->rx_ring = NULL;
860	priv->tx_ring = NULL;
861}
862
863static struct ipoib_dev_priv *
864ipoib_priv_alloc(void)
865{
866	struct ipoib_dev_priv *priv;
867
868	priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK);
869	spin_lock_init(&priv->lock);
870	spin_lock_init(&priv->drain_lock);
871	mutex_init(&priv->vlan_mutex);
872	INIT_LIST_HEAD(&priv->path_list);
873	INIT_LIST_HEAD(&priv->child_intfs);
874	INIT_LIST_HEAD(&priv->dead_ahs);
875	INIT_LIST_HEAD(&priv->multicast_list);
876	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
877	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
878	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
879	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
880	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
881	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
882	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
883	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
884	memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN);
885
886	return (priv);
887}
888
889struct ipoib_dev_priv *
890ipoib_intf_alloc(const char *name)
891{
892	struct ipoib_dev_priv *priv;
893	struct sockaddr_dl *sdl;
894	struct ifnet *dev;
895
896	priv = ipoib_priv_alloc();
897	dev = priv->dev = if_alloc(IFT_INFINIBAND);
898	if (!dev) {
899		free(priv, M_TEMP);
900		return NULL;
901	}
902	dev->if_softc = priv;
903	priv->unit = alloc_unr(ipoib_unrhdr);
904	if (priv->unit == -1) {
905		if_free(dev);
906		free(priv, M_TEMP);
907		return NULL;
908	}
909	if_initname(dev, name, priv->unit);
910	dev->if_flags = IFF_BROADCAST | IFF_MULTICAST;
911	dev->if_addrlen = INFINIBAND_ALEN;
912	dev->if_hdrlen = IPOIB_HEADER_LEN;
913	if_attach(dev);
914	dev->if_init = ipoib_init;
915	dev->if_ioctl = ipoib_ioctl;
916	dev->if_start = ipoib_start;
917	dev->if_output = ipoib_output;
918	dev->if_input = ipoib_input;
919	dev->if_resolvemulti = ipoib_resolvemulti;
920	dev->if_baudrate = IF_Gbps(10);
921	dev->if_broadcastaddr = priv->broadcastaddr;
922	dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2;
923	sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr;
924	sdl->sdl_type = IFT_INFINIBAND;
925	sdl->sdl_alen = dev->if_addrlen;
926	priv->dev = dev;
927	if_link_state_change(dev, LINK_STATE_DOWN);
928	bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN);
929
930	return dev->if_softc;
931}
932
933int
934ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
935{
936	struct ib_device_attr *device_attr = &hca->attrs;
937
938	priv->hca_caps = device_attr->device_cap_flags;
939
940	priv->dev->if_hwassist = 0;
941	priv->dev->if_capabilities = 0;
942
943#ifndef CONFIG_INFINIBAND_IPOIB_CM
944	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
945		set_bit(IPOIB_FLAG_CSUM, &priv->flags);
946		priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP;
947		priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM;
948	}
949
950#if 0
951	if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) {
952		priv->dev->if_capabilities |= IFCAP_TSO4;
953		priv->dev->if_hwassist |= CSUM_TSO;
954	}
955#endif
956#endif
957	priv->dev->if_capabilities |=
958	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
959	priv->dev->if_capenable = priv->dev->if_capabilities;
960
961	return 0;
962}
963
964
965static struct ifnet *
966ipoib_add_port(const char *format, struct ib_device *hca, u8 port)
967{
968	struct ipoib_dev_priv *priv;
969	struct ib_port_attr attr;
970	int result = -ENOMEM;
971
972	priv = ipoib_intf_alloc(format);
973	if (!priv)
974		goto alloc_mem_failed;
975
976	if (!ib_query_port(hca, port, &attr))
977		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
978	else {
979		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
980		       hca->name, port);
981		goto device_init_failed;
982	}
983
984	/* MTU will be reset when mcast join happens */
985	priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
986	priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu;
987
988	result = ib_query_pkey(hca, port, 0, &priv->pkey);
989	if (result) {
990		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
991		       hca->name, port, result);
992		goto device_init_failed;
993	}
994
995	if (ipoib_set_dev_features(priv, hca))
996		goto device_init_failed;
997
998	/*
999	 * Set the full membership bit, so that we join the right
1000	 * broadcast group, etc.
1001	 */
1002	priv->pkey |= 0x8000;
1003
1004	priv->broadcastaddr[8] = priv->pkey >> 8;
1005	priv->broadcastaddr[9] = priv->pkey & 0xff;
1006
1007	result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);
1008	if (result) {
1009		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
1010		       hca->name, port, result);
1011		goto device_init_failed;
1012	}
1013	memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw, sizeof (union ib_gid));
1014
1015	result = ipoib_dev_init(priv, hca, port);
1016	if (result < 0) {
1017		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
1018		       hca->name, port, result);
1019		goto device_init_failed;
1020	}
1021	if (ipoib_cm_admin_enabled(priv))
1022		priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv));
1023
1024	INIT_IB_EVENT_HANDLER(&priv->event_handler,
1025			      priv->ca, ipoib_event);
1026	result = ib_register_event_handler(&priv->event_handler);
1027	if (result < 0) {
1028		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
1029		       "port %d (ret = %d)\n",
1030		       hca->name, port, result);
1031		goto event_failed;
1032	}
1033	if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port);
1034
1035	return priv->dev;
1036
1037event_failed:
1038	ipoib_dev_cleanup(priv);
1039
1040device_init_failed:
1041	ipoib_detach(priv);
1042
1043alloc_mem_failed:
1044	return ERR_PTR(result);
1045}
1046
1047static void
1048ipoib_add_one(struct ib_device *device)
1049{
1050	struct list_head *dev_list;
1051	struct ifnet *dev;
1052	struct ipoib_dev_priv *priv;
1053	int s, e, p;
1054
1055	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
1056		return;
1057
1058	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
1059	if (!dev_list)
1060		return;
1061
1062	INIT_LIST_HEAD(dev_list);
1063
1064	if (device->node_type == RDMA_NODE_IB_SWITCH) {
1065		s = 0;
1066		e = 0;
1067	} else {
1068		s = 1;
1069		e = device->phys_port_cnt;
1070	}
1071
1072	for (p = s; p <= e; ++p) {
1073		if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
1074			continue;
1075		dev = ipoib_add_port("ib", device, p);
1076		if (!IS_ERR(dev)) {
1077			priv = dev->if_softc;
1078			list_add_tail(&priv->list, dev_list);
1079		}
1080	}
1081
1082	ib_set_client_data(device, &ipoib_client, dev_list);
1083}
1084
1085static void
1086ipoib_remove_one(struct ib_device *device, void *client_data)
1087{
1088	struct ipoib_dev_priv *priv, *tmp;
1089	struct list_head *dev_list = client_data;
1090
1091	if (!dev_list)
1092		return;
1093
1094	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
1095		return;
1096
1097	list_for_each_entry_safe(priv, tmp, dev_list, list) {
1098		if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND)
1099			continue;
1100
1101		ipoib_stop(priv);
1102
1103		ib_unregister_event_handler(&priv->event_handler);
1104
1105		/* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */
1106
1107		flush_workqueue(ipoib_workqueue);
1108
1109		ipoib_dev_cleanup(priv);
1110		ipoib_detach(priv);
1111	}
1112
1113	kfree(dev_list);
1114}
1115
1116static void
1117ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
1118{
1119	struct ipoib_dev_priv *parent;
1120	struct ipoib_dev_priv *priv;
1121	struct ifnet *dev;
1122	uint16_t pkey;
1123	int error;
1124
1125	if (ifp->if_type != IFT_INFINIBAND)
1126		return;
1127	dev = VLAN_DEVAT(ifp, vtag);
1128	if (dev == NULL)
1129		return;
1130	priv = NULL;
1131	error = 0;
1132	parent = ifp->if_softc;
1133	/* We only support 15 bits of pkey. */
1134	if (vtag & 0x8000)
1135		return;
1136	pkey = vtag | 0x8000;	/* Set full membership bit. */
1137	if (pkey == parent->pkey)
1138		return;
1139	/* Check for dups */
1140	mutex_lock(&parent->vlan_mutex);
1141	list_for_each_entry(priv, &parent->child_intfs, list) {
1142		if (priv->pkey == pkey) {
1143			priv = NULL;
1144			error = EBUSY;
1145			goto out;
1146		}
1147	}
1148	priv = ipoib_priv_alloc();
1149	priv->dev = dev;
1150	priv->max_ib_mtu = parent->max_ib_mtu;
1151	priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu;
1152	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
1153	error = ipoib_set_dev_features(priv, parent->ca);
1154	if (error)
1155		goto out;
1156	priv->pkey = pkey;
1157	priv->broadcastaddr[8] = pkey >> 8;
1158	priv->broadcastaddr[9] = pkey & 0xff;
1159	dev->if_broadcastaddr = priv->broadcastaddr;
1160	error = ipoib_dev_init(priv, parent->ca, parent->port);
1161	if (error)
1162		goto out;
1163	priv->parent = parent->dev;
1164	list_add_tail(&priv->list, &parent->child_intfs);
1165	VLAN_SETCOOKIE(dev, priv);
1166	dev->if_start = ipoib_vlan_start;
1167	dev->if_drv_flags &= ~IFF_DRV_RUNNING;
1168	dev->if_hdrlen = IPOIB_HEADER_LEN;
1169	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1170		ipoib_open(priv);
1171	mutex_unlock(&parent->vlan_mutex);
1172	return;
1173out:
1174	mutex_unlock(&parent->vlan_mutex);
1175	if (priv)
1176		free(priv, M_TEMP);
1177	if (error)
1178		ipoib_warn(parent,
1179		    "failed to initialize subinterface: device %s, port %d vtag 0x%X",
1180		    parent->ca->name, parent->port, vtag);
1181	return;
1182}
1183
1184static void
1185ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
1186{
1187	struct ipoib_dev_priv *parent;
1188	struct ipoib_dev_priv *priv;
1189	struct ifnet *dev;
1190	uint16_t pkey;
1191
1192	if (ifp->if_type != IFT_INFINIBAND)
1193		return;
1194
1195	dev = VLAN_DEVAT(ifp, vtag);
1196	if (dev)
1197		VLAN_SETCOOKIE(dev, NULL);
1198	pkey = vtag | 0x8000;
1199	parent = ifp->if_softc;
1200	mutex_lock(&parent->vlan_mutex);
1201	list_for_each_entry(priv, &parent->child_intfs, list) {
1202		if (priv->pkey == pkey) {
1203			ipoib_dev_cleanup(priv);
1204			list_del(&priv->list);
1205			break;
1206		}
1207	}
1208	mutex_unlock(&parent->vlan_mutex);
1209}
1210
/*
 * Tags returned by EVENTHANDLER_REGISTER() for the vlan_config and
 * vlan_unconfig handlers.  They are set in ipoib_init_module() and passed
 * to EVENTHANDLER_DEREGISTER() in ipoib_cleanup_module().
 */
eventhandler_tag ipoib_vlan_attach;
eventhandler_tag ipoib_vlan_detach;
1213
1214static int __init
1215ipoib_init_module(void)
1216{
1217	int ret;
1218
1219	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
1220	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
1221	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
1222
1223	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
1224	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
1225	ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE,
1226						     IPOIB_MIN_QUEUE_SIZE));
1227#ifdef CONFIG_INFINIBAND_IPOIB_CM
1228	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
1229#endif
1230
1231	ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
1232		ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST);
1233	ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
1234		ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST);
1235
1236	/*
1237	 * We create our own workqueue mainly because we want to be
1238	 * able to flush it when devices are being removed.  We can't
1239	 * use schedule_work()/flush_scheduled_work() because both
1240	 * unregister_netdev() and linkwatch_event take the rtnl lock,
1241	 * so flush_scheduled_work() can deadlock during device
1242	 * removal.
1243	 */
1244	ipoib_workqueue = create_singlethread_workqueue("ipoib");
1245	if (!ipoib_workqueue) {
1246		ret = -ENOMEM;
1247		goto err_fs;
1248	}
1249
1250	ib_sa_register_client(&ipoib_sa_client);
1251
1252	ret = ib_register_client(&ipoib_client);
1253	if (ret)
1254		goto err_sa;
1255
1256	return 0;
1257
1258err_sa:
1259	ib_sa_unregister_client(&ipoib_sa_client);
1260	destroy_workqueue(ipoib_workqueue);
1261
1262err_fs:
1263	return ret;
1264}
1265
/*
 * Module teardown.
 *
 * Deregisters the vlan_config/vlan_unconfig event handlers using the tags
 * saved at init time, unregisters the IB client and the SA client, and
 * finally destroys the ipoib workqueue.
 */
static void __exit
ipoib_cleanup_module(void)
{

	EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach);
	EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach);
	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);
}
1276
1277/*
1278 * Infiniband output routine.
1279 */
1280static int
1281ipoib_output(struct ifnet *ifp, struct mbuf *m,
1282	const struct sockaddr *dst, struct route *ro)
1283{
1284	u_char edst[INFINIBAND_ALEN];
1285#if defined(INET) || defined(INET6)
1286	struct llentry *lle = NULL;
1287#endif
1288	struct ipoib_header *eh;
1289	int error = 0, is_gw = 0;
1290	short type;
1291
1292	if (ro != NULL)
1293		is_gw = (ro->ro_flags & RT_HAS_GW) != 0;
1294#ifdef MAC
1295	error = mac_ifnet_check_transmit(ifp, m);
1296	if (error)
1297		goto bad;
1298#endif
1299
1300	M_PROFILE(m);
1301	if (ifp->if_flags & IFF_MONITOR) {
1302		error = ENETDOWN;
1303		goto bad;
1304	}
1305	if (!((ifp->if_flags & IFF_UP) &&
1306	    (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
1307		error = ENETDOWN;
1308		goto bad;
1309	}
1310
1311	switch (dst->sa_family) {
1312#ifdef INET
1313	case AF_INET:
1314		if (lle != NULL && (lle->la_flags & LLE_VALID))
1315			memcpy(edst, lle->ll_addr, sizeof(edst));
1316		else if (m->m_flags & M_MCAST)
1317			ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst);
1318		else
1319			error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL);
1320		if (error)
1321			return (error == EWOULDBLOCK ? 0 : error);
1322		type = htons(ETHERTYPE_IP);
1323		break;
1324	case AF_ARP:
1325	{
1326		struct arphdr *ah;
1327		ah = mtod(m, struct arphdr *);
1328		ah->ar_hrd = htons(ARPHRD_INFINIBAND);
1329
1330		switch(ntohs(ah->ar_op)) {
1331		case ARPOP_REVREQUEST:
1332		case ARPOP_REVREPLY:
1333			type = htons(ETHERTYPE_REVARP);
1334			break;
1335		case ARPOP_REQUEST:
1336		case ARPOP_REPLY:
1337		default:
1338			type = htons(ETHERTYPE_ARP);
1339			break;
1340		}
1341
1342		if (m->m_flags & M_BCAST)
1343			bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN);
1344		else
1345			bcopy(ar_tha(ah), edst, INFINIBAND_ALEN);
1346
1347	}
1348	break;
1349#endif
1350#ifdef INET6
1351	case AF_INET6:
1352		if (lle != NULL && (lle->la_flags & LLE_VALID))
1353			memcpy(edst, lle->ll_addr, sizeof(edst));
1354		else if (m->m_flags & M_MCAST)
1355			ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst);
1356		else
1357			error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL);
1358		if (error)
1359			return error;
1360		type = htons(ETHERTYPE_IPV6);
1361		break;
1362#endif
1363
1364	default:
1365		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
1366		error = EAFNOSUPPORT;
1367		goto bad;
1368	}
1369
1370	/*
1371	 * Add local net header.  If no space in first mbuf,
1372	 * allocate another.
1373	 */
1374	M_PREPEND(m, IPOIB_HEADER_LEN, M_NOWAIT);
1375	if (m == NULL) {
1376		error = ENOBUFS;
1377		goto bad;
1378	}
1379	eh = mtod(m, struct ipoib_header *);
1380	(void)memcpy(&eh->proto, &type, sizeof(eh->proto));
1381	(void)memcpy(&eh->hwaddr, edst, sizeof (edst));
1382
1383	/*
1384	 * Queue message on interface, update output statistics if
1385	 * successful, and start output if interface not yet active.
1386	 */
1387	return ((ifp->if_transmit)(ifp, m));
1388bad:
1389	if (m != NULL)
1390		m_freem(m);
1391	return (error);
1392}
1393
1394/*
1395 * Upper layer processing for a received Infiniband packet.
1396 */
1397void
1398ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto)
1399{
1400	int isr;
1401
1402#ifdef MAC
1403	/*
1404	 * Tag the mbuf with an appropriate MAC label before any other
1405	 * consumers can get to it.
1406	 */
1407	mac_ifnet_create_mbuf(ifp, m);
1408#endif
1409	/* Allow monitor mode to claim this frame, after stats are updated. */
1410	if (ifp->if_flags & IFF_MONITOR) {
1411		if_printf(ifp, "discard frame at IFF_MONITOR\n");
1412		m_freem(m);
1413		return;
1414	}
1415	/*
1416	 * Dispatch frame to upper layer.
1417	 */
1418	switch (proto) {
1419#ifdef INET
1420	case ETHERTYPE_IP:
1421		isr = NETISR_IP;
1422		break;
1423
1424	case ETHERTYPE_ARP:
1425		if (ifp->if_flags & IFF_NOARP) {
1426			/* Discard packet if ARP is disabled on interface */
1427			m_freem(m);
1428			return;
1429		}
1430		isr = NETISR_ARP;
1431		break;
1432#endif
1433#ifdef INET6
1434	case ETHERTYPE_IPV6:
1435		isr = NETISR_IPV6;
1436		break;
1437#endif
1438	default:
1439		goto discard;
1440	}
1441	netisr_dispatch(isr, m);
1442	return;
1443
1444discard:
1445	m_freem(m);
1446}
1447
1448/*
1449 * Process a received Infiniband packet.
1450 */
1451static void
1452ipoib_input(struct ifnet *ifp, struct mbuf *m)
1453{
1454	struct ipoib_header *eh;
1455
1456	if ((ifp->if_flags & IFF_UP) == 0) {
1457		m_freem(m);
1458		return;
1459	}
1460	CURVNET_SET_QUIET(ifp->if_vnet);
1461
1462	/* Let BPF have it before we strip the header. */
1463	IPOIB_MTAP(ifp, m);
1464	eh = mtod(m, struct ipoib_header *);
1465	/*
1466	 * Reset layer specific mbuf flags to avoid confusing upper layers.
1467	 * Strip off Infiniband header.
1468	 */
1469	m->m_flags &= ~M_VLANTAG;
1470	m_clrprotoflags(m);
1471	m_adj(m, IPOIB_HEADER_LEN);
1472
1473	if (IPOIB_IS_MULTICAST(eh->hwaddr)) {
1474		if (memcmp(eh->hwaddr, ifp->if_broadcastaddr,
1475		    ifp->if_addrlen) == 0)
1476			m->m_flags |= M_BCAST;
1477		else
1478			m->m_flags |= M_MCAST;
1479		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
1480	}
1481
1482	ipoib_demux(ifp, m, ntohs(eh->proto));
1483	CURVNET_RESTORE();
1484}
1485
1486static int
1487ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
1488	struct sockaddr *sa)
1489{
1490	struct sockaddr_dl *sdl;
1491#ifdef INET
1492	struct sockaddr_in *sin;
1493#endif
1494#ifdef INET6
1495	struct sockaddr_in6 *sin6;
1496#endif
1497	u_char *e_addr;
1498
1499	switch(sa->sa_family) {
1500	case AF_LINK:
1501		/*
1502		 * No mapping needed. Just check that it's a valid MC address.
1503		 */
1504		sdl = (struct sockaddr_dl *)sa;
1505		e_addr = LLADDR(sdl);
1506		if (!IPOIB_IS_MULTICAST(e_addr))
1507			return EADDRNOTAVAIL;
1508		*llsa = NULL;
1509		return 0;
1510
1511#ifdef INET
1512	case AF_INET:
1513		sin = (struct sockaddr_in *)sa;
1514		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
1515			return EADDRNOTAVAIL;
1516		sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
1517		sdl->sdl_alen = INFINIBAND_ALEN;
1518		e_addr = LLADDR(sdl);
1519		ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr,
1520		    e_addr);
1521		*llsa = (struct sockaddr *)sdl;
1522		return 0;
1523#endif
1524#ifdef INET6
1525	case AF_INET6:
1526		sin6 = (struct sockaddr_in6 *)sa;
1527		/*
1528		 * An IP6 address of 0 means listen to all
1529		 * of the multicast address used for IP6.
1530		 * This has no meaning in ipoib.
1531		 */
1532		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
1533			return EADDRNOTAVAIL;
1534		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
1535			return EADDRNOTAVAIL;
1536		sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
1537		sdl->sdl_alen = INFINIBAND_ALEN;
1538		e_addr = LLADDR(sdl);
1539		ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr);
1540		*llsa = (struct sockaddr *)sdl;
1541		return 0;
1542#endif
1543
1544	default:
1545		return EAFNOSUPPORT;
1546	}
1547}
1548
/*
 * Register ipoib_init_module() and ipoib_cleanup_module() through the
 * module_init()/module_exit() macros.
 */
module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);
1551
1552static int
1553ipoib_evhand(module_t mod, int event, void *arg)
1554{
1555	                return (0);
1556}
1557
1558static moduledata_t ipoib_mod = {
1559	                .name = "ipoib",
1560			                .evhand = ipoib_evhand,
1561};
1562
/*
 * Declare the ipoib module and its dependencies on the ibcore and linuxkpi
 * modules (version arguments 1, 1, 1 for each).
 */
DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY);
MODULE_DEPEND(ipoib, ibcore, 1, 1, 1);
MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1);
1566