ipoib_multicast.c revision 331769
1/*
2 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
5 *
6 * This software is available to you under a choice of one of two
7 * licenses.  You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
11 *
12 *     Redistribution and use in source and binary forms, with or
13 *     without modification, are permitted provided that the following
14 *     conditions are met:
15 *
16 *      - Redistributions of source code must retain the above
17 *        copyright notice, this list of conditions and the following
18 *        disclaimer.
19 *
20 *      - Redistributions in binary form must reproduce the above
21 *        copyright notice, this list of conditions and the following
22 *        disclaimer in the documentation and/or other materials
23 *        provided with the distribution.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 * SOFTWARE.
33 */
34
35#include "ipoib.h"
36
37#include <linux/delay.h>
38#include <linux/completion.h>
39
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/*
 * Multicast debug verbosity knob, exported as a module parameter with
 * mode 0644.  It is not referenced directly in this file; presumably the
 * ipoib_dbg_mcast() macro consults it -- confirm in ipoib.h.
 */
static int mcast_debug_level = 1;

module_param(mcast_debug_level, int, 0644);
MODULE_PARM_DESC(mcast_debug_level, "Enable multicast debug tracing if > 0");
#endif

/*
 * Held in this file around every test/clear of IPOIB_MCAST_RUN that is
 * paired with queueing or cancelling priv->mcast_task.
 */
static DEFINE_MUTEX(mcast_mutex);
49
50struct ipoib_mcast_iter {
51	struct ipoib_dev_priv *priv;
52	union ib_gid       mgid;
53	unsigned long      created;
54	unsigned int       queuelen;
55	unsigned int       complete;
56	unsigned int       send_only;
57};
58
59static void ipoib_mcast_free(struct ipoib_mcast *mcast)
60{
61	struct ifnet *dev = mcast->priv->dev;
62	int tx_dropped = 0;
63
64	ipoib_dbg_mcast(mcast->priv, "deleting multicast group %16D\n",
65			mcast->mcmember.mgid.raw, ":");
66
67	if (mcast->ah)
68		ipoib_put_ah(mcast->ah);
69
70	tx_dropped = mcast->pkt_queue.ifq_len;
71	_IF_DRAIN(&mcast->pkt_queue);	/* XXX Locking. */
72
73	if_inc_counter(dev, IFCOUNTER_OERRORS, tx_dropped);
74
75	kfree(mcast);
76}
77
78static struct ipoib_mcast *ipoib_mcast_alloc(struct ipoib_dev_priv *priv,
79					     int can_sleep)
80{
81	struct ipoib_mcast *mcast;
82
83	mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC);
84	if (!mcast)
85		return NULL;
86
87	mcast->priv = priv;
88	mcast->created = jiffies;
89	mcast->backoff = 1;
90
91	INIT_LIST_HEAD(&mcast->list);
92	bzero(&mcast->pkt_queue, sizeof(mcast->pkt_queue));
93
94	return mcast;
95}
96
97static struct ipoib_mcast *__ipoib_mcast_find(struct ipoib_dev_priv *priv,
98    void *mgid)
99{
100	struct rb_node *n = priv->multicast_tree.rb_node;
101
102	while (n) {
103		struct ipoib_mcast *mcast;
104		int ret;
105
106		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
107
108		ret = memcmp(mgid, mcast->mcmember.mgid.raw,
109			     sizeof (union ib_gid));
110		if (ret < 0)
111			n = n->rb_left;
112		else if (ret > 0)
113			n = n->rb_right;
114		else
115			return mcast;
116	}
117
118	return NULL;
119}
120
121static int __ipoib_mcast_add(struct ipoib_dev_priv *priv,
122    struct ipoib_mcast *mcast)
123{
124	struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL;
125
126	while (*n) {
127		struct ipoib_mcast *tmcast;
128		int ret;
129
130		pn = *n;
131		tmcast = rb_entry(pn, struct ipoib_mcast, rb_node);
132
133		ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw,
134			     sizeof (union ib_gid));
135		if (ret < 0)
136			n = &pn->rb_left;
137		else if (ret > 0)
138			n = &pn->rb_right;
139		else
140			return -EEXIST;
141	}
142
143	rb_link_node(&mcast->rb_node, pn, n);
144	rb_insert_color(&mcast->rb_node, &priv->multicast_tree);
145
146	return 0;
147}
148
149static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
150				   struct ib_sa_mcmember_rec *mcmember)
151{
152	struct ipoib_dev_priv *priv = mcast->priv;
153	struct ifnet *dev = priv->dev;
154	struct ipoib_ah *ah;
155	int ret;
156	int set_qkey = 0;
157
158	mcast->mcmember = *mcmember;
159
160	/* Set the cached Q_Key before we attach if it's the broadcast group */
161	if (!memcmp(mcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4,
162		    sizeof (union ib_gid))) {
163		spin_lock_irq(&priv->lock);
164		if (!priv->broadcast) {
165			spin_unlock_irq(&priv->lock);
166			return -EAGAIN;
167		}
168		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
169		spin_unlock_irq(&priv->lock);
170		priv->tx_wr.remote_qkey = priv->qkey;
171		set_qkey = 1;
172	}
173
174	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
175		if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
176			ipoib_warn(priv, "multicast group %16D already attached\n",
177				   mcast->mcmember.mgid.raw, ":");
178
179			return 0;
180		}
181
182		ret = ipoib_mcast_attach(priv, be16_to_cpu(mcast->mcmember.mlid),
183					 &mcast->mcmember.mgid, set_qkey);
184		if (ret < 0) {
185			ipoib_warn(priv, "couldn't attach QP to multicast group %16D\n",
186				   mcast->mcmember.mgid.raw, ":");
187
188			clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
189			return ret;
190		}
191	}
192
193	{
194		struct ib_ah_attr av = {
195			.dlid	       = be16_to_cpu(mcast->mcmember.mlid),
196			.port_num      = priv->port,
197			.sl	       = mcast->mcmember.sl,
198			.ah_flags      = IB_AH_GRH,
199			.static_rate   = mcast->mcmember.rate,
200			.grh	       = {
201				.flow_label    = be32_to_cpu(mcast->mcmember.flow_label),
202				.hop_limit     = mcast->mcmember.hop_limit,
203				.sgid_index    = 0,
204				.traffic_class = mcast->mcmember.traffic_class
205			}
206		};
207		av.grh.dgid = mcast->mcmember.mgid;
208
209		ah = ipoib_create_ah(priv, priv->pd, &av);
210		if (!ah) {
211			ipoib_warn(priv, "ib_address_create failed\n");
212		} else {
213			spin_lock_irq(&priv->lock);
214			mcast->ah = ah;
215			spin_unlock_irq(&priv->lock);
216
217			ipoib_dbg_mcast(priv, "MGID %16D AV %p, LID 0x%04x, SL %d\n",
218					mcast->mcmember.mgid.raw, ":",
219					mcast->ah->ah,
220					be16_to_cpu(mcast->mcmember.mlid),
221					mcast->mcmember.sl);
222		}
223	}
224
225	/* actually send any queued packets */
226	while (mcast->pkt_queue.ifq_len) {
227		struct mbuf *mb;
228		_IF_DEQUEUE(&mcast->pkt_queue, mb);
229		mb->m_pkthdr.rcvif = dev;
230
231		if (dev->if_transmit(dev, mb))
232			ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n");
233	}
234
235	return 0;
236}
237
238static int
239ipoib_mcast_sendonly_join_complete(int status,
240				   struct ib_sa_multicast *multicast)
241{
242	struct ipoib_mcast *mcast = multicast->context;
243	struct ipoib_dev_priv *priv = mcast->priv;
244
245	/* We trap for port events ourselves. */
246	if (status == -ENETRESET)
247		return 0;
248
249	if (!status)
250		status = ipoib_mcast_join_finish(mcast, &multicast->rec);
251
252	if (status) {
253		if (mcast->logcount++ < 20)
254			ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n",
255					mcast->mcmember.mgid.raw, ":", status);
256
257		/* Flush out any queued packets */
258		if_inc_counter(priv->dev, IFCOUNTER_OERRORS, mcast->pkt_queue.ifq_len);
259		_IF_DRAIN(&mcast->pkt_queue);
260
261		/* Clear the busy flag so we try again */
262		status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
263					    &mcast->flags);
264	}
265	return status;
266}
267
268static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
269{
270	struct ipoib_dev_priv *priv = mcast->priv;
271	struct ib_sa_mcmember_rec rec = {
272#if 0				/* Some SMs don't support send-only yet */
273		.join_state = 4
274#else
275		.join_state = 1
276#endif
277	};
278	int ret = 0;
279
280	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
281		ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
282		return -ENODEV;
283	}
284
285	if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
286		ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
287		return -EBUSY;
288	}
289
290	rec.mgid     = mcast->mcmember.mgid;
291	rec.port_gid = priv->local_gid;
292	rec.pkey     = cpu_to_be16(priv->pkey);
293
294	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
295					 priv->port, &rec,
296					 IB_SA_MCMEMBER_REC_MGID	|
297					 IB_SA_MCMEMBER_REC_PORT_GID	|
298					 IB_SA_MCMEMBER_REC_PKEY	|
299					 IB_SA_MCMEMBER_REC_JOIN_STATE,
300					 GFP_ATOMIC,
301					 ipoib_mcast_sendonly_join_complete,
302					 mcast);
303	if (IS_ERR(mcast->mc)) {
304		ret = PTR_ERR(mcast->mc);
305		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
306		ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
307			   ret);
308	} else {
309		ipoib_dbg_mcast(priv, "no multicast record for %16D, starting join\n",
310				mcast->mcmember.mgid.raw, ":");
311	}
312
313	return ret;
314}
315
316void ipoib_mcast_carrier_on_task(struct work_struct *work)
317{
318	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
319						   carrier_on_task);
320	struct ib_port_attr attr;
321
322	/*
323	 * Take rtnl_lock to avoid racing with ipoib_stop() and
324	 * turning the carrier back on while a device is being
325	 * removed.
326	 */
327	if (ib_query_port(priv->ca, priv->port, &attr) ||
328	    attr.state != IB_PORT_ACTIVE) {
329		ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
330		return;
331	}
332	if_link_state_change(priv->dev, LINK_STATE_UP);
333}
334
335static int ipoib_mcast_join_complete(int status,
336				     struct ib_sa_multicast *multicast)
337{
338	struct ipoib_mcast *mcast = multicast->context;
339	struct ipoib_dev_priv *priv = mcast->priv;
340
341	ipoib_dbg_mcast(priv, "join completion for %16D (status %d)\n",
342			mcast->mcmember.mgid.raw, ":", status);
343
344	/* We trap for port events ourselves. */
345	if (status == -ENETRESET)
346		return 0;
347
348	if (!status)
349		status = ipoib_mcast_join_finish(mcast, &multicast->rec);
350
351	if (!status) {
352		mcast->backoff = 1;
353		mutex_lock(&mcast_mutex);
354		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
355			queue_delayed_work(ipoib_workqueue,
356					   &priv->mcast_task, 0);
357		mutex_unlock(&mcast_mutex);
358
359		/*
360		 * Defer carrier on work to ipoib_workqueue to avoid a
361		 * deadlock on rtnl_lock here.
362		 */
363		if (mcast == priv->broadcast)
364			queue_work(ipoib_workqueue, &priv->carrier_on_task);
365
366		return 0;
367	}
368
369	if (mcast->logcount++ < 20) {
370		if (status == -ETIMEDOUT || status == -EAGAIN) {
371			ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n",
372					mcast->mcmember.mgid.raw, ":", status);
373		} else {
374			ipoib_warn(priv, "multicast join failed for %16D, status %d\n",
375				   mcast->mcmember.mgid.raw, ":", status);
376		}
377	}
378
379	mcast->backoff *= 2;
380	if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
381		mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
382
383	/* Clear the busy flag so we try again */
384	status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
385
386	mutex_lock(&mcast_mutex);
387	spin_lock_irq(&priv->lock);
388	if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
389		queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
390				   mcast->backoff * HZ);
391	spin_unlock_irq(&priv->lock);
392	mutex_unlock(&mcast_mutex);
393
394	return status;
395}
396
397static void ipoib_mcast_join(struct ipoib_dev_priv *priv,
398    struct ipoib_mcast *mcast, int create)
399{
400	struct ib_sa_mcmember_rec rec = {
401		.join_state = 1
402	};
403	ib_sa_comp_mask comp_mask;
404	int ret = 0;
405
406	ipoib_dbg_mcast(priv, "joining MGID %16D\n",
407	    mcast->mcmember.mgid.raw, ":");
408
409	rec.mgid     = mcast->mcmember.mgid;
410	rec.port_gid = priv->local_gid;
411	rec.pkey     = cpu_to_be16(priv->pkey);
412
413	comp_mask =
414		IB_SA_MCMEMBER_REC_MGID		|
415		IB_SA_MCMEMBER_REC_PORT_GID	|
416		IB_SA_MCMEMBER_REC_PKEY		|
417		IB_SA_MCMEMBER_REC_JOIN_STATE;
418
419	if (create) {
420		comp_mask |=
421			IB_SA_MCMEMBER_REC_QKEY			|
422			IB_SA_MCMEMBER_REC_MTU_SELECTOR		|
423			IB_SA_MCMEMBER_REC_MTU			|
424			IB_SA_MCMEMBER_REC_TRAFFIC_CLASS	|
425			IB_SA_MCMEMBER_REC_RATE_SELECTOR	|
426			IB_SA_MCMEMBER_REC_RATE			|
427			IB_SA_MCMEMBER_REC_SL			|
428			IB_SA_MCMEMBER_REC_FLOW_LABEL		|
429			IB_SA_MCMEMBER_REC_HOP_LIMIT;
430
431		rec.qkey	  = priv->broadcast->mcmember.qkey;
432		rec.mtu_selector  = IB_SA_EQ;
433		rec.mtu		  = priv->broadcast->mcmember.mtu;
434		rec.traffic_class = priv->broadcast->mcmember.traffic_class;
435		rec.rate_selector = IB_SA_EQ;
436		rec.rate	  = priv->broadcast->mcmember.rate;
437		rec.sl		  = priv->broadcast->mcmember.sl;
438		rec.flow_label	  = priv->broadcast->mcmember.flow_label;
439		rec.hop_limit	  = priv->broadcast->mcmember.hop_limit;
440	}
441
442	set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
443	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
444					 &rec, comp_mask, GFP_KERNEL,
445					 ipoib_mcast_join_complete, mcast);
446	if (IS_ERR(mcast->mc)) {
447		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
448		ret = PTR_ERR(mcast->mc);
449		ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
450
451		mcast->backoff *= 2;
452		if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
453			mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
454
455		mutex_lock(&mcast_mutex);
456		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
457			queue_delayed_work(ipoib_workqueue,
458					   &priv->mcast_task,
459					   mcast->backoff * HZ);
460		mutex_unlock(&mcast_mutex);
461	}
462}
463
464void ipoib_mcast_join_task(struct work_struct *work)
465{
466	struct ipoib_dev_priv *priv =
467		container_of(work, struct ipoib_dev_priv, mcast_task.work);
468	struct ifnet *dev = priv->dev;
469	struct ib_port_attr attr;
470
471	ipoib_dbg_mcast(priv, "Running join task. flags 0x%lX\n", priv->flags);
472
473	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
474		return;
475
476	if (ib_query_port(priv->ca, priv->port, &attr) ||
477            attr.state != IB_PORT_ACTIVE) {
478		ipoib_dbg(priv, "%s: port state is not ACTIVE (state = %d) suspend task.\n",
479                          __func__, attr.state);
480		return;
481	}
482
483	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL))
484		ipoib_warn(priv, "ib_query_gid() failed\n");
485	else
486		memcpy(IF_LLADDR(dev) + 4, priv->local_gid.raw, sizeof (union ib_gid));
487
488	{
489		struct ib_port_attr attr;
490
491		if (!ib_query_port(priv->ca, priv->port, &attr))
492			priv->local_lid = attr.lid;
493		else
494			ipoib_warn(priv, "ib_query_port failed\n");
495	}
496
497	if (!priv->broadcast) {
498		struct ipoib_mcast *broadcast;
499
500		if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
501			return;
502
503		broadcast = ipoib_mcast_alloc(priv, 1);
504		if (!broadcast) {
505			ipoib_warn(priv, "failed to allocate broadcast group\n");
506			mutex_lock(&mcast_mutex);
507			if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
508				queue_delayed_work(ipoib_workqueue,
509						   &priv->mcast_task, HZ);
510			mutex_unlock(&mcast_mutex);
511			return;
512		}
513
514		spin_lock_irq(&priv->lock);
515		memcpy(broadcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4,
516		       sizeof (union ib_gid));
517		priv->broadcast = broadcast;
518
519		__ipoib_mcast_add(priv, priv->broadcast);
520		spin_unlock_irq(&priv->lock);
521	}
522
523	if (priv->broadcast &&
524	    !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
525		if (priv->broadcast &&
526		    !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
527			ipoib_mcast_join(priv, priv->broadcast, 0);
528		return;
529	}
530
531	while (1) {
532		struct ipoib_mcast *mcast = NULL;
533
534		spin_lock_irq(&priv->lock);
535		list_for_each_entry(mcast, &priv->multicast_list, list) {
536			if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
537			    && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
538			    && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
539				/* Found the next unjoined group */
540				break;
541			}
542		}
543		spin_unlock_irq(&priv->lock);
544
545		if (&mcast->list == &priv->multicast_list) {
546			/* All done */
547			break;
548		}
549
550		ipoib_mcast_join(priv, mcast, 1);
551		return;
552	}
553
554	spin_lock_irq(&priv->lock);
555	if (priv->broadcast)
556		priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
557	else
558		priv->mcast_mtu = priv->admin_mtu;
559	spin_unlock_irq(&priv->lock);
560
561	if (!ipoib_cm_admin_enabled(priv))
562		ipoib_change_mtu(priv, min(priv->mcast_mtu, priv->admin_mtu));
563
564	ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
565
566	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
567}
568
569int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv)
570{
571	ipoib_dbg_mcast(priv, "starting multicast thread flags 0x%lX\n",
572	    priv->flags);
573
574	mutex_lock(&mcast_mutex);
575	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
576		queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
577	mutex_unlock(&mcast_mutex);
578
579	return 0;
580}
581
582int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush)
583{
584
585	ipoib_dbg_mcast(priv, "stopping multicast thread\n");
586
587	mutex_lock(&mcast_mutex);
588	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
589	cancel_delayed_work(&priv->mcast_task);
590	mutex_unlock(&mcast_mutex);
591
592	if (flush)
593		flush_workqueue(ipoib_workqueue);
594
595	return 0;
596}
597
598static int ipoib_mcast_leave(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast)
599{
600	int ret = 0;
601
602	if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
603		ib_sa_free_multicast(mcast->mc);
604
605	if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
606		ipoib_dbg_mcast(priv, "leaving MGID %16D\n",
607				mcast->mcmember.mgid.raw, ":");
608
609		/* Remove ourselves from the multicast group */
610		ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid,
611				      be16_to_cpu(mcast->mcmember.mlid));
612		if (ret)
613			ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
614	}
615
616	return 0;
617}
618
619void
620ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb)
621{
622	struct ifnet *dev = priv->dev;
623	struct ipoib_mcast *mcast;
624
625	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)		||
626	    !priv->broadcast					||
627	    !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
628		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
629		m_freem(mb);
630		return;
631	}
632
633	mcast = __ipoib_mcast_find(priv, mgid);
634	if (!mcast) {
635		/* Let's create a new send only group now */
636		ipoib_dbg_mcast(priv, "setting up send only multicast group for %16D\n",
637				mgid, ":");
638
639		mcast = ipoib_mcast_alloc(priv, 0);
640		if (!mcast) {
641			ipoib_warn(priv, "unable to allocate memory for "
642				   "multicast structure\n");
643			if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
644			m_freem(mb);
645			goto out;
646		}
647
648		set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
649		memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
650		__ipoib_mcast_add(priv, mcast);
651		list_add_tail(&mcast->list, &priv->multicast_list);
652	}
653
654	if (!mcast->ah) {
655		if (mcast->pkt_queue.ifq_len < IPOIB_MAX_MCAST_QUEUE) {
656			_IF_ENQUEUE(&mcast->pkt_queue, mb);
657		} else {
658			if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
659			m_freem(mb);
660		}
661
662		if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
663			ipoib_dbg_mcast(priv, "no address vector, "
664					"but multicast join already started\n");
665		else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
666			ipoib_mcast_sendonly_join(mcast);
667
668		/*
669		 * If lookup completes between here and out:, don't
670		 * want to send packet twice.
671		 */
672		mcast = NULL;
673	}
674
675out:
676	if (mcast && mcast->ah)
677		ipoib_send(priv, mb, mcast->ah, IB_MULTICAST_QPN);
678}
679
680void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv)
681{
682	LIST_HEAD(remove_list);
683	struct ipoib_mcast *mcast, *tmcast;
684	unsigned long flags;
685
686	ipoib_dbg_mcast(priv, "flushing multicast list\n");
687
688	spin_lock_irqsave(&priv->lock, flags);
689
690	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
691		list_del(&mcast->list);
692		rb_erase(&mcast->rb_node, &priv->multicast_tree);
693		list_add_tail(&mcast->list, &remove_list);
694	}
695
696	if (priv->broadcast) {
697		rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree);
698		list_add_tail(&priv->broadcast->list, &remove_list);
699		priv->broadcast = NULL;
700	}
701
702	spin_unlock_irqrestore(&priv->lock, flags);
703
704	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
705		ipoib_mcast_leave(priv, mcast);
706		ipoib_mcast_free(mcast);
707	}
708}
709
710static int ipoib_mcast_addr_is_valid(const u8 *addr, unsigned int addrlen,
711				     const u8 *broadcast)
712{
713	if (addrlen != INFINIBAND_ALEN)
714		return 0;
715	/* reserved QPN, prefix, scope */
716	if (memcmp(addr, broadcast, 6))
717		return 0;
718	/* signature lower, pkey */
719	if (memcmp(addr + 7, broadcast + 7, 3))
720		return 0;
721	return 1;
722}
723
724void ipoib_mcast_restart_task(struct work_struct *work)
725{
726	struct ipoib_dev_priv *priv =
727		container_of(work, struct ipoib_dev_priv, restart_task);
728	ipoib_mcast_restart(priv);
729}
730
731void ipoib_mcast_restart(struct ipoib_dev_priv *priv)
732{
733	struct ifnet *dev = priv->dev;
734	struct ifmultiaddr *ifma;
735	struct ipoib_mcast *mcast, *tmcast;
736	LIST_HEAD(remove_list);
737	struct ib_sa_mcmember_rec rec;
738	int addrlen;
739
740	ipoib_dbg_mcast(priv, "restarting multicast task flags 0x%lX\n",
741	    priv->flags);
742
743	ipoib_mcast_stop_thread(priv, 0);
744
745	if_maddr_rlock(dev);
746	spin_lock(&priv->lock);
747
748	/*
749	 * Unfortunately, the networking core only gives us a list of all of
750	 * the multicast hardware addresses. We need to figure out which ones
751	 * are new and which ones have been removed
752	 */
753
754	/* Clear out the found flag */
755	list_for_each_entry(mcast, &priv->multicast_list, list)
756		clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
757
758	/* Mark all of the entries that are found or don't exist */
759
760
761	TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) {
762		union ib_gid mgid;
763		uint8_t *addr;
764
765		if (ifma->ifma_addr->sa_family != AF_LINK)
766			continue;
767		addr = LLADDR((struct sockaddr_dl *)ifma->ifma_addr);
768		addrlen = ((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen;
769		if (!ipoib_mcast_addr_is_valid(addr, addrlen,
770					       dev->if_broadcastaddr))
771			continue;
772
773		memcpy(mgid.raw, addr + 4, sizeof mgid);
774
775		mcast = __ipoib_mcast_find(priv, &mgid);
776		if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
777			struct ipoib_mcast *nmcast;
778
779			/* ignore group which is directly joined by userspace */
780			if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) &&
781			    !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) {
782				ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %16D\n",
783						mgid.raw, ":");
784				continue;
785			}
786
787			/* Not found or send-only group, let's add a new entry */
788			ipoib_dbg_mcast(priv, "adding multicast entry for mgid %16D\n",
789					mgid.raw, ":");
790
791			nmcast = ipoib_mcast_alloc(priv, 0);
792			if (!nmcast) {
793				ipoib_warn(priv, "unable to allocate memory for multicast structure\n");
794				continue;
795			}
796
797			set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags);
798
799			nmcast->mcmember.mgid = mgid;
800
801			if (mcast) {
802				/* Destroy the send only entry */
803				list_move_tail(&mcast->list, &remove_list);
804
805				rb_replace_node(&mcast->rb_node,
806						&nmcast->rb_node,
807						&priv->multicast_tree);
808			} else
809				__ipoib_mcast_add(priv, nmcast);
810
811			list_add_tail(&nmcast->list, &priv->multicast_list);
812		}
813
814		if (mcast)
815			set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
816	}
817
818	/* Remove all of the entries don't exist anymore */
819	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
820		if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) &&
821		    !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
822			ipoib_dbg_mcast(priv, "deleting multicast group %16D\n",
823					mcast->mcmember.mgid.raw, ":");
824
825			rb_erase(&mcast->rb_node, &priv->multicast_tree);
826
827			/* Move to the remove list */
828			list_move_tail(&mcast->list, &remove_list);
829		}
830	}
831
832	spin_unlock(&priv->lock);
833	if_maddr_runlock(dev);
834
835	/* We have to cancel outside of the spinlock */
836	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
837		ipoib_mcast_leave(mcast->priv, mcast);
838		ipoib_mcast_free(mcast);
839	}
840
841	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
842		ipoib_mcast_start_thread(priv);
843}
844
845#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
846
847struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv)
848{
849	struct ipoib_mcast_iter *iter;
850
851	iter = kmalloc(sizeof *iter, GFP_KERNEL);
852	if (!iter)
853		return NULL;
854
855	iter->priv = priv;
856	memset(iter->mgid.raw, 0, 16);
857
858	if (ipoib_mcast_iter_next(iter)) {
859		kfree(iter);
860		return NULL;
861	}
862
863	return iter;
864}
865
866int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter)
867{
868	struct ipoib_dev_priv *priv = iter->priv;
869	struct rb_node *n;
870	struct ipoib_mcast *mcast;
871	int ret = 1;
872
873	spin_lock_irq(&priv->lock);
874
875	n = rb_first(&priv->multicast_tree);
876
877	while (n) {
878		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
879
880		if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw,
881			   sizeof (union ib_gid)) < 0) {
882			iter->mgid      = mcast->mcmember.mgid;
883			iter->created   = mcast->created;
884			iter->queuelen  = mcast->pkt_queue.ifq_len;
885			iter->complete  = !!mcast->ah;
886			iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY));
887
888			ret = 0;
889
890			break;
891		}
892
893		n = rb_next(n);
894	}
895
896	spin_unlock_irq(&priv->lock);
897
898	return ret;
899}
900
901void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
902			   union ib_gid *mgid,
903			   unsigned long *created,
904			   unsigned int *queuelen,
905			   unsigned int *complete,
906			   unsigned int *send_only)
907{
908	*mgid      = iter->mgid;
909	*created   = iter->created;
910	*queuelen  = iter->queuelen;
911	*complete  = iter->complete;
912	*send_only = iter->send_only;
913}
914
915#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
916