1/*
2 * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5 * Copyright (c) 2008 Xsigo Systems Inc.  All rights reserved.
6 *
7 * This software is available to you under a choice of one of two
8 * licenses.  You may choose to be licensed under the terms of the GNU
9 * General Public License (GPL) Version 2, available from the file
10 * COPYING in the main directory of this source tree, or the
11 * OpenIB.org BSD license below:
12 *
13 *     Redistribution and use in source and binary forms, with or
14 *     without modification, are permitted provided that the following
15 *     conditions are met:
16 *
17 *      - Redistributions of source code must retain the above
18 *        copyright notice, this list of conditions and the following
19 *        disclaimer.
20 *
21 *      - Redistributions in binary form must reproduce the above
22 *        copyright notice, this list of conditions and the following
23 *        disclaimer in the documentation and/or other materials
24 *        provided with the distribution.
25 *
26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
30 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 * SOFTWARE.
34 *
35 */
36
37/*
38 * Abstract:
39 *    Implementation of osm_mcast_mgr_t.
40 * This file implements the Multicast Manager object.
41 */
42
43#if HAVE_CONFIG_H
44#  include <config.h>
45#endif				/* HAVE_CONFIG_H */
46
47#include <stdlib.h>
48#include <string.h>
49#include <iba/ib_types.h>
50#include <complib/cl_debug.h>
51#include <opensm/osm_opensm.h>
52#include <opensm/osm_sm.h>
53#include <opensm/osm_multicast.h>
54#include <opensm/osm_node.h>
55#include <opensm/osm_switch.h>
56#include <opensm/osm_helper.h>
57#include <opensm/osm_msgdef.h>
58
59/**********************************************************************
60 **********************************************************************/
/* Work item used while building a multicast spanning tree: one entry
   per member port still awaiting a route from the current switch. */
typedef struct osm_mcast_work_obj {
	cl_list_item_t list_item;	/* linkage for the cl_qlist routing lists */
	osm_port_t *p_port;	/* member port to route; referenced, not owned */
} osm_mcast_work_obj_t;
65
66/**********************************************************************
67 **********************************************************************/
68static osm_mcast_work_obj_t *__osm_mcast_work_obj_new(IN const osm_port_t *
69						      const p_port)
70{
71	/*
72	   TO DO - get these objects from a lockpool.
73	 */
74	osm_mcast_work_obj_t *p_obj;
75
76	/*
77	   clean allocated memory to avoid assertion when trying to insert to
78	   qlist.
79	   see cl_qlist_insert_tail(): CL_ASSERT(p_list_item->p_list != p_list)
80	 */
81	p_obj = malloc(sizeof(*p_obj));
82	if (p_obj) {
83		memset(p_obj, 0, sizeof(*p_obj));
84		p_obj->p_port = (osm_port_t *) p_port;
85	}
86
87	return (p_obj);
88}
89
90/**********************************************************************
91 **********************************************************************/
/* Release a work object created by __osm_mcast_work_obj_new().
   free(NULL) is a harmless no-op, so callers need not guard. */
static void __osm_mcast_work_obj_delete(IN osm_mcast_work_obj_t * p_wobj)
{
	free(p_wobj);
}
96
97/**********************************************************************
98 Recursively remove nodes from the tree
99 *********************************************************************/
100static void __osm_mcast_mgr_purge_tree_node(IN osm_mtree_node_t * p_mtn)
101{
102	uint8_t i;
103
104	for (i = 0; i < p_mtn->max_children; i++) {
105		if (p_mtn->child_array[i] &&
106		    (p_mtn->child_array[i] != OSM_MTREE_LEAF))
107			__osm_mcast_mgr_purge_tree_node(p_mtn->child_array[i]);
108
109		p_mtn->child_array[i] = NULL;
110
111	}
112
113	free(p_mtn);
114}
115
116/**********************************************************************
117 **********************************************************************/
118static void
119__osm_mcast_mgr_purge_tree(osm_sm_t * sm, IN osm_mgrp_t * const p_mgrp)
120{
121	OSM_LOG_ENTER(sm->p_log);
122
123	if (p_mgrp->p_root)
124		__osm_mcast_mgr_purge_tree_node(p_mgrp->p_root);
125
126	p_mgrp->p_root = NULL;
127
128	OSM_LOG_EXIT(sm->p_log);
129}
130
131/**********************************************************************
132 **********************************************************************/
133static float
134osm_mcast_mgr_compute_avg_hops(osm_sm_t * sm,
135			       const osm_mgrp_t * const p_mgrp,
136			       const osm_switch_t * const p_sw)
137{
138	float avg_hops = 0;
139	uint32_t hops = 0;
140	uint32_t num_ports = 0;
141	const osm_port_t *p_port;
142	const osm_mcm_port_t *p_mcm_port;
143	const cl_qmap_t *p_mcm_tbl;
144
145	OSM_LOG_ENTER(sm->p_log);
146
147	p_mcm_tbl = &p_mgrp->mcm_port_tbl;
148
149	/*
150	   For each member of the multicast group, compute the
151	   number of hops to its base LID.
152	 */
153	for (p_mcm_port = (osm_mcm_port_t *) cl_qmap_head(p_mcm_tbl);
154	     p_mcm_port != (osm_mcm_port_t *) cl_qmap_end(p_mcm_tbl);
155	     p_mcm_port =
156	     (osm_mcm_port_t *) cl_qmap_next(&p_mcm_port->map_item)) {
157		/*
158		   Acquire the port object for this port guid, then create
159		   the new worker object to build the list.
160		 */
161		p_port = osm_get_port_by_guid(sm->p_subn,
162					      ib_gid_get_guid(&p_mcm_port->
163							      port_gid));
164
165		if (!p_port) {
166			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A18: "
167				"No port object for port 0x%016" PRIx64 "\n",
168				cl_ntoh64(ib_gid_get_guid
169					  (&p_mcm_port->port_gid)));
170			continue;
171		}
172
173		hops += osm_switch_get_port_least_hops(p_sw, p_port);
174		num_ports++;
175	}
176
177	/*
178	   We should be here if there aren't any ports in the group.
179	 */
180	CL_ASSERT(num_ports);
181
182	if (num_ports != 0)
183		avg_hops = (float)(hops / num_ports);
184
185	OSM_LOG_EXIT(sm->p_log);
186	return (avg_hops);
187}
188
189/**********************************************************************
190 Calculate the maximal "min hops" from the given switch to any
191 of the group HCAs
192 **********************************************************************/
static float
osm_mcast_mgr_compute_max_hops(osm_sm_t * sm,
			       const osm_mgrp_t * const p_mgrp,
			       const osm_switch_t * const p_sw)
{
	uint32_t max_hops = 0;
	uint32_t hops = 0;
	const osm_port_t *p_port;
	const osm_mcm_port_t *p_mcm_port;
	const cl_qmap_t *p_mcm_tbl;

	OSM_LOG_ENTER(sm->p_log);

	p_mcm_tbl = &p_mgrp->mcm_port_tbl;

	/*
	   For each member of the multicast group, compute the
	   number of hops to its base LID.
	 */
	for (p_mcm_port = (osm_mcm_port_t *) cl_qmap_head(p_mcm_tbl);
	     p_mcm_port != (osm_mcm_port_t *) cl_qmap_end(p_mcm_tbl);
	     p_mcm_port =
	     (osm_mcm_port_t *) cl_qmap_next(&p_mcm_port->map_item)) {
		/*
		   Acquire the port object for this port guid, then create
		   the new worker object to build the list.
		 */
		p_port = osm_get_port_by_guid(sm->p_subn,
					      ib_gid_get_guid(&p_mcm_port->
							      port_gid));

		if (!p_port) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A1A: "
				"No port object for port 0x%016" PRIx64 "\n",
				cl_ntoh64(ib_gid_get_guid
					  (&p_mcm_port->port_gid)));
			continue;
		}

		/* Track the largest per-member min-hop count seen so far. */
		hops = osm_switch_get_port_least_hops(p_sw, p_port);
		if (hops > max_hops)
			max_hops = hops;
	}

	if (max_hops == 0) {
		/*
		   We only get here if the group has no reachable member
		   ports.  Return a sentinel larger than the caller's
		   best_hops starting value (10000) so this switch is
		   never selected as the tree root.
		 */
		max_hops = 10001;	/* see later - we use it to realize no hops */
	}

	OSM_LOG_EXIT(sm->p_log);
	return (float)(max_hops);
}
247
248/**********************************************************************
249   This function attempts to locate the optimal switch for the
250   center of the spanning tree.  The current algorithm chooses
251   a switch with the lowest average hop count to the members
252   of the multicast group.
253**********************************************************************/
static osm_switch_t *__osm_mcast_mgr_find_optimal_switch(osm_sm_t * sm,
							 const osm_mgrp_t *
							 const p_mgrp)
{
	cl_qmap_t *p_sw_tbl;
	const osm_switch_t *p_sw;
	const osm_switch_t *p_best_sw = NULL;
	float hops = 0;
	float best_hops = 10000;	/* any big # will do */
#ifdef OSM_VENDOR_INTF_ANAFA
	boolean_t use_avg_hops = TRUE;	/* anafa2 - bug hca on switch *//* use max hops for root */
#else
	boolean_t use_avg_hops = FALSE;	/* use max hops for root */
#endif

	OSM_LOG_ENTER(sm->p_log);

	p_sw_tbl = &sm->p_subn->sw_guid_tbl;

	CL_ASSERT(!osm_mgrp_is_empty(p_mgrp));

	/*
	   Scan every multicast-capable switch in the subnet and keep the
	   one with the lowest hop metric (average or maximum, depending
	   on the vendor build) to the group's member ports.
	 */
	for (p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
	     p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl);
	     p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) {
		if (!osm_switch_supports_mcast(p_sw))
			continue;

		if (use_avg_hops)
			hops = osm_mcast_mgr_compute_avg_hops(sm, p_mgrp, p_sw);
		else
			hops = osm_mcast_mgr_compute_max_hops(sm, p_mgrp, p_sw);

		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Switch 0x%016" PRIx64 ", hops = %f\n",
			cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)), hops);

		if (hops < best_hops) {
			p_best_sw = p_sw;
			best_hops = hops;
		}
	}

	/* p_best_sw stays NULL when no switch supports multicast. */
	if (p_best_sw)
		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
			"Best switch is 0x%" PRIx64 ", hops = %f\n",
			cl_ntoh64(osm_node_get_node_guid(p_best_sw->p_node)),
			best_hops);
	else
		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
			"No multicast capable switches detected\n");

	OSM_LOG_EXIT(sm->p_log);
	return ((osm_switch_t *) p_best_sw);
}
308
309/**********************************************************************
310   This function returns the existing or optimal root swtich for the tree.
311**********************************************************************/
312static osm_switch_t *__osm_mcast_mgr_find_root_switch(osm_sm_t * sm,
313						      const osm_mgrp_t *
314						      const p_mgrp)
315{
316	const osm_switch_t *p_sw = NULL;
317
318	OSM_LOG_ENTER(sm->p_log);
319
320	/*
321	   We always look for the best multicast tree root switch.
322	   Otherwise since we always start with a a single join
323	   the root will be always on the first switch attached to it.
324	   - Very bad ...
325	 */
326	p_sw = __osm_mcast_mgr_find_optimal_switch(sm, p_mgrp);
327
328	OSM_LOG_EXIT(sm->p_log);
329	return ((osm_switch_t *) p_sw);
330}
331
332/**********************************************************************
333 **********************************************************************/
334static osm_signal_t
335__osm_mcast_mgr_set_tbl(osm_sm_t * sm, IN osm_switch_t * const p_sw)
336{
337	osm_node_t *p_node;
338	osm_dr_path_t *p_path;
339	osm_madw_context_t mad_context;
340	ib_api_status_t status;
341	uint32_t block_id_ho = 0;
342	int16_t block_num = 0;
343	uint32_t position = 0;
344	uint32_t max_position;
345	osm_mcast_tbl_t *p_tbl;
346	ib_net16_t block[IB_MCAST_BLOCK_SIZE];
347	osm_signal_t signal = OSM_SIGNAL_DONE;
348
349	CL_ASSERT(sm);
350
351	OSM_LOG_ENTER(sm->p_log);
352
353	CL_ASSERT(p_sw);
354
355	p_node = p_sw->p_node;
356
357	CL_ASSERT(p_node);
358
359	p_path = osm_physp_get_dr_path_ptr(osm_node_get_physp_ptr(p_node, 0));
360
361	/*
362	   Send multicast forwarding table blocks to the switch
363	   as long as the switch indicates it has blocks needing
364	   configuration.
365	 */
366
367	mad_context.mft_context.node_guid = osm_node_get_node_guid(p_node);
368	mad_context.mft_context.set_method = TRUE;
369
370	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
371	max_position = p_tbl->max_position;
372
373	while (osm_mcast_tbl_get_block(p_tbl, block_num,
374				       (uint8_t) position, block)) {
375		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
376			"Writing MFT block 0x%X\n", block_id_ho);
377
378		block_id_ho = block_num + (position << 28);
379
380		status = osm_req_set(sm, p_path, (void *)block, sizeof(block),
381				     IB_MAD_ATTR_MCAST_FWD_TBL,
382				     cl_hton32(block_id_ho),
383				     CL_DISP_MSGID_NONE, &mad_context);
384
385		if (status != IB_SUCCESS) {
386			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A02: "
387				"Sending multicast fwd. tbl. block failed (%s)\n",
388				ib_get_err_str(status));
389		}
390
391		signal = OSM_SIGNAL_DONE_PENDING;
392
393		if (++position > max_position) {
394			position = 0;
395			block_num++;
396		}
397	}
398
399	OSM_LOG_EXIT(sm->p_log);
400	return (signal);
401}
402
403/**********************************************************************
404  This is part of the recursive function to compute the paths in the
405  spanning tree that eminate from this switch.  On input, the p_list
406  contains the group members that must be routed from this switch.
407**********************************************************************/
408static void
409__osm_mcast_mgr_subdivide(osm_sm_t * sm,
410			  osm_mgrp_t * const p_mgrp,
411			  osm_switch_t * const p_sw,
412			  cl_qlist_t * const p_list,
413			  cl_qlist_t * const list_array,
414			  uint8_t const array_size)
415{
416	uint8_t port_num;
417	uint16_t mlid_ho;
418	boolean_t ignore_existing;
419	osm_mcast_work_obj_t *p_wobj;
420
421	OSM_LOG_ENTER(sm->p_log);
422
423	mlid_ho = cl_ntoh16(osm_mgrp_get_mlid(p_mgrp));
424
425	/*
426	   For Multicast Groups, we want not to count on previous
427	   configurations - since we can easily generate a storm
428	   by loops.
429	 */
430	ignore_existing = TRUE;
431
432	/*
433	   Subdivide the set of ports into non-overlapping subsets
434	   that will be routed to other switches.
435	 */
436	while ((p_wobj =
437		(osm_mcast_work_obj_t *) cl_qlist_remove_head(p_list)) !=
438	       (osm_mcast_work_obj_t *) cl_qlist_end(p_list)) {
439		port_num =
440		    osm_switch_recommend_mcast_path(p_sw, p_wobj->p_port,
441						    mlid_ho, ignore_existing);
442
443		if (port_num == OSM_NO_PATH) {
444			/*
445			   This typically occurs if the switch does not support
446			   multicast and the multicast tree must branch at this
447			   switch.
448			 */
449			uint64_t node_guid_ho =
450			    cl_ntoh64(osm_node_get_node_guid(p_sw->p_node));
451			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A03: "
452				"Error routing MLID 0x%X through switch 0x%"
453				PRIx64 "\n"
454				"\t\t\t\tNo multicast paths from this switch for port "
455				"with LID %u\n", mlid_ho, node_guid_ho,
456				cl_ntoh16(osm_port_get_base_lid
457					  (p_wobj->p_port)));
458
459			__osm_mcast_work_obj_delete(p_wobj);
460			continue;
461		}
462
463		if (port_num > array_size) {
464			uint64_t node_guid_ho =
465			    cl_ntoh64(osm_node_get_node_guid(p_sw->p_node));
466			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A04: "
467				"Error routing MLID 0x%X through switch 0x%"
468				PRIx64 "\n"
469				"\t\t\t\tNo multicast paths from this switch to port "
470				"with LID %u\n", mlid_ho, node_guid_ho,
471				cl_ntoh16(osm_port_get_base_lid
472					  (p_wobj->p_port)));
473
474			__osm_mcast_work_obj_delete(p_wobj);
475
476			/* This is means OpenSM has a bug. */
477			CL_ASSERT(FALSE);
478			continue;
479		}
480
481		cl_qlist_insert_tail(&list_array[port_num], &p_wobj->list_item);
482	}
483
484	OSM_LOG_EXIT(sm->p_log);
485}
486
487/**********************************************************************
488 **********************************************************************/
489static void __osm_mcast_mgr_purge_list(osm_sm_t * sm, cl_qlist_t * const p_list)
490{
491	osm_mcast_work_obj_t *p_wobj;
492
493	OSM_LOG_ENTER(sm->p_log);
494
495	while ((p_wobj = (osm_mcast_work_obj_t *) cl_qlist_remove_head(p_list))
496	       != (osm_mcast_work_obj_t *) cl_qlist_end(p_list)) {
497		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A06: "
498			"Unable to route for port 0x%" PRIx64 "\n",
499			osm_port_get_guid(p_wobj->p_port));
500		__osm_mcast_work_obj_delete(p_wobj);
501	}
502
503	OSM_LOG_EXIT(sm->p_log);
504}
505
506/**********************************************************************
507  This is the recursive function to compute the paths in the spanning
508  tree that emanate from this switch.  On input, the p_list contains
509  the group members that must be routed from this switch.
510
511  The function returns the newly created mtree node element.
512**********************************************************************/
static osm_mtree_node_t *__osm_mcast_mgr_branch(osm_sm_t * sm,
						osm_mgrp_t * const p_mgrp,
						osm_switch_t * const p_sw,
						cl_qlist_t * const p_list,
						uint8_t depth,
						uint8_t const upstream_port,
						uint8_t * const p_max_depth)
{
	uint8_t max_children;
	osm_mtree_node_t *p_mtn = NULL;
	cl_qlist_t *list_array = NULL;
	uint8_t i;
	ib_net64_t node_guid;
	uint64_t node_guid_ho;
	osm_mcast_work_obj_t *p_wobj;
	cl_qlist_t *p_port_list;
	size_t count;
	uint16_t mlid_ho;
	osm_mcast_tbl_t *p_tbl;

	OSM_LOG_ENTER(sm->p_log);

	CL_ASSERT(p_sw);
	CL_ASSERT(p_list);
	CL_ASSERT(p_max_depth);

	node_guid = osm_node_get_node_guid(p_sw->p_node);
	node_guid_ho = cl_ntoh64(node_guid);
	mlid_ho = cl_ntoh16(osm_mgrp_get_mlid(p_mgrp));

	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
		"Routing MLID 0x%X through switch 0x%" PRIx64
		", %u nodes at depth %u\n",
		mlid_ho, node_guid_ho, cl_qlist_count(p_list), depth);

	CL_ASSERT(cl_qlist_count(p_list) > 0);

	depth++;

	/* A subnet's diameter is spec-limited to 64 hops; anything
	   deeper indicates a routing loop, so abandon the branch. */
	if (depth >= 64) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
			"Maximal hops number is reached for MLID 0x%x."
			" Break processing.", mlid_ho);
		__osm_mcast_mgr_purge_list(sm, p_list);
		goto Exit;
	}

	if (depth > *p_max_depth) {
		CL_ASSERT(depth == *p_max_depth + 1);
		*p_max_depth = depth;
	}

	if (osm_switch_supports_mcast(p_sw) == FALSE) {
		/*
		   This switch doesn't do multicast.  Clean-up.
		 */
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A14: "
			"Switch 0x%" PRIx64 " does not support multicast\n",
			node_guid_ho);

		/*
		   Deallocate all the work objects on this branch of the tree.
		 */
		__osm_mcast_mgr_purge_list(sm, p_list);
		goto Exit;
	}

	p_mtn = osm_mtree_node_new(p_sw);
	if (p_mtn == NULL) {
		/*
		   We are unable to continue routing down this
		   leg of the tree.  Clean-up.
		 */
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A15: "
			"Insufficient memory to build multicast tree\n");

		/*
		   Deallocate all the work objects on this branch of the tree.
		 */
		__osm_mcast_mgr_purge_list(sm, p_list);
		goto Exit;
	}

	max_children = osm_mtree_node_get_max_children(p_mtn);

	CL_ASSERT(max_children > 1);

	/*
	   Prepare an empty list for each port in the switch.
	   TO DO - this list array could probably be moved
	   inside the switch element to save on malloc thrashing.
	 */
	list_array = malloc(sizeof(cl_qlist_t) * max_children);
	if (list_array == NULL) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A16: "
			"Unable to allocate list array\n");
		__osm_mcast_mgr_purge_list(sm, p_list);
		goto Exit;
	}

	memset(list_array, 0, sizeof(cl_qlist_t) * max_children);

	for (i = 0; i < max_children; i++)
		cl_qlist_init(&list_array[i]);

	/* Partition the members by egress port; ownership of the work
	   objects transfers from p_list into list_array. */
	__osm_mcast_mgr_subdivide(sm, p_mgrp, p_sw, p_list, list_array,
				  max_children);

	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);

	/*
	   Add the upstream port to the forwarding table unless
	   we're at the root of the spanning tree.
	 */
	if (depth > 1) {
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Adding upstream port %u\n", upstream_port);

		CL_ASSERT(upstream_port);
		osm_mcast_tbl_set(p_tbl, mlid_ho, upstream_port);
	}

	/*
	   For each port that was allocated some routes,
	   recurse into this function to continue building the tree
	   if the node on the other end of that port is another switch.
	   Otherwise, the node is an endpoint, and we've found a leaf
	   of the tree.  Mark leaves with our special pointer value.
	 */

	for (i = 0; i < max_children; i++) {
		const osm_physp_t *p_physp;
		const osm_physp_t *p_remote_physp;
		osm_node_t *p_node;
		const osm_node_t *p_remote_node;

		p_port_list = &list_array[i];

		count = cl_qlist_count(p_port_list);

		/*
		   There should be no children routed through the upstream port!
		 */
		CL_ASSERT((upstream_port == 0) || (i != upstream_port) ||
			  ((i == upstream_port) && (count == 0)));

		if (count == 0)
			continue;	/* No routes down this port. */

		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Routing %zu destinations via switch port %u\n",
			count, i);

		/*
		   This port routes frames for this mcast group.  Therefore,
		   set the appropriate bit in the multicast forwarding
		   table for this switch.
		 */
		osm_mcast_tbl_set(p_tbl, mlid_ho, i);
		if (i == 0) {
			/* This means we are adding the switch to the MC group.
			   We do not need to continue looking at the remote port, just
			   needed to add the port to the table */
			CL_ASSERT(count == 1);

			p_wobj = (osm_mcast_work_obj_t *)
			    cl_qlist_remove_head(p_port_list);
			__osm_mcast_work_obj_delete(p_wobj);
			continue;
		}

		p_node = p_sw->p_node;
		p_remote_node = osm_node_get_remote_node(p_node, i, NULL);
		if (!p_remote_node)
			continue;

		if (osm_node_get_type(p_remote_node) == IB_NODE_TYPE_SWITCH) {
			/*
			   Acquire a pointer to the remote switch then recurse.
			 */
			CL_ASSERT(p_remote_node->sw);

			p_physp = osm_node_get_physp_ptr(p_node, i);
			CL_ASSERT(p_physp);

			p_remote_physp = osm_physp_get_remote(p_physp);
			CL_ASSERT(p_remote_physp);

			/* NULL child here means the sub-branch was
			   abandoned (logged inside the recursive call). */
			p_mtn->child_array[i] =
			    __osm_mcast_mgr_branch(sm, p_mgrp,
						   p_remote_node->sw,
						   p_port_list, depth,
						   osm_physp_get_port_num
						   (p_remote_physp),
						   p_max_depth);
		} else {
			/*
			   The neighbor node is not a switch, so this
			   must be a leaf.
			 */
			CL_ASSERT(count == 1);

			p_mtn->child_array[i] = OSM_MTREE_LEAF;
			p_wobj = (osm_mcast_work_obj_t *)
			    cl_qlist_remove_head(p_port_list);

			CL_ASSERT(cl_is_qlist_empty(p_port_list));

			OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
				"Found leaf for port 0x%016" PRIx64
				" on switch port %u\n",
				cl_ntoh64(osm_port_get_guid(p_wobj->p_port)),
				i);

			__osm_mcast_work_obj_delete(p_wobj);
		}
	}

	free(list_array);
Exit:
	OSM_LOG_EXIT(sm->p_log);
	return (p_mtn);
}
736
737/**********************************************************************
738 **********************************************************************/
static ib_api_status_t
__osm_mcast_mgr_build_spanning_tree(osm_sm_t * sm, osm_mgrp_t * const p_mgrp)
{
	const cl_qmap_t *p_mcm_tbl;
	const osm_port_t *p_port;
	const osm_mcm_port_t *p_mcm_port;
	uint32_t num_ports;
	cl_qlist_t port_list;
	osm_switch_t *p_sw;
	osm_mcast_work_obj_t *p_wobj;
	ib_api_status_t status = IB_SUCCESS;
	uint8_t max_depth = 0;
	uint32_t count;

	OSM_LOG_ENTER(sm->p_log);

	cl_qlist_init(&port_list);

	/*
	   TO DO - for now, just blow away the old tree.
	   In the future we'll need to construct the tree based
	   on multicast forwarding table information if the user wants to
	   preserve existing multicast routes.
	 */
	__osm_mcast_mgr_purge_tree(sm, p_mgrp);

	/* An empty group is not an error - there is simply no tree. */
	p_mcm_tbl = &p_mgrp->mcm_port_tbl;
	num_ports = cl_qmap_count(p_mcm_tbl);
	if (num_ports == 0) {
		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
			"MLID 0x%X has no members - nothing to do\n",
			cl_ntoh16(osm_mgrp_get_mlid(p_mgrp)));
		goto Exit;
	}

	/*
	   This function builds the single spanning tree recursively.
	   At each stage, the ports to be reached are divided into
	   non-overlapping subsets of member ports that can be reached through
	   a given switch port.  Construction then moves down each
	   branch, and the process starts again with each branch computing
	   for its own subset of the member ports.

	   The maximum recursion depth is at worst the maximum hop count in the
	   subnet, which is spec limited to 64.
	 */

	/*
	   Locate the switch around which to create the spanning
	   tree for this multicast group.
	 */
	p_sw = __osm_mcast_mgr_find_root_switch(sm, p_mgrp);
	if (p_sw == NULL) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A08: "
			"Unable to locate a suitable switch for group 0x%X\n",
			cl_ntoh16(osm_mgrp_get_mlid(p_mgrp)));
		status = IB_ERROR;
		goto Exit;
	}

	/*
	   Build the first "subset" containing all member ports.
	 */
	for (p_mcm_port = (osm_mcm_port_t *) cl_qmap_head(p_mcm_tbl);
	     p_mcm_port != (osm_mcm_port_t *) cl_qmap_end(p_mcm_tbl);
	     p_mcm_port =
	     (osm_mcm_port_t *) cl_qmap_next(&p_mcm_port->map_item)) {
		/*
		   Acquire the port object for this port guid, then create
		   the new worker object to build the list.
		 */
		p_port = osm_get_port_by_guid(sm->p_subn,
					      ib_gid_get_guid(&p_mcm_port->
							      port_gid));
		if (!p_port) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A09: "
				"No port object for port 0x%016" PRIx64 "\n",
				cl_ntoh64(ib_gid_get_guid
					  (&p_mcm_port->port_gid)));
			continue;
		}

		p_wobj = __osm_mcast_work_obj_new(p_port);
		if (p_wobj == NULL) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A10: "
				"Insufficient memory to route port 0x%016"
				PRIx64 "\n",
				cl_ntoh64(osm_port_get_guid(p_port)));
			continue;
		}

		cl_qlist_insert_tail(&port_list, &p_wobj->list_item);
	}

	/* __osm_mcast_mgr_branch consumes port_list; it is empty on
	   return and all work objects have been freed. */
	count = cl_qlist_count(&port_list);
	p_mgrp->p_root = __osm_mcast_mgr_branch(sm, p_mgrp, p_sw,
						&port_list, 0, 0, &max_depth);

	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
		"Configured MLID 0x%X for %u ports, max tree depth = %u\n",
		cl_ntoh16(osm_mgrp_get_mlid(p_mgrp)), count, max_depth);

Exit:
	OSM_LOG_EXIT(sm->p_log);
	return (status);
}
845
#if 0
/* unused */
/**********************************************************************
 Compiled out: copies one mtree node's child bitmap into the switch's
 multicast table.  Kept for reference only.
 **********************************************************************/
void
osm_mcast_mgr_set_table(osm_sm_t * sm,
			IN const osm_mgrp_t * const p_mgrp,
			IN const osm_mtree_node_t * const p_mtn)
{
	uint8_t i;
	uint8_t max_children;
	osm_mtree_node_t *p_child_mtn;
	uint16_t mlid_ho;
	osm_mcast_tbl_t *p_tbl;
	osm_switch_t *p_sw;

	OSM_LOG_ENTER(sm->p_log);

	mlid_ho = cl_ntoh16(osm_mgrp_get_mlid(p_mgrp));
	p_sw = osm_mtree_node_get_switch_ptr(p_mtn);

	CL_ASSERT(p_sw);

	/* NOTE(review): guid is logged in network byte order here; the
	   rest of the file wraps it in cl_ntoh64() - fix if re-enabled. */
	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
		"Configuring MLID 0x%X on switch 0x%" PRIx64 "\n",
		mlid_ho, osm_node_get_node_guid(p_sw->p_node));

	/*
	   For every child of this tree node, set the corresponding
	   bit in the switch's mcast table.
	 */
	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
	max_children = osm_mtree_node_get_max_children(p_mtn);

	CL_ASSERT(max_children <= osm_switch_get_num_ports(p_sw));

	osm_mcast_tbl_clear_mlid(p_tbl, mlid_ho);

	for (i = 0; i < max_children; i++) {
		p_child_mtn = osm_mtree_node_get_child(p_mtn, i);
		if (p_child_mtn == NULL)
			continue;

		osm_mcast_tbl_set(p_tbl, mlid_ho, i);
	}

	OSM_LOG_EXIT(sm->p_log);
}
#endif
895
896/**********************************************************************
897 **********************************************************************/
898static void __osm_mcast_mgr_clear(osm_sm_t * sm, IN osm_mgrp_t * const p_mgrp)
899{
900	osm_switch_t *p_sw;
901	cl_qmap_t *p_sw_tbl;
902	osm_mcast_tbl_t *p_mcast_tbl;
903
904	OSM_LOG_ENTER(sm->p_log);
905
906	/*
907	   Walk the switches and clear the routing entries for
908	   this MLID.
909	 */
910	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
911	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
912	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
913		p_mcast_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
914		osm_mcast_tbl_clear_mlid(p_mcast_tbl, cl_ntoh16(p_mgrp->mlid));
915		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
916	}
917
918	OSM_LOG_EXIT(sm->p_log);
919}
920
#if 0
/* TO DO - make this real -- at least update spanning tree */
/**********************************************************************
   Compiled out: incremental single-port join that only works when the
   attached switch is already in the MC tree.  Lock must be held on
   entry.
**********************************************************************/
ib_api_status_t
osm_mcast_mgr_process_single(osm_sm_t * sm,
			     IN ib_net16_t const mlid,
			     IN ib_net64_t const port_guid,
			     IN uint8_t const join_state)
{
	uint8_t port_num;
	uint16_t mlid_ho;
	ib_net64_t sw_guid;
	osm_port_t *p_port;
	osm_physp_t *p_physp;
	osm_physp_t *p_remote_physp;
	osm_node_t *p_remote_node;
	osm_mcast_tbl_t *p_mcast_tbl;
	ib_api_status_t status = IB_SUCCESS;

	OSM_LOG_ENTER(sm->p_log);

	CL_ASSERT(mlid);
	CL_ASSERT(port_guid);

	mlid_ho = cl_ntoh16(mlid);

	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
		"Attempting to add port 0x%" PRIx64 " to MLID 0x%X, "
		"\n\t\t\t\tjoin state = 0x%X\n",
		cl_ntoh64(port_guid), mlid_ho, join_state);

	/*
	   Acquire the Port object.
	 */
	p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
	if (!p_port) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A01: "
			"Unable to acquire port object for 0x%" PRIx64 "\n",
			cl_ntoh64(port_guid));
		status = IB_ERROR;
		goto Exit;
	}

	/* NOTE(review): "phsyical" typos in the two error strings below
	   are runtime text; left as-is in this disabled code. */
	p_physp = p_port->p_physp;
	if (p_physp == NULL) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A05: "
			"Unable to acquire phsyical port object for 0x%" PRIx64
			"\n", cl_ntoh64(port_guid));
		status = IB_ERROR;
		goto Exit;
	}

	p_remote_physp = osm_physp_get_remote(p_physp);
	if (p_remote_physp == NULL) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A11: "
			"Unable to acquire remote phsyical port object "
			"for 0x%" PRIx64 "\n", cl_ntoh64(port_guid));
		status = IB_ERROR;
		goto Exit;
	}

	p_remote_node = osm_physp_get_node_ptr(p_remote_physp);

	CL_ASSERT(p_remote_node);

	sw_guid = osm_node_get_node_guid(p_remote_node);

	/* The incremental path only handles ports attached to a switch. */
	if (osm_node_get_type(p_remote_node) != IB_NODE_TYPE_SWITCH) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A22: "
			"Remote node not a switch node 0x%" PRIx64 "\n",
			cl_ntoh64(sw_guid));
		status = IB_ERROR;
		goto Exit;
	}

	if (!p_remote_node->sw) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A12: "
			"No switch object 0x%" PRIx64 "\n", cl_ntoh64(sw_guid));
		status = IB_ERROR;
		goto Exit;
	}

	if (osm_switch_is_in_mcast_tree(p_remote_node->sw, mlid_ho)) {
		/*
		   We're in luck. The switch attached to this port
		   is already in the multicast group, so we can just
		   add the specified port as a new leaf of the tree.
		 */
		if (join_state & (IB_JOIN_STATE_FULL | IB_JOIN_STATE_NON)) {
			/*
			   This node wants to receive multicast frames.
			   Get the switch port number to which the new member port
			   is attached, then configure this single mcast table.
			 */
			port_num = osm_physp_get_port_num(p_remote_physp);
			CL_ASSERT(port_num);

			p_mcast_tbl =
			    osm_switch_get_mcast_tbl_ptr(p_remote_node->sw);
			osm_mcast_tbl_set(p_mcast_tbl, mlid_ho, port_num);
		} else {
			if (join_state & IB_JOIN_STATE_SEND_ONLY)
				OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
					"Success.  Nothing to do for send"
					"only member\n");
			else {
				OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A13: "
					"Unknown join state 0x%X\n",
					join_state);
				status = IB_ERROR;
				goto Exit;
			}
		}
	} else
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Unable to add port\n");

Exit:
	OSM_LOG_EXIT(sm->p_log);
	return (status);
}
#endif
1044
1045/**********************************************************************
1046   lock must already be held on entry
1047**********************************************************************/
1048static ib_api_status_t
1049osm_mcast_mgr_process_tree(osm_sm_t * sm,
1050			   IN osm_mgrp_t * const p_mgrp,
1051			   IN osm_mcast_req_type_t req_type,
1052			   ib_net64_t port_guid)
1053{
1054	ib_api_status_t status = IB_SUCCESS;
1055	ib_net16_t mlid;
1056
1057	OSM_LOG_ENTER(sm->p_log);
1058
1059	mlid = osm_mgrp_get_mlid(p_mgrp);
1060
1061	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1062		"Processing multicast group 0x%X\n", cl_ntoh16(mlid));
1063
1064	/*
1065	   If there are no switches in the subnet, then we have nothing to do.
1066	 */
1067	if (cl_qmap_count(&sm->p_subn->sw_guid_tbl) == 0) {
1068		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1069			"No switches in subnet. Nothing to do\n");
1070		goto Exit;
1071	}
1072
1073	/*
1074	   Clear the multicast tables to start clean, then build
1075	   the spanning tree which sets the mcast table bits for each
1076	   port in the group.
1077	 */
1078	__osm_mcast_mgr_clear(sm, p_mgrp);
1079
1080	if (!p_mgrp->full_members)
1081		goto Exit;
1082
1083	status = __osm_mcast_mgr_build_spanning_tree(sm, p_mgrp);
1084	if (status != IB_SUCCESS) {
1085		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A17: "
1086			"Unable to create spanning tree (%s)\n",
1087			ib_get_err_str(status));
1088		goto Exit;
1089	}
1090
1091Exit:
1092	OSM_LOG_EXIT(sm->p_log);
1093	return (status);
1094}
1095
1096/**********************************************************************
1097 Process the entire group.
1098 NOTE : The lock should be held externally!
1099 **********************************************************************/
1100static ib_api_status_t
1101mcast_mgr_process_mgrp(osm_sm_t * sm,
1102		       IN osm_mgrp_t * const p_mgrp,
1103		       IN osm_mcast_req_type_t req_type,
1104		       IN ib_net64_t port_guid)
1105{
1106	ib_api_status_t status;
1107
1108	OSM_LOG_ENTER(sm->p_log);
1109
1110	status = osm_mcast_mgr_process_tree(sm, p_mgrp, req_type, port_guid);
1111	if (status != IB_SUCCESS) {
1112		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A19: "
1113			"Unable to create spanning tree (%s)\n",
1114			ib_get_err_str(status));
1115		goto Exit;
1116	}
1117	p_mgrp->last_tree_id = p_mgrp->last_change_id;
1118
1119	/* remove MCGRP if it is marked for deletion */
1120	if (p_mgrp->to_be_deleted) {
1121		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1122			"Destroying mgrp with lid:0x%x\n",
1123			cl_ntoh16(p_mgrp->mlid));
1124		sm->p_subn->mgroups[cl_ntoh16(p_mgrp->mlid) - IB_LID_MCAST_START_HO] = NULL;
1125		osm_mgrp_delete(p_mgrp);
1126	}
1127
1128Exit:
1129	OSM_LOG_EXIT(sm->p_log);
1130	return status;
1131}
1132
1133/**********************************************************************
1134 **********************************************************************/
1135osm_signal_t osm_mcast_mgr_process(osm_sm_t * sm)
1136{
1137	osm_signal_t signal;
1138	osm_switch_t *p_sw;
1139	cl_qmap_t *p_sw_tbl;
1140	cl_qlist_t *p_list = &sm->mgrp_list;
1141	osm_mgrp_t *p_mgrp;
1142	boolean_t pending_transactions = FALSE;
1143	int i;
1144
1145	OSM_LOG_ENTER(sm->p_log);
1146
1147	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
1148	/*
1149	   While holding the lock, iterate over all the established
1150	   multicast groups, servicing each in turn.
1151
1152	   Then, download the multicast tables to the switches.
1153	 */
1154	CL_PLOCK_EXCL_ACQUIRE(sm->p_lock);
1155
1156	for (i = 0; i <= sm->p_subn->max_mcast_lid_ho - IB_LID_MCAST_START_HO;
1157	     i++) {
1158		/*
1159		   We reached here due to some change that caused a heavy sweep
1160		   of the subnet. Not due to a specific multicast request.
1161		   So the request type is subnet_change and the port guid is 0.
1162		 */
1163		p_mgrp = sm->p_subn->mgroups[i];
1164		if (p_mgrp)
1165			mcast_mgr_process_mgrp(sm, p_mgrp,
1166					       OSM_MCAST_REQ_TYPE_SUBNET_CHANGE,
1167					       0);
1168	}
1169
1170	/*
1171	   Walk the switches and download the tables for each.
1172	 */
1173	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
1174	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
1175		signal = __osm_mcast_mgr_set_tbl(sm, p_sw);
1176		if (signal == OSM_SIGNAL_DONE_PENDING)
1177			pending_transactions = TRUE;
1178		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
1179	}
1180
1181	while (!cl_is_qlist_empty(p_list)) {
1182		cl_list_item_t *p = cl_qlist_remove_head(p_list);
1183		free(p);
1184	}
1185
1186	CL_PLOCK_RELEASE(sm->p_lock);
1187
1188	OSM_LOG_EXIT(sm->p_log);
1189
1190	if (pending_transactions == TRUE)
1191		return (OSM_SIGNAL_DONE_PENDING);
1192	else
1193		return (OSM_SIGNAL_DONE);
1194}
1195
1196/**********************************************************************
1197  This is the function that is invoked during idle time to handle the
1198  process request for mcast groups where join/leave/delete was required.
1199 **********************************************************************/
osm_signal_t osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
{
	cl_qlist_t *p_list = &sm->mgrp_list;
	osm_switch_t *p_sw;
	cl_qmap_t *p_sw_tbl;
	osm_mgrp_t *p_mgrp;
	ib_net16_t mlid;
	osm_signal_t ret, signal = OSM_SIGNAL_DONE;
	osm_mcast_mgr_ctxt_t *ctx;
	osm_mcast_req_type_t req_type;
	ib_net64_t port_guid;

	OSM_LOG_ENTER(sm->p_log);

	/* Hold the exclusive lock for the whole pass so p_mgrp objects
	   cannot be changed (or deleted) by other paths meanwhile. */
	CL_PLOCK_EXCL_ACQUIRE(sm->p_lock);

	/* Drain every queued join/leave/delete request context. */
	while (!cl_is_qlist_empty(p_list)) {
		ctx = (osm_mcast_mgr_ctxt_t *) cl_qlist_remove_head(p_list);
		req_type = ctx->req_type;
		port_guid = ctx->port_guid;

		/* memcpy avoids a compiler warning in case the context
		   stores the mlid in a differently-sized field */
		memcpy(&mlid, &ctx->mlid, sizeof(mlid));

		/* all needed fields are copied out - the context can go */
		free(ctx);

		/* since we delayed the execution we prefer to pass the
		   mlid as the mgrp identifier and then find it or abort;
		   the group may have vanished since the request was queued */
		p_mgrp = osm_get_mgrp_by_mlid(sm->p_subn, mlid);
		if (!p_mgrp)
			continue;

		/* if there was no change from the last time
		 * we processed the group we can skip doing anything
		 */
		if (p_mgrp->last_change_id == p_mgrp->last_tree_id) {
			OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
				"Skip processing mgrp with lid:0x%X change id:%u\n",
				cl_ntoh16(mlid), p_mgrp->last_change_id);
			continue;
		}

		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Processing mgrp with lid:0x%X change id:%u\n",
			cl_ntoh16(mlid), p_mgrp->last_change_id);
		/* rebuilds the group's spanning tree; may also delete the
		   group if it was marked to_be_deleted */
		mcast_mgr_process_mgrp(sm, p_mgrp, req_type, port_guid);
	}

	/*
	   Walk the switches and download the tables for each.
	   Report DONE_PENDING if any switch still has MADs outstanding.
	 */
	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
		ret = __osm_mcast_mgr_set_tbl(sm, p_sw);
		if (ret == OSM_SIGNAL_DONE_PENDING)
			signal = ret;
		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
	}

	osm_dump_mcast_routes(sm->p_subn->p_osm);

	CL_PLOCK_RELEASE(sm->p_lock);
	OSM_LOG_EXIT(sm->p_log);
	return signal;
}
1268