1219820Sjeff/*
2219820Sjeff * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3219820Sjeff * Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights reserved.
4219820Sjeff * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5219820Sjeff * Copyright (c) 2008 Xsigo Systems Inc.  All rights reserved.
6219820Sjeff *
7219820Sjeff * This software is available to you under a choice of one of two
8219820Sjeff * licenses.  You may choose to be licensed under the terms of the GNU
9219820Sjeff * General Public License (GPL) Version 2, available from the file
10219820Sjeff * COPYING in the main directory of this source tree, or the
11219820Sjeff * OpenIB.org BSD license below:
12219820Sjeff *
13219820Sjeff *     Redistribution and use in source and binary forms, with or
14219820Sjeff *     without modification, are permitted provided that the following
15219820Sjeff *     conditions are met:
16219820Sjeff *
17219820Sjeff *      - Redistributions of source code must retain the above
18219820Sjeff *        copyright notice, this list of conditions and the following
19219820Sjeff *        disclaimer.
20219820Sjeff *
21219820Sjeff *      - Redistributions in binary form must reproduce the above
22219820Sjeff *        copyright notice, this list of conditions and the following
23219820Sjeff *        disclaimer in the documentation and/or other materials
24219820Sjeff *        provided with the distribution.
25219820Sjeff *
26219820Sjeff * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27219820Sjeff * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28219820Sjeff * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29219820Sjeff * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
30219820Sjeff * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31219820Sjeff * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32219820Sjeff * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33219820Sjeff * SOFTWARE.
34219820Sjeff *
35219820Sjeff */
36219820Sjeff
37219820Sjeff/*
38219820Sjeff * Abstract:
39219820Sjeff *    Implementation of osm_mcast_mgr_t.
40219820Sjeff * This file implements the Multicast Manager object.
41219820Sjeff */
42219820Sjeff
43219820Sjeff#if HAVE_CONFIG_H
44219820Sjeff#  include <config.h>
45219820Sjeff#endif				/* HAVE_CONFIG_H */
46219820Sjeff
47219820Sjeff#include <stdlib.h>
48219820Sjeff#include <string.h>
49219820Sjeff#include <iba/ib_types.h>
50219820Sjeff#include <complib/cl_debug.h>
51219820Sjeff#include <opensm/osm_opensm.h>
52219820Sjeff#include <opensm/osm_sm.h>
53219820Sjeff#include <opensm/osm_multicast.h>
54219820Sjeff#include <opensm/osm_node.h>
55219820Sjeff#include <opensm/osm_switch.h>
56219820Sjeff#include <opensm/osm_helper.h>
57219820Sjeff#include <opensm/osm_msgdef.h>
58219820Sjeff
59219820Sjeff/**********************************************************************
60219820Sjeff **********************************************************************/
61219820Sjefftypedef struct osm_mcast_work_obj {
62219820Sjeff	cl_list_item_t list_item;
63219820Sjeff	osm_port_t *p_port;
64219820Sjeff} osm_mcast_work_obj_t;
65219820Sjeff
66219820Sjeff/**********************************************************************
67219820Sjeff **********************************************************************/
68219820Sjeffstatic osm_mcast_work_obj_t *__osm_mcast_work_obj_new(IN const osm_port_t *
69219820Sjeff						      const p_port)
70219820Sjeff{
71219820Sjeff	/*
72219820Sjeff	   TO DO - get these objects from a lockpool.
73219820Sjeff	 */
74219820Sjeff	osm_mcast_work_obj_t *p_obj;
75219820Sjeff
76219820Sjeff	/*
77219820Sjeff	   clean allocated memory to avoid assertion when trying to insert to
78219820Sjeff	   qlist.
79219820Sjeff	   see cl_qlist_insert_tail(): CL_ASSERT(p_list_item->p_list != p_list)
80219820Sjeff	 */
81219820Sjeff	p_obj = malloc(sizeof(*p_obj));
82219820Sjeff	if (p_obj) {
83219820Sjeff		memset(p_obj, 0, sizeof(*p_obj));
84219820Sjeff		p_obj->p_port = (osm_port_t *) p_port;
85219820Sjeff	}
86219820Sjeff
87219820Sjeff	return (p_obj);
88219820Sjeff}
89219820Sjeff
90219820Sjeff/**********************************************************************
91219820Sjeff **********************************************************************/
92219820Sjeffstatic void __osm_mcast_work_obj_delete(IN osm_mcast_work_obj_t * p_wobj)
93219820Sjeff{
94219820Sjeff	free(p_wobj);
95219820Sjeff}
96219820Sjeff
97219820Sjeff/**********************************************************************
98219820Sjeff Recursively remove nodes from the tree
99219820Sjeff *********************************************************************/
100219820Sjeffstatic void __osm_mcast_mgr_purge_tree_node(IN osm_mtree_node_t * p_mtn)
101219820Sjeff{
102219820Sjeff	uint8_t i;
103219820Sjeff
104219820Sjeff	for (i = 0; i < p_mtn->max_children; i++) {
105219820Sjeff		if (p_mtn->child_array[i] &&
106219820Sjeff		    (p_mtn->child_array[i] != OSM_MTREE_LEAF))
107219820Sjeff			__osm_mcast_mgr_purge_tree_node(p_mtn->child_array[i]);
108219820Sjeff
109219820Sjeff		p_mtn->child_array[i] = NULL;
110219820Sjeff
111219820Sjeff	}
112219820Sjeff
113219820Sjeff	free(p_mtn);
114219820Sjeff}
115219820Sjeff
116219820Sjeff/**********************************************************************
117219820Sjeff **********************************************************************/
118219820Sjeffstatic void
119219820Sjeff__osm_mcast_mgr_purge_tree(osm_sm_t * sm, IN osm_mgrp_t * const p_mgrp)
120219820Sjeff{
121219820Sjeff	OSM_LOG_ENTER(sm->p_log);
122219820Sjeff
123219820Sjeff	if (p_mgrp->p_root)
124219820Sjeff		__osm_mcast_mgr_purge_tree_node(p_mgrp->p_root);
125219820Sjeff
126219820Sjeff	p_mgrp->p_root = NULL;
127219820Sjeff
128219820Sjeff	OSM_LOG_EXIT(sm->p_log);
129219820Sjeff}
130219820Sjeff
131219820Sjeff/**********************************************************************
132219820Sjeff **********************************************************************/
133219820Sjeffstatic float
134219820Sjeffosm_mcast_mgr_compute_avg_hops(osm_sm_t * sm,
135219820Sjeff			       const osm_mgrp_t * const p_mgrp,
136219820Sjeff			       const osm_switch_t * const p_sw)
137219820Sjeff{
138219820Sjeff	float avg_hops = 0;
139219820Sjeff	uint32_t hops = 0;
140219820Sjeff	uint32_t num_ports = 0;
141219820Sjeff	const osm_port_t *p_port;
142219820Sjeff	const osm_mcm_port_t *p_mcm_port;
143219820Sjeff	const cl_qmap_t *p_mcm_tbl;
144219820Sjeff
145219820Sjeff	OSM_LOG_ENTER(sm->p_log);
146219820Sjeff
147219820Sjeff	p_mcm_tbl = &p_mgrp->mcm_port_tbl;
148219820Sjeff
149219820Sjeff	/*
150219820Sjeff	   For each member of the multicast group, compute the
151219820Sjeff	   number of hops to its base LID.
152219820Sjeff	 */
153219820Sjeff	for (p_mcm_port = (osm_mcm_port_t *) cl_qmap_head(p_mcm_tbl);
154219820Sjeff	     p_mcm_port != (osm_mcm_port_t *) cl_qmap_end(p_mcm_tbl);
155219820Sjeff	     p_mcm_port =
156219820Sjeff	     (osm_mcm_port_t *) cl_qmap_next(&p_mcm_port->map_item)) {
157219820Sjeff		/*
158219820Sjeff		   Acquire the port object for this port guid, then create
159219820Sjeff		   the new worker object to build the list.
160219820Sjeff		 */
161219820Sjeff		p_port = osm_get_port_by_guid(sm->p_subn,
162219820Sjeff					      ib_gid_get_guid(&p_mcm_port->
163219820Sjeff							      port_gid));
164219820Sjeff
165219820Sjeff		if (!p_port) {
166219820Sjeff			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A18: "
167219820Sjeff				"No port object for port 0x%016" PRIx64 "\n",
168219820Sjeff				cl_ntoh64(ib_gid_get_guid
169219820Sjeff					  (&p_mcm_port->port_gid)));
170219820Sjeff			continue;
171219820Sjeff		}
172219820Sjeff
173219820Sjeff		hops += osm_switch_get_port_least_hops(p_sw, p_port);
174219820Sjeff		num_ports++;
175219820Sjeff	}
176219820Sjeff
177219820Sjeff	/*
178219820Sjeff	   We should be here if there aren't any ports in the group.
179219820Sjeff	 */
180219820Sjeff	CL_ASSERT(num_ports);
181219820Sjeff
182219820Sjeff	if (num_ports != 0)
183219820Sjeff		avg_hops = (float)(hops / num_ports);
184219820Sjeff
185219820Sjeff	OSM_LOG_EXIT(sm->p_log);
186219820Sjeff	return (avg_hops);
187219820Sjeff}
188219820Sjeff
189219820Sjeff/**********************************************************************
190219820Sjeff Calculate the maximal "min hops" from the given switch to any
191219820Sjeff of the group HCAs
192219820Sjeff **********************************************************************/
193219820Sjeffstatic float
194219820Sjeffosm_mcast_mgr_compute_max_hops(osm_sm_t * sm,
195219820Sjeff			       const osm_mgrp_t * const p_mgrp,
196219820Sjeff			       const osm_switch_t * const p_sw)
197219820Sjeff{
198219820Sjeff	uint32_t max_hops = 0;
199219820Sjeff	uint32_t hops = 0;
200219820Sjeff	const osm_port_t *p_port;
201219820Sjeff	const osm_mcm_port_t *p_mcm_port;
202219820Sjeff	const cl_qmap_t *p_mcm_tbl;
203219820Sjeff
204219820Sjeff	OSM_LOG_ENTER(sm->p_log);
205219820Sjeff
206219820Sjeff	p_mcm_tbl = &p_mgrp->mcm_port_tbl;
207219820Sjeff
208219820Sjeff	/*
209219820Sjeff	   For each member of the multicast group, compute the
210219820Sjeff	   number of hops to its base LID.
211219820Sjeff	 */
212219820Sjeff	for (p_mcm_port = (osm_mcm_port_t *) cl_qmap_head(p_mcm_tbl);
213219820Sjeff	     p_mcm_port != (osm_mcm_port_t *) cl_qmap_end(p_mcm_tbl);
214219820Sjeff	     p_mcm_port =
215219820Sjeff	     (osm_mcm_port_t *) cl_qmap_next(&p_mcm_port->map_item)) {
216219820Sjeff		/*
217219820Sjeff		   Acquire the port object for this port guid, then create
218219820Sjeff		   the new worker object to build the list.
219219820Sjeff		 */
220219820Sjeff		p_port = osm_get_port_by_guid(sm->p_subn,
221219820Sjeff					      ib_gid_get_guid(&p_mcm_port->
222219820Sjeff							      port_gid));
223219820Sjeff
224219820Sjeff		if (!p_port) {
225219820Sjeff			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A1A: "
226219820Sjeff				"No port object for port 0x%016" PRIx64 "\n",
227219820Sjeff				cl_ntoh64(ib_gid_get_guid
228219820Sjeff					  (&p_mcm_port->port_gid)));
229219820Sjeff			continue;
230219820Sjeff		}
231219820Sjeff
232219820Sjeff		hops = osm_switch_get_port_least_hops(p_sw, p_port);
233219820Sjeff		if (hops > max_hops)
234219820Sjeff			max_hops = hops;
235219820Sjeff	}
236219820Sjeff
237219820Sjeff	if (max_hops == 0) {
238219820Sjeff		/*
239219820Sjeff		   We should be here if there aren't any ports in the group.
240219820Sjeff		 */
241219820Sjeff		max_hops = 10001;	/* see later - we use it to realize no hops */
242219820Sjeff	}
243219820Sjeff
244219820Sjeff	OSM_LOG_EXIT(sm->p_log);
245219820Sjeff	return (float)(max_hops);
246219820Sjeff}
247219820Sjeff
248219820Sjeff/**********************************************************************
249219820Sjeff   This function attempts to locate the optimal switch for the
250219820Sjeff   center of the spanning tree.  The current algorithm chooses
251219820Sjeff   a switch with the lowest average hop count to the members
252219820Sjeff   of the multicast group.
253219820Sjeff**********************************************************************/
254219820Sjeffstatic osm_switch_t *__osm_mcast_mgr_find_optimal_switch(osm_sm_t * sm,
255219820Sjeff							 const osm_mgrp_t *
256219820Sjeff							 const p_mgrp)
257219820Sjeff{
258219820Sjeff	cl_qmap_t *p_sw_tbl;
259219820Sjeff	const osm_switch_t *p_sw;
260219820Sjeff	const osm_switch_t *p_best_sw = NULL;
261219820Sjeff	float hops = 0;
262219820Sjeff	float best_hops = 10000;	/* any big # will do */
263219820Sjeff#ifdef OSM_VENDOR_INTF_ANAFA
264219820Sjeff	boolean_t use_avg_hops = TRUE;	/* anafa2 - bug hca on switch *//* use max hops for root */
265219820Sjeff#else
266219820Sjeff	boolean_t use_avg_hops = FALSE;	/* use max hops for root */
267219820Sjeff#endif
268219820Sjeff
269219820Sjeff	OSM_LOG_ENTER(sm->p_log);
270219820Sjeff
271219820Sjeff	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
272219820Sjeff
273219820Sjeff	CL_ASSERT(!osm_mgrp_is_empty(p_mgrp));
274219820Sjeff
275219820Sjeff	for (p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
276219820Sjeff	     p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl);
277219820Sjeff	     p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) {
278219820Sjeff		if (!osm_switch_supports_mcast(p_sw))
279219820Sjeff			continue;
280219820Sjeff
281219820Sjeff		if (use_avg_hops)
282219820Sjeff			hops = osm_mcast_mgr_compute_avg_hops(sm, p_mgrp, p_sw);
283219820Sjeff		else
284219820Sjeff			hops = osm_mcast_mgr_compute_max_hops(sm, p_mgrp, p_sw);
285219820Sjeff
286219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
287219820Sjeff			"Switch 0x%016" PRIx64 ", hops = %f\n",
288219820Sjeff			cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)), hops);
289219820Sjeff
290219820Sjeff		if (hops < best_hops) {
291219820Sjeff			p_best_sw = p_sw;
292219820Sjeff			best_hops = hops;
293219820Sjeff		}
294219820Sjeff	}
295219820Sjeff
296219820Sjeff	if (p_best_sw)
297219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
298219820Sjeff			"Best switch is 0x%" PRIx64 ", hops = %f\n",
299219820Sjeff			cl_ntoh64(osm_node_get_node_guid(p_best_sw->p_node)),
300219820Sjeff			best_hops);
301219820Sjeff	else
302219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
303219820Sjeff			"No multicast capable switches detected\n");
304219820Sjeff
305219820Sjeff	OSM_LOG_EXIT(sm->p_log);
306219820Sjeff	return ((osm_switch_t *) p_best_sw);
307219820Sjeff}
308219820Sjeff
309219820Sjeff/**********************************************************************
310219820Sjeff   This function returns the existing or optimal root switch for the tree.
311219820Sjeff**********************************************************************/
312219820Sjeffstatic osm_switch_t *__osm_mcast_mgr_find_root_switch(osm_sm_t * sm,
313219820Sjeff						      const osm_mgrp_t *
314219820Sjeff						      const p_mgrp)
315219820Sjeff{
316219820Sjeff	const osm_switch_t *p_sw = NULL;
317219820Sjeff
318219820Sjeff	OSM_LOG_ENTER(sm->p_log);
319219820Sjeff
320219820Sjeff	/*
321219820Sjeff	   We always look for the best multicast tree root switch.
322219820Sjeff	   Otherwise since we always start with a a single join
323219820Sjeff	   the root will be always on the first switch attached to it.
324219820Sjeff	   - Very bad ...
325219820Sjeff	 */
326219820Sjeff	p_sw = __osm_mcast_mgr_find_optimal_switch(sm, p_mgrp);
327219820Sjeff
328219820Sjeff	OSM_LOG_EXIT(sm->p_log);
329219820Sjeff	return ((osm_switch_t *) p_sw);
330219820Sjeff}
331219820Sjeff
332219820Sjeff/**********************************************************************
333219820Sjeff **********************************************************************/
334219820Sjeffstatic osm_signal_t
335219820Sjeff__osm_mcast_mgr_set_tbl(osm_sm_t * sm, IN osm_switch_t * const p_sw)
336219820Sjeff{
337219820Sjeff	osm_node_t *p_node;
338219820Sjeff	osm_dr_path_t *p_path;
339219820Sjeff	osm_madw_context_t mad_context;
340219820Sjeff	ib_api_status_t status;
341219820Sjeff	uint32_t block_id_ho = 0;
342219820Sjeff	int16_t block_num = 0;
343219820Sjeff	uint32_t position = 0;
344219820Sjeff	uint32_t max_position;
345219820Sjeff	osm_mcast_tbl_t *p_tbl;
346219820Sjeff	ib_net16_t block[IB_MCAST_BLOCK_SIZE];
347219820Sjeff	osm_signal_t signal = OSM_SIGNAL_DONE;
348219820Sjeff
349219820Sjeff	CL_ASSERT(sm);
350219820Sjeff
351219820Sjeff	OSM_LOG_ENTER(sm->p_log);
352219820Sjeff
353219820Sjeff	CL_ASSERT(p_sw);
354219820Sjeff
355219820Sjeff	p_node = p_sw->p_node;
356219820Sjeff
357219820Sjeff	CL_ASSERT(p_node);
358219820Sjeff
359219820Sjeff	p_path = osm_physp_get_dr_path_ptr(osm_node_get_physp_ptr(p_node, 0));
360219820Sjeff
361219820Sjeff	/*
362219820Sjeff	   Send multicast forwarding table blocks to the switch
363219820Sjeff	   as long as the switch indicates it has blocks needing
364219820Sjeff	   configuration.
365219820Sjeff	 */
366219820Sjeff
367219820Sjeff	mad_context.mft_context.node_guid = osm_node_get_node_guid(p_node);
368219820Sjeff	mad_context.mft_context.set_method = TRUE;
369219820Sjeff
370219820Sjeff	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
371219820Sjeff	max_position = p_tbl->max_position;
372219820Sjeff
373219820Sjeff	while (osm_mcast_tbl_get_block(p_tbl, block_num,
374219820Sjeff				       (uint8_t) position, block)) {
375219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
376219820Sjeff			"Writing MFT block 0x%X\n", block_id_ho);
377219820Sjeff
378219820Sjeff		block_id_ho = block_num + (position << 28);
379219820Sjeff
380219820Sjeff		status = osm_req_set(sm, p_path, (void *)block, sizeof(block),
381219820Sjeff				     IB_MAD_ATTR_MCAST_FWD_TBL,
382219820Sjeff				     cl_hton32(block_id_ho),
383219820Sjeff				     CL_DISP_MSGID_NONE, &mad_context);
384219820Sjeff
385219820Sjeff		if (status != IB_SUCCESS) {
386219820Sjeff			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A02: "
387219820Sjeff				"Sending multicast fwd. tbl. block failed (%s)\n",
388219820Sjeff				ib_get_err_str(status));
389219820Sjeff		}
390219820Sjeff
391219820Sjeff		signal = OSM_SIGNAL_DONE_PENDING;
392219820Sjeff
393219820Sjeff		if (++position > max_position) {
394219820Sjeff			position = 0;
395219820Sjeff			block_num++;
396219820Sjeff		}
397219820Sjeff	}
398219820Sjeff
399219820Sjeff	OSM_LOG_EXIT(sm->p_log);
400219820Sjeff	return (signal);
401219820Sjeff}
402219820Sjeff
403219820Sjeff/**********************************************************************
404219820Sjeff  This is part of the recursive function to compute the paths in the
405219820Sjeff  spanning tree that emanate from this switch.  On input, the p_list
406219820Sjeff  contains the group members that must be routed from this switch.
407219820Sjeff**********************************************************************/
408219820Sjeffstatic void
409219820Sjeff__osm_mcast_mgr_subdivide(osm_sm_t * sm,
410219820Sjeff			  osm_mgrp_t * const p_mgrp,
411219820Sjeff			  osm_switch_t * const p_sw,
412219820Sjeff			  cl_qlist_t * const p_list,
413219820Sjeff			  cl_qlist_t * const list_array,
414219820Sjeff			  uint8_t const array_size)
415219820Sjeff{
416219820Sjeff	uint8_t port_num;
417219820Sjeff	uint16_t mlid_ho;
418219820Sjeff	boolean_t ignore_existing;
419219820Sjeff	osm_mcast_work_obj_t *p_wobj;
420219820Sjeff
421219820Sjeff	OSM_LOG_ENTER(sm->p_log);
422219820Sjeff
423219820Sjeff	mlid_ho = cl_ntoh16(osm_mgrp_get_mlid(p_mgrp));
424219820Sjeff
425219820Sjeff	/*
426219820Sjeff	   For Multicast Groups, we want not to count on previous
427219820Sjeff	   configurations - since we can easily generate a storm
428219820Sjeff	   by loops.
429219820Sjeff	 */
430219820Sjeff	ignore_existing = TRUE;
431219820Sjeff
432219820Sjeff	/*
433219820Sjeff	   Subdivide the set of ports into non-overlapping subsets
434219820Sjeff	   that will be routed to other switches.
435219820Sjeff	 */
436219820Sjeff	while ((p_wobj =
437219820Sjeff		(osm_mcast_work_obj_t *) cl_qlist_remove_head(p_list)) !=
438219820Sjeff	       (osm_mcast_work_obj_t *) cl_qlist_end(p_list)) {
439219820Sjeff		port_num =
440219820Sjeff		    osm_switch_recommend_mcast_path(p_sw, p_wobj->p_port,
441219820Sjeff						    mlid_ho, ignore_existing);
442219820Sjeff
443219820Sjeff		if (port_num == OSM_NO_PATH) {
444219820Sjeff			/*
445219820Sjeff			   This typically occurs if the switch does not support
446219820Sjeff			   multicast and the multicast tree must branch at this
447219820Sjeff			   switch.
448219820Sjeff			 */
449219820Sjeff			uint64_t node_guid_ho =
450219820Sjeff			    cl_ntoh64(osm_node_get_node_guid(p_sw->p_node));
451219820Sjeff			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A03: "
452219820Sjeff				"Error routing MLID 0x%X through switch 0x%"
453219820Sjeff				PRIx64 "\n"
454219820Sjeff				"\t\t\t\tNo multicast paths from this switch for port "
455219820Sjeff				"with LID %u\n", mlid_ho, node_guid_ho,
456219820Sjeff				cl_ntoh16(osm_port_get_base_lid
457219820Sjeff					  (p_wobj->p_port)));
458219820Sjeff
459219820Sjeff			__osm_mcast_work_obj_delete(p_wobj);
460219820Sjeff			continue;
461219820Sjeff		}
462219820Sjeff
463219820Sjeff		if (port_num > array_size) {
464219820Sjeff			uint64_t node_guid_ho =
465219820Sjeff			    cl_ntoh64(osm_node_get_node_guid(p_sw->p_node));
466219820Sjeff			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A04: "
467219820Sjeff				"Error routing MLID 0x%X through switch 0x%"
468219820Sjeff				PRIx64 "\n"
469219820Sjeff				"\t\t\t\tNo multicast paths from this switch to port "
470219820Sjeff				"with LID %u\n", mlid_ho, node_guid_ho,
471219820Sjeff				cl_ntoh16(osm_port_get_base_lid
472219820Sjeff					  (p_wobj->p_port)));
473219820Sjeff
474219820Sjeff			__osm_mcast_work_obj_delete(p_wobj);
475219820Sjeff
476219820Sjeff			/* This is means OpenSM has a bug. */
477219820Sjeff			CL_ASSERT(FALSE);
478219820Sjeff			continue;
479219820Sjeff		}
480219820Sjeff
481219820Sjeff		cl_qlist_insert_tail(&list_array[port_num], &p_wobj->list_item);
482219820Sjeff	}
483219820Sjeff
484219820Sjeff	OSM_LOG_EXIT(sm->p_log);
485219820Sjeff}
486219820Sjeff
487219820Sjeff/**********************************************************************
488219820Sjeff **********************************************************************/
489219820Sjeffstatic void __osm_mcast_mgr_purge_list(osm_sm_t * sm, cl_qlist_t * const p_list)
490219820Sjeff{
491219820Sjeff	osm_mcast_work_obj_t *p_wobj;
492219820Sjeff
493219820Sjeff	OSM_LOG_ENTER(sm->p_log);
494219820Sjeff
495219820Sjeff	while ((p_wobj = (osm_mcast_work_obj_t *) cl_qlist_remove_head(p_list))
496219820Sjeff	       != (osm_mcast_work_obj_t *) cl_qlist_end(p_list)) {
497219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A06: "
498219820Sjeff			"Unable to route for port 0x%" PRIx64 "\n",
499219820Sjeff			osm_port_get_guid(p_wobj->p_port));
500219820Sjeff		__osm_mcast_work_obj_delete(p_wobj);
501219820Sjeff	}
502219820Sjeff
503219820Sjeff	OSM_LOG_EXIT(sm->p_log);
504219820Sjeff}
505219820Sjeff
506219820Sjeff/**********************************************************************
507219820Sjeff  This is the recursive function to compute the paths in the spanning
508219820Sjeff  tree that emanate from this switch.  On input, the p_list contains
509219820Sjeff  the group members that must be routed from this switch.
510219820Sjeff
511219820Sjeff  The function returns the newly created mtree node element.
512219820Sjeff**********************************************************************/
513219820Sjeffstatic osm_mtree_node_t *__osm_mcast_mgr_branch(osm_sm_t * sm,
514219820Sjeff						osm_mgrp_t * const p_mgrp,
515219820Sjeff						osm_switch_t * const p_sw,
516219820Sjeff						cl_qlist_t * const p_list,
517219820Sjeff						uint8_t depth,
518219820Sjeff						uint8_t const upstream_port,
519219820Sjeff						uint8_t * const p_max_depth)
520219820Sjeff{
521219820Sjeff	uint8_t max_children;
522219820Sjeff	osm_mtree_node_t *p_mtn = NULL;
523219820Sjeff	cl_qlist_t *list_array = NULL;
524219820Sjeff	uint8_t i;
525219820Sjeff	ib_net64_t node_guid;
526219820Sjeff	uint64_t node_guid_ho;
527219820Sjeff	osm_mcast_work_obj_t *p_wobj;
528219820Sjeff	cl_qlist_t *p_port_list;
529219820Sjeff	size_t count;
530219820Sjeff	uint16_t mlid_ho;
531219820Sjeff	osm_mcast_tbl_t *p_tbl;
532219820Sjeff
533219820Sjeff	OSM_LOG_ENTER(sm->p_log);
534219820Sjeff
535219820Sjeff	CL_ASSERT(p_sw);
536219820Sjeff	CL_ASSERT(p_list);
537219820Sjeff	CL_ASSERT(p_max_depth);
538219820Sjeff
539219820Sjeff	node_guid = osm_node_get_node_guid(p_sw->p_node);
540219820Sjeff	node_guid_ho = cl_ntoh64(node_guid);
541219820Sjeff	mlid_ho = cl_ntoh16(osm_mgrp_get_mlid(p_mgrp));
542219820Sjeff
543219820Sjeff	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
544219820Sjeff		"Routing MLID 0x%X through switch 0x%" PRIx64
545219820Sjeff		", %u nodes at depth %u\n",
546219820Sjeff		mlid_ho, node_guid_ho, cl_qlist_count(p_list), depth);
547219820Sjeff
548219820Sjeff	CL_ASSERT(cl_qlist_count(p_list) > 0);
549219820Sjeff
550219820Sjeff	depth++;
551219820Sjeff
552219820Sjeff	if (depth >= 64) {
553219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
554219820Sjeff			"Maximal hops number is reached for MLID 0x%x."
555219820Sjeff			" Break processing.", mlid_ho);
556219820Sjeff		__osm_mcast_mgr_purge_list(sm, p_list);
557219820Sjeff		goto Exit;
558219820Sjeff	}
559219820Sjeff
560219820Sjeff	if (depth > *p_max_depth) {
561219820Sjeff		CL_ASSERT(depth == *p_max_depth + 1);
562219820Sjeff		*p_max_depth = depth;
563219820Sjeff	}
564219820Sjeff
565219820Sjeff	if (osm_switch_supports_mcast(p_sw) == FALSE) {
566219820Sjeff		/*
567219820Sjeff		   This switch doesn't do multicast.  Clean-up.
568219820Sjeff		 */
569219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A14: "
570219820Sjeff			"Switch 0x%" PRIx64 " does not support multicast\n",
571219820Sjeff			node_guid_ho);
572219820Sjeff
573219820Sjeff		/*
574219820Sjeff		   Deallocate all the work objects on this branch of the tree.
575219820Sjeff		 */
576219820Sjeff		__osm_mcast_mgr_purge_list(sm, p_list);
577219820Sjeff		goto Exit;
578219820Sjeff	}
579219820Sjeff
580219820Sjeff	p_mtn = osm_mtree_node_new(p_sw);
581219820Sjeff	if (p_mtn == NULL) {
582219820Sjeff		/*
583219820Sjeff		   We are unable to continue routing down this
584219820Sjeff		   leg of the tree.  Clean-up.
585219820Sjeff		 */
586219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A15: "
587219820Sjeff			"Insufficient memory to build multicast tree\n");
588219820Sjeff
589219820Sjeff		/*
590219820Sjeff		   Deallocate all the work objects on this branch of the tree.
591219820Sjeff		 */
592219820Sjeff		__osm_mcast_mgr_purge_list(sm, p_list);
593219820Sjeff		goto Exit;
594219820Sjeff	}
595219820Sjeff
596219820Sjeff	max_children = osm_mtree_node_get_max_children(p_mtn);
597219820Sjeff
598219820Sjeff	CL_ASSERT(max_children > 1);
599219820Sjeff
600219820Sjeff	/*
601219820Sjeff	   Prepare an empty list for each port in the switch.
602219820Sjeff	   TO DO - this list array could probably be moved
603219820Sjeff	   inside the switch element to save on malloc thrashing.
604219820Sjeff	 */
605219820Sjeff	list_array = malloc(sizeof(cl_qlist_t) * max_children);
606219820Sjeff	if (list_array == NULL) {
607219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A16: "
608219820Sjeff			"Unable to allocate list array\n");
609219820Sjeff		__osm_mcast_mgr_purge_list(sm, p_list);
610219820Sjeff		goto Exit;
611219820Sjeff	}
612219820Sjeff
613219820Sjeff	memset(list_array, 0, sizeof(cl_qlist_t) * max_children);
614219820Sjeff
615219820Sjeff	for (i = 0; i < max_children; i++)
616219820Sjeff		cl_qlist_init(&list_array[i]);
617219820Sjeff
618219820Sjeff	__osm_mcast_mgr_subdivide(sm, p_mgrp, p_sw, p_list, list_array,
619219820Sjeff				  max_children);
620219820Sjeff
621219820Sjeff	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
622219820Sjeff
623219820Sjeff	/*
624219820Sjeff	   Add the upstream port to the forwarding table unless
625219820Sjeff	   we're at the root of the spanning tree.
626219820Sjeff	 */
627219820Sjeff	if (depth > 1) {
628219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
629219820Sjeff			"Adding upstream port %u\n", upstream_port);
630219820Sjeff
631219820Sjeff		CL_ASSERT(upstream_port);
632219820Sjeff		osm_mcast_tbl_set(p_tbl, mlid_ho, upstream_port);
633219820Sjeff	}
634219820Sjeff
635219820Sjeff	/*
636219820Sjeff	   For each port that was allocated some routes,
637219820Sjeff	   recurse into this function to continue building the tree
638219820Sjeff	   if the node on the other end of that port is another switch.
639219820Sjeff	   Otherwise, the node is an endpoint, and we've found a leaf
640219820Sjeff	   of the tree.  Mark leaves with our special pointer value.
641219820Sjeff	 */
642219820Sjeff
643219820Sjeff	for (i = 0; i < max_children; i++) {
644219820Sjeff		const osm_physp_t *p_physp;
645219820Sjeff		const osm_physp_t *p_remote_physp;
646219820Sjeff		osm_node_t *p_node;
647219820Sjeff		const osm_node_t *p_remote_node;
648219820Sjeff
649219820Sjeff		p_port_list = &list_array[i];
650219820Sjeff
651219820Sjeff		count = cl_qlist_count(p_port_list);
652219820Sjeff
653219820Sjeff		/*
654219820Sjeff		   There should be no children routed through the upstream port!
655219820Sjeff		 */
656219820Sjeff		CL_ASSERT((upstream_port == 0) || (i != upstream_port) ||
657219820Sjeff			  ((i == upstream_port) && (count == 0)));
658219820Sjeff
659219820Sjeff		if (count == 0)
660219820Sjeff			continue;	/* No routes down this port. */
661219820Sjeff
662219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
663219820Sjeff			"Routing %zu destinations via switch port %u\n",
664219820Sjeff			count, i);
665219820Sjeff
666219820Sjeff		/*
667219820Sjeff		   This port routes frames for this mcast group.  Therefore,
668219820Sjeff		   set the appropriate bit in the multicast forwarding
669219820Sjeff		   table for this switch.
670219820Sjeff		 */
671219820Sjeff		osm_mcast_tbl_set(p_tbl, mlid_ho, i);
672219820Sjeff		if (i == 0) {
673219820Sjeff			/* This means we are adding the switch to the MC group.
674219820Sjeff			   We do not need to continue looking at the remote port, just
675219820Sjeff			   needed to add the port to the table */
676219820Sjeff			CL_ASSERT(count == 1);
677219820Sjeff
678219820Sjeff			p_wobj = (osm_mcast_work_obj_t *)
679219820Sjeff			    cl_qlist_remove_head(p_port_list);
680219820Sjeff			__osm_mcast_work_obj_delete(p_wobj);
681219820Sjeff			continue;
682219820Sjeff		}
683219820Sjeff
684219820Sjeff		p_node = p_sw->p_node;
685219820Sjeff		p_remote_node = osm_node_get_remote_node(p_node, i, NULL);
686219820Sjeff		if (!p_remote_node)
687219820Sjeff			continue;
688219820Sjeff
689219820Sjeff		if (osm_node_get_type(p_remote_node) == IB_NODE_TYPE_SWITCH) {
690219820Sjeff			/*
691219820Sjeff			   Acquire a pointer to the remote switch then recurse.
692219820Sjeff			 */
693219820Sjeff			CL_ASSERT(p_remote_node->sw);
694219820Sjeff
695219820Sjeff			p_physp = osm_node_get_physp_ptr(p_node, i);
696219820Sjeff			CL_ASSERT(p_physp);
697219820Sjeff
698219820Sjeff			p_remote_physp = osm_physp_get_remote(p_physp);
699219820Sjeff			CL_ASSERT(p_remote_physp);
700219820Sjeff
701219820Sjeff			p_mtn->child_array[i] =
702219820Sjeff			    __osm_mcast_mgr_branch(sm, p_mgrp,
703219820Sjeff						   p_remote_node->sw,
704219820Sjeff						   p_port_list, depth,
705219820Sjeff						   osm_physp_get_port_num
706219820Sjeff						   (p_remote_physp),
707219820Sjeff						   p_max_depth);
708219820Sjeff		} else {
709219820Sjeff			/*
710219820Sjeff			   The neighbor node is not a switch, so this
711219820Sjeff			   must be a leaf.
712219820Sjeff			 */
713219820Sjeff			CL_ASSERT(count == 1);
714219820Sjeff
715219820Sjeff			p_mtn->child_array[i] = OSM_MTREE_LEAF;
716219820Sjeff			p_wobj = (osm_mcast_work_obj_t *)
717219820Sjeff			    cl_qlist_remove_head(p_port_list);
718219820Sjeff
719219820Sjeff			CL_ASSERT(cl_is_qlist_empty(p_port_list));
720219820Sjeff
721219820Sjeff			OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
722219820Sjeff				"Found leaf for port 0x%016" PRIx64
723219820Sjeff				" on switch port %u\n",
724219820Sjeff				cl_ntoh64(osm_port_get_guid(p_wobj->p_port)),
725219820Sjeff				i);
726219820Sjeff
727219820Sjeff			__osm_mcast_work_obj_delete(p_wobj);
728219820Sjeff		}
729219820Sjeff	}
730219820Sjeff
731219820Sjeff	free(list_array);
732219820SjeffExit:
733219820Sjeff	OSM_LOG_EXIT(sm->p_log);
734219820Sjeff	return (p_mtn);
735219820Sjeff}
736219820Sjeff
737219820Sjeff/**********************************************************************
738219820Sjeff **********************************************************************/
739219820Sjeffstatic ib_api_status_t
740219820Sjeff__osm_mcast_mgr_build_spanning_tree(osm_sm_t * sm, osm_mgrp_t * const p_mgrp)
741219820Sjeff{
742219820Sjeff	const cl_qmap_t *p_mcm_tbl;
743219820Sjeff	const osm_port_t *p_port;
744219820Sjeff	const osm_mcm_port_t *p_mcm_port;
745219820Sjeff	uint32_t num_ports;
746219820Sjeff	cl_qlist_t port_list;
747219820Sjeff	osm_switch_t *p_sw;
748219820Sjeff	osm_mcast_work_obj_t *p_wobj;
749219820Sjeff	ib_api_status_t status = IB_SUCCESS;
750219820Sjeff	uint8_t max_depth = 0;
751219820Sjeff	uint32_t count;
752219820Sjeff
753219820Sjeff	OSM_LOG_ENTER(sm->p_log);
754219820Sjeff
755219820Sjeff	cl_qlist_init(&port_list);
756219820Sjeff
757219820Sjeff	/*
758219820Sjeff	   TO DO - for now, just blow away the old tree.
759219820Sjeff	   In the future we'll need to construct the tree based
760219820Sjeff	   on multicast forwarding table information if the user wants to
761219820Sjeff	   preserve existing multicast routes.
762219820Sjeff	 */
763219820Sjeff	__osm_mcast_mgr_purge_tree(sm, p_mgrp);
764219820Sjeff
765219820Sjeff	p_mcm_tbl = &p_mgrp->mcm_port_tbl;
766219820Sjeff	num_ports = cl_qmap_count(p_mcm_tbl);
767219820Sjeff	if (num_ports == 0) {
768219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
769219820Sjeff			"MLID 0x%X has no members - nothing to do\n",
770219820Sjeff			cl_ntoh16(osm_mgrp_get_mlid(p_mgrp)));
771219820Sjeff		goto Exit;
772219820Sjeff	}
773219820Sjeff
774219820Sjeff	/*
775219820Sjeff	   This function builds the single spanning tree recursively.
776219820Sjeff	   At each stage, the ports to be reached are divided into
777219820Sjeff	   non-overlapping subsets of member ports that can be reached through
778219820Sjeff	   a given switch port.  Construction then moves down each
779219820Sjeff	   branch, and the process starts again with each branch computing
780219820Sjeff	   for its own subset of the member ports.
781219820Sjeff
782219820Sjeff	   The maximum recursion depth is at worst the maximum hop count in the
783219820Sjeff	   subnet, which is spec limited to 64.
784219820Sjeff	 */
785219820Sjeff
786219820Sjeff	/*
787219820Sjeff	   Locate the switch around which to create the spanning
788219820Sjeff	   tree for this multicast group.
789219820Sjeff	 */
790219820Sjeff	p_sw = __osm_mcast_mgr_find_root_switch(sm, p_mgrp);
791219820Sjeff	if (p_sw == NULL) {
792219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A08: "
793219820Sjeff			"Unable to locate a suitable switch for group 0x%X\n",
794219820Sjeff			cl_ntoh16(osm_mgrp_get_mlid(p_mgrp)));
795219820Sjeff		status = IB_ERROR;
796219820Sjeff		goto Exit;
797219820Sjeff	}
798219820Sjeff
799219820Sjeff	/*
800219820Sjeff	   Build the first "subset" containing all member ports.
801219820Sjeff	 */
802219820Sjeff	for (p_mcm_port = (osm_mcm_port_t *) cl_qmap_head(p_mcm_tbl);
803219820Sjeff	     p_mcm_port != (osm_mcm_port_t *) cl_qmap_end(p_mcm_tbl);
804219820Sjeff	     p_mcm_port =
805219820Sjeff	     (osm_mcm_port_t *) cl_qmap_next(&p_mcm_port->map_item)) {
806219820Sjeff		/*
807219820Sjeff		   Acquire the port object for this port guid, then create
808219820Sjeff		   the new worker object to build the list.
809219820Sjeff		 */
810219820Sjeff		p_port = osm_get_port_by_guid(sm->p_subn,
811219820Sjeff					      ib_gid_get_guid(&p_mcm_port->
812219820Sjeff							      port_gid));
813219820Sjeff		if (!p_port) {
814219820Sjeff			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A09: "
815219820Sjeff				"No port object for port 0x%016" PRIx64 "\n",
816219820Sjeff				cl_ntoh64(ib_gid_get_guid
817219820Sjeff					  (&p_mcm_port->port_gid)));
818219820Sjeff			continue;
819219820Sjeff		}
820219820Sjeff
821219820Sjeff		p_wobj = __osm_mcast_work_obj_new(p_port);
822219820Sjeff		if (p_wobj == NULL) {
823219820Sjeff			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A10: "
824219820Sjeff				"Insufficient memory to route port 0x%016"
825219820Sjeff				PRIx64 "\n",
826219820Sjeff				cl_ntoh64(osm_port_get_guid(p_port)));
827219820Sjeff			continue;
828219820Sjeff		}
829219820Sjeff
830219820Sjeff		cl_qlist_insert_tail(&port_list, &p_wobj->list_item);
831219820Sjeff	}
832219820Sjeff
833219820Sjeff	count = cl_qlist_count(&port_list);
834219820Sjeff	p_mgrp->p_root = __osm_mcast_mgr_branch(sm, p_mgrp, p_sw,
835219820Sjeff						&port_list, 0, 0, &max_depth);
836219820Sjeff
837219820Sjeff	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
838219820Sjeff		"Configured MLID 0x%X for %u ports, max tree depth = %u\n",
839219820Sjeff		cl_ntoh16(osm_mgrp_get_mlid(p_mgrp)), count, max_depth);
840219820Sjeff
841219820SjeffExit:
842219820Sjeff	OSM_LOG_EXIT(sm->p_log);
843219820Sjeff	return (status);
844219820Sjeff}
845219820Sjeff
846219820Sjeff#if 0
847219820Sjeff/* unused */
848219820Sjeff/**********************************************************************
849219820Sjeff **********************************************************************/
850219820Sjeffvoid
851219820Sjeffosm_mcast_mgr_set_table(osm_sm_t * sm,
852219820Sjeff			IN const osm_mgrp_t * const p_mgrp,
853219820Sjeff			IN const osm_mtree_node_t * const p_mtn)
854219820Sjeff{
855219820Sjeff	uint8_t i;
856219820Sjeff	uint8_t max_children;
857219820Sjeff	osm_mtree_node_t *p_child_mtn;
858219820Sjeff	uint16_t mlid_ho;
859219820Sjeff	osm_mcast_tbl_t *p_tbl;
860219820Sjeff	osm_switch_t *p_sw;
861219820Sjeff
862219820Sjeff	OSM_LOG_ENTER(sm->p_log);
863219820Sjeff
864219820Sjeff	mlid_ho = cl_ntoh16(osm_mgrp_get_mlid(p_mgrp));
865219820Sjeff	p_sw = osm_mtree_node_get_switch_ptr(p_mtn);
866219820Sjeff
867219820Sjeff	CL_ASSERT(p_sw);
868219820Sjeff
869219820Sjeff	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
870219820Sjeff		"Configuring MLID 0x%X on switch 0x%" PRIx64 "\n",
871219820Sjeff		mlid_ho, osm_node_get_node_guid(p_sw->p_node));
872219820Sjeff
873219820Sjeff	/*
874219820Sjeff	   For every child of this tree node, set the corresponding
875219820Sjeff	   bit in the switch's mcast table.
876219820Sjeff	 */
877219820Sjeff	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
878219820Sjeff	max_children = osm_mtree_node_get_max_children(p_mtn);
879219820Sjeff
880219820Sjeff	CL_ASSERT(max_children <= osm_switch_get_num_ports(p_sw));
881219820Sjeff
882219820Sjeff	osm_mcast_tbl_clear_mlid(p_tbl, mlid_ho);
883219820Sjeff
884219820Sjeff	for (i = 0; i < max_children; i++) {
885219820Sjeff		p_child_mtn = osm_mtree_node_get_child(p_mtn, i);
886219820Sjeff		if (p_child_mtn == NULL)
887219820Sjeff			continue;
888219820Sjeff
889219820Sjeff		osm_mcast_tbl_set(p_tbl, mlid_ho, i);
890219820Sjeff	}
891219820Sjeff
892219820Sjeff	OSM_LOG_EXIT(sm->p_log);
893219820Sjeff}
894219820Sjeff#endif
895219820Sjeff
896219820Sjeff/**********************************************************************
897219820Sjeff **********************************************************************/
898219820Sjeffstatic void __osm_mcast_mgr_clear(osm_sm_t * sm, IN osm_mgrp_t * const p_mgrp)
899219820Sjeff{
900219820Sjeff	osm_switch_t *p_sw;
901219820Sjeff	cl_qmap_t *p_sw_tbl;
902219820Sjeff	osm_mcast_tbl_t *p_mcast_tbl;
903219820Sjeff
904219820Sjeff	OSM_LOG_ENTER(sm->p_log);
905219820Sjeff
906219820Sjeff	/*
907219820Sjeff	   Walk the switches and clear the routing entries for
908219820Sjeff	   this MLID.
909219820Sjeff	 */
910219820Sjeff	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
911219820Sjeff	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
912219820Sjeff	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
913219820Sjeff		p_mcast_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
914219820Sjeff		osm_mcast_tbl_clear_mlid(p_mcast_tbl, cl_ntoh16(p_mgrp->mlid));
915219820Sjeff		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
916219820Sjeff	}
917219820Sjeff
918219820Sjeff	OSM_LOG_EXIT(sm->p_log);
919219820Sjeff}
920219820Sjeff
921219820Sjeff#if 0
922219820Sjeff/* TO DO - make this real -- at least update spanning tree */
923219820Sjeff/**********************************************************************
924219820Sjeff   Lock must be held on entry.
925219820Sjeff**********************************************************************/
926219820Sjeffib_api_status_t
927219820Sjeffosm_mcast_mgr_process_single(osm_sm_t * sm,
928219820Sjeff			     IN ib_net16_t const mlid,
929219820Sjeff			     IN ib_net64_t const port_guid,
930219820Sjeff			     IN uint8_t const join_state)
931219820Sjeff{
932219820Sjeff	uint8_t port_num;
933219820Sjeff	uint16_t mlid_ho;
934219820Sjeff	ib_net64_t sw_guid;
935219820Sjeff	osm_port_t *p_port;
936219820Sjeff	osm_physp_t *p_physp;
937219820Sjeff	osm_physp_t *p_remote_physp;
938219820Sjeff	osm_node_t *p_remote_node;
939219820Sjeff	osm_mcast_tbl_t *p_mcast_tbl;
940219820Sjeff	ib_api_status_t status = IB_SUCCESS;
941219820Sjeff
942219820Sjeff	OSM_LOG_ENTER(sm->p_log);
943219820Sjeff
944219820Sjeff	CL_ASSERT(mlid);
945219820Sjeff	CL_ASSERT(port_guid);
946219820Sjeff
947219820Sjeff	mlid_ho = cl_ntoh16(mlid);
948219820Sjeff
949219820Sjeff	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
950219820Sjeff		"Attempting to add port 0x%" PRIx64 " to MLID 0x%X, "
951219820Sjeff		"\n\t\t\t\tjoin state = 0x%X\n",
952219820Sjeff		cl_ntoh64(port_guid), mlid_ho, join_state);
953219820Sjeff
954219820Sjeff	/*
955219820Sjeff	   Acquire the Port object.
956219820Sjeff	 */
957219820Sjeff	p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
958219820Sjeff	if (!p_port) {
959219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A01: "
960219820Sjeff			"Unable to acquire port object for 0x%" PRIx64 "\n",
961219820Sjeff			cl_ntoh64(port_guid));
962219820Sjeff		status = IB_ERROR;
963219820Sjeff		goto Exit;
964219820Sjeff	}
965219820Sjeff
966219820Sjeff	p_physp = p_port->p_physp;
967219820Sjeff	if (p_physp == NULL) {
968219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A05: "
969219820Sjeff			"Unable to acquire phsyical port object for 0x%" PRIx64
970219820Sjeff			"\n", cl_ntoh64(port_guid));
971219820Sjeff		status = IB_ERROR;
972219820Sjeff		goto Exit;
973219820Sjeff	}
974219820Sjeff
975219820Sjeff	p_remote_physp = osm_physp_get_remote(p_physp);
976219820Sjeff	if (p_remote_physp == NULL) {
977219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A11: "
978219820Sjeff			"Unable to acquire remote phsyical port object "
979219820Sjeff			"for 0x%" PRIx64 "\n", cl_ntoh64(port_guid));
980219820Sjeff		status = IB_ERROR;
981219820Sjeff		goto Exit;
982219820Sjeff	}
983219820Sjeff
984219820Sjeff	p_remote_node = osm_physp_get_node_ptr(p_remote_physp);
985219820Sjeff
986219820Sjeff	CL_ASSERT(p_remote_node);
987219820Sjeff
988219820Sjeff	sw_guid = osm_node_get_node_guid(p_remote_node);
989219820Sjeff
990219820Sjeff	if (osm_node_get_type(p_remote_node) != IB_NODE_TYPE_SWITCH) {
991219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A22: "
992219820Sjeff			"Remote node not a switch node 0x%" PRIx64 "\n",
993219820Sjeff			cl_ntoh64(sw_guid));
994219820Sjeff		status = IB_ERROR;
995219820Sjeff		goto Exit;
996219820Sjeff	}
997219820Sjeff
998219820Sjeff	if (!p_remote_node->sw) {
999219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A12: "
1000219820Sjeff			"No switch object 0x%" PRIx64 "\n", cl_ntoh64(sw_guid));
1001219820Sjeff		status = IB_ERROR;
1002219820Sjeff		goto Exit;
1003219820Sjeff	}
1004219820Sjeff
1005219820Sjeff	if (osm_switch_is_in_mcast_tree(p_remote_node->sw, mlid_ho)) {
1006219820Sjeff		/*
1007219820Sjeff		   We're in luck. The switch attached to this port
1008219820Sjeff		   is already in the multicast group, so we can just
1009219820Sjeff		   add the specified port as a new leaf of the tree.
1010219820Sjeff		 */
1011219820Sjeff		if (join_state & (IB_JOIN_STATE_FULL | IB_JOIN_STATE_NON)) {
1012219820Sjeff			/*
1013219820Sjeff			   This node wants to receive multicast frames.
1014219820Sjeff			   Get the switch port number to which the new member port
1015219820Sjeff			   is attached, then configure this single mcast table.
1016219820Sjeff			 */
1017219820Sjeff			port_num = osm_physp_get_port_num(p_remote_physp);
1018219820Sjeff			CL_ASSERT(port_num);
1019219820Sjeff
1020219820Sjeff			p_mcast_tbl =
1021219820Sjeff			    osm_switch_get_mcast_tbl_ptr(p_remote_node->sw);
1022219820Sjeff			osm_mcast_tbl_set(p_mcast_tbl, mlid_ho, port_num);
1023219820Sjeff		} else {
1024219820Sjeff			if (join_state & IB_JOIN_STATE_SEND_ONLY)
1025219820Sjeff				OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1026219820Sjeff					"Success.  Nothing to do for send"
1027219820Sjeff					"only member\n");
1028219820Sjeff			else {
1029219820Sjeff				OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A13: "
1030219820Sjeff					"Unknown join state 0x%X\n",
1031219820Sjeff					join_state);
1032219820Sjeff				status = IB_ERROR;
1033219820Sjeff				goto Exit;
1034219820Sjeff			}
1035219820Sjeff		}
1036219820Sjeff	} else
1037219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Unable to add port\n");
1038219820Sjeff
1039219820SjeffExit:
1040219820Sjeff	OSM_LOG_EXIT(sm->p_log);
1041219820Sjeff	return (status);
1042219820Sjeff}
1043219820Sjeff#endif
1044219820Sjeff
1045219820Sjeff/**********************************************************************
1046219820Sjeff   lock must already be held on entry
1047219820Sjeff**********************************************************************/
1048219820Sjeffstatic ib_api_status_t
1049219820Sjeffosm_mcast_mgr_process_tree(osm_sm_t * sm,
1050219820Sjeff			   IN osm_mgrp_t * const p_mgrp,
1051219820Sjeff			   IN osm_mcast_req_type_t req_type,
1052219820Sjeff			   ib_net64_t port_guid)
1053219820Sjeff{
1054219820Sjeff	ib_api_status_t status = IB_SUCCESS;
1055219820Sjeff	ib_net16_t mlid;
1056219820Sjeff
1057219820Sjeff	OSM_LOG_ENTER(sm->p_log);
1058219820Sjeff
1059219820Sjeff	mlid = osm_mgrp_get_mlid(p_mgrp);
1060219820Sjeff
1061219820Sjeff	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1062219820Sjeff		"Processing multicast group 0x%X\n", cl_ntoh16(mlid));
1063219820Sjeff
1064219820Sjeff	/*
1065219820Sjeff	   If there are no switches in the subnet, then we have nothing to do.
1066219820Sjeff	 */
1067219820Sjeff	if (cl_qmap_count(&sm->p_subn->sw_guid_tbl) == 0) {
1068219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1069219820Sjeff			"No switches in subnet. Nothing to do\n");
1070219820Sjeff		goto Exit;
1071219820Sjeff	}
1072219820Sjeff
1073219820Sjeff	/*
1074219820Sjeff	   Clear the multicast tables to start clean, then build
1075219820Sjeff	   the spanning tree which sets the mcast table bits for each
1076219820Sjeff	   port in the group.
1077219820Sjeff	 */
1078219820Sjeff	__osm_mcast_mgr_clear(sm, p_mgrp);
1079219820Sjeff
1080219820Sjeff	if (!p_mgrp->full_members)
1081219820Sjeff		goto Exit;
1082219820Sjeff
1083219820Sjeff	status = __osm_mcast_mgr_build_spanning_tree(sm, p_mgrp);
1084219820Sjeff	if (status != IB_SUCCESS) {
1085219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A17: "
1086219820Sjeff			"Unable to create spanning tree (%s)\n",
1087219820Sjeff			ib_get_err_str(status));
1088219820Sjeff		goto Exit;
1089219820Sjeff	}
1090219820Sjeff
1091219820SjeffExit:
1092219820Sjeff	OSM_LOG_EXIT(sm->p_log);
1093219820Sjeff	return (status);
1094219820Sjeff}
1095219820Sjeff
1096219820Sjeff/**********************************************************************
1097219820Sjeff Process the entire group.
1098219820Sjeff NOTE : The lock should be held externally!
1099219820Sjeff **********************************************************************/
1100219820Sjeffstatic ib_api_status_t
1101219820Sjeffmcast_mgr_process_mgrp(osm_sm_t * sm,
1102219820Sjeff		       IN osm_mgrp_t * const p_mgrp,
1103219820Sjeff		       IN osm_mcast_req_type_t req_type,
1104219820Sjeff		       IN ib_net64_t port_guid)
1105219820Sjeff{
1106219820Sjeff	ib_api_status_t status;
1107219820Sjeff
1108219820Sjeff	OSM_LOG_ENTER(sm->p_log);
1109219820Sjeff
1110219820Sjeff	status = osm_mcast_mgr_process_tree(sm, p_mgrp, req_type, port_guid);
1111219820Sjeff	if (status != IB_SUCCESS) {
1112219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A19: "
1113219820Sjeff			"Unable to create spanning tree (%s)\n",
1114219820Sjeff			ib_get_err_str(status));
1115219820Sjeff		goto Exit;
1116219820Sjeff	}
1117219820Sjeff	p_mgrp->last_tree_id = p_mgrp->last_change_id;
1118219820Sjeff
1119219820Sjeff	/* remove MCGRP if it is marked for deletion */
1120219820Sjeff	if (p_mgrp->to_be_deleted) {
1121219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1122219820Sjeff			"Destroying mgrp with lid:0x%x\n",
1123219820Sjeff			cl_ntoh16(p_mgrp->mlid));
1124219820Sjeff		sm->p_subn->mgroups[cl_ntoh16(p_mgrp->mlid) - IB_LID_MCAST_START_HO] = NULL;
1125219820Sjeff		osm_mgrp_delete(p_mgrp);
1126219820Sjeff	}
1127219820Sjeff
1128219820SjeffExit:
1129219820Sjeff	OSM_LOG_EXIT(sm->p_log);
1130219820Sjeff	return status;
1131219820Sjeff}
1132219820Sjeff
1133219820Sjeff/**********************************************************************
1134219820Sjeff **********************************************************************/
1135219820Sjeffosm_signal_t osm_mcast_mgr_process(osm_sm_t * sm)
1136219820Sjeff{
1137219820Sjeff	osm_signal_t signal;
1138219820Sjeff	osm_switch_t *p_sw;
1139219820Sjeff	cl_qmap_t *p_sw_tbl;
1140219820Sjeff	cl_qlist_t *p_list = &sm->mgrp_list;
1141219820Sjeff	osm_mgrp_t *p_mgrp;
1142219820Sjeff	boolean_t pending_transactions = FALSE;
1143219820Sjeff	int i;
1144219820Sjeff
1145219820Sjeff	OSM_LOG_ENTER(sm->p_log);
1146219820Sjeff
1147219820Sjeff	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
1148219820Sjeff	/*
1149219820Sjeff	   While holding the lock, iterate over all the established
1150219820Sjeff	   multicast groups, servicing each in turn.
1151219820Sjeff
1152219820Sjeff	   Then, download the multicast tables to the switches.
1153219820Sjeff	 */
1154219820Sjeff	CL_PLOCK_EXCL_ACQUIRE(sm->p_lock);
1155219820Sjeff
1156219820Sjeff	for (i = 0; i <= sm->p_subn->max_mcast_lid_ho - IB_LID_MCAST_START_HO;
1157219820Sjeff	     i++) {
1158219820Sjeff		/*
1159219820Sjeff		   We reached here due to some change that caused a heavy sweep
1160219820Sjeff		   of the subnet. Not due to a specific multicast request.
1161219820Sjeff		   So the request type is subnet_change and the port guid is 0.
1162219820Sjeff		 */
1163219820Sjeff		p_mgrp = sm->p_subn->mgroups[i];
1164219820Sjeff		if (p_mgrp)
1165219820Sjeff			mcast_mgr_process_mgrp(sm, p_mgrp,
1166219820Sjeff					       OSM_MCAST_REQ_TYPE_SUBNET_CHANGE,
1167219820Sjeff					       0);
1168219820Sjeff	}
1169219820Sjeff
1170219820Sjeff	/*
1171219820Sjeff	   Walk the switches and download the tables for each.
1172219820Sjeff	 */
1173219820Sjeff	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
1174219820Sjeff	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
1175219820Sjeff		signal = __osm_mcast_mgr_set_tbl(sm, p_sw);
1176219820Sjeff		if (signal == OSM_SIGNAL_DONE_PENDING)
1177219820Sjeff			pending_transactions = TRUE;
1178219820Sjeff		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
1179219820Sjeff	}
1180219820Sjeff
1181219820Sjeff	while (!cl_is_qlist_empty(p_list)) {
1182219820Sjeff		cl_list_item_t *p = cl_qlist_remove_head(p_list);
1183219820Sjeff		free(p);
1184219820Sjeff	}
1185219820Sjeff
1186219820Sjeff	CL_PLOCK_RELEASE(sm->p_lock);
1187219820Sjeff
1188219820Sjeff	OSM_LOG_EXIT(sm->p_log);
1189219820Sjeff
1190219820Sjeff	if (pending_transactions == TRUE)
1191219820Sjeff		return (OSM_SIGNAL_DONE_PENDING);
1192219820Sjeff	else
1193219820Sjeff		return (OSM_SIGNAL_DONE);
1194219820Sjeff}
1195219820Sjeff
1196219820Sjeff/**********************************************************************
1197219820Sjeff  This is the function that is invoked during idle time to handle the
1198219820Sjeff  process request for mcast groups where join/leave/delete was required.
1199219820Sjeff **********************************************************************/
1200219820Sjeffosm_signal_t osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
1201219820Sjeff{
1202219820Sjeff	cl_qlist_t *p_list = &sm->mgrp_list;
1203219820Sjeff	osm_switch_t *p_sw;
1204219820Sjeff	cl_qmap_t *p_sw_tbl;
1205219820Sjeff	osm_mgrp_t *p_mgrp;
1206219820Sjeff	ib_net16_t mlid;
1207219820Sjeff	osm_signal_t ret, signal = OSM_SIGNAL_DONE;
1208219820Sjeff	osm_mcast_mgr_ctxt_t *ctx;
1209219820Sjeff	osm_mcast_req_type_t req_type;
1210219820Sjeff	ib_net64_t port_guid;
1211219820Sjeff
1212219820Sjeff	OSM_LOG_ENTER(sm->p_log);
1213219820Sjeff
1214219820Sjeff	/* we need a lock to make sure the p_mgrp is not change other ways */
1215219820Sjeff	CL_PLOCK_EXCL_ACQUIRE(sm->p_lock);
1216219820Sjeff
1217219820Sjeff	while (!cl_is_qlist_empty(p_list)) {
1218219820Sjeff		ctx = (osm_mcast_mgr_ctxt_t *) cl_qlist_remove_head(p_list);
1219219820Sjeff		req_type = ctx->req_type;
1220219820Sjeff		port_guid = ctx->port_guid;
1221219820Sjeff
1222219820Sjeff		/* nice copy no warning on size diff */
1223219820Sjeff		memcpy(&mlid, &ctx->mlid, sizeof(mlid));
1224219820Sjeff
1225219820Sjeff		/* we can destroy the context now */
1226219820Sjeff		free(ctx);
1227219820Sjeff
1228219820Sjeff		/* since we delayed the execution we prefer to pass the
1229219820Sjeff		   mlid as the mgrp identifier and then find it or abort */
1230219820Sjeff		p_mgrp = osm_get_mgrp_by_mlid(sm->p_subn, mlid);
1231219820Sjeff		if (!p_mgrp)
1232219820Sjeff			continue;
1233219820Sjeff
1234219820Sjeff		/* if there was no change from the last time
1235219820Sjeff		 * we processed the group we can skip doing anything
1236219820Sjeff		 */
1237219820Sjeff		if (p_mgrp->last_change_id == p_mgrp->last_tree_id) {
1238219820Sjeff			OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1239219820Sjeff				"Skip processing mgrp with lid:0x%X change id:%u\n",
1240219820Sjeff				cl_ntoh16(mlid), p_mgrp->last_change_id);
1241219820Sjeff			continue;
1242219820Sjeff		}
1243219820Sjeff
1244219820Sjeff		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1245219820Sjeff			"Processing mgrp with lid:0x%X change id:%u\n",
1246219820Sjeff			cl_ntoh16(mlid), p_mgrp->last_change_id);
1247219820Sjeff		mcast_mgr_process_mgrp(sm, p_mgrp, req_type, port_guid);
1248219820Sjeff	}
1249219820Sjeff
1250219820Sjeff	/*
1251219820Sjeff	   Walk the switches and download the tables for each.
1252219820Sjeff	 */
1253219820Sjeff	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
1254219820Sjeff	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
1255219820Sjeff	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
1256219820Sjeff		ret = __osm_mcast_mgr_set_tbl(sm, p_sw);
1257219820Sjeff		if (ret == OSM_SIGNAL_DONE_PENDING)
1258219820Sjeff			signal = ret;
1259219820Sjeff		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
1260219820Sjeff	}
1261219820Sjeff
1262219820Sjeff	osm_dump_mcast_routes(sm->p_subn->p_osm);
1263219820Sjeff
1264219820Sjeff	CL_PLOCK_RELEASE(sm->p_lock);
1265219820Sjeff	OSM_LOG_EXIT(sm->p_log);
1266219820Sjeff	return signal;
1267219820Sjeff}
1268