1/*
2 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2011 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5 * Copyright (c) 2008 Xsigo Systems Inc.  All rights reserved.
6 * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
7 * Copyright (c) 2010 HNR Consulting. All rights reserved.
8 * Copyright (C) 2012-2013 Tokyo Institute of Technology. All rights reserved.
9 *
10 * This software is available to you under a choice of one of two
11 * licenses.  You may choose to be licensed under the terms of the GNU
12 * General Public License (GPL) Version 2, available from the file
13 * COPYING in the main directory of this source tree, or the
14 * OpenIB.org BSD license below:
15 *
16 *     Redistribution and use in source and binary forms, with or
17 *     without modification, are permitted provided that the following
18 *     conditions are met:
19 *
20 *      - Redistributions of source code must retain the above
21 *        copyright notice, this list of conditions and the following
22 *        disclaimer.
23 *
24 *      - Redistributions in binary form must reproduce the above
25 *        copyright notice, this list of conditions and the following
26 *        disclaimer in the documentation and/or other materials
27 *        provided with the distribution.
28 *
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
30 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
31 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
33 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
34 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
35 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
36 * SOFTWARE.
37 *
38 */
39
40/*
41 * Abstract:
42 *    Implementation of osm_mcast_mgr_t.
43 * This file implements the Multicast Manager object.
44 */
45
46#if HAVE_CONFIG_H
47#  include <config.h>
48#endif				/* HAVE_CONFIG_H */
49
50#include <stdlib.h>
51#include <string.h>
52#include <iba/ib_types.h>
53#include <complib/cl_debug.h>
54#include <opensm/osm_file_ids.h>
55#define FILE_ID OSM_FILE_MCAST_MGR_C
56#include <opensm/osm_opensm.h>
57#include <opensm/osm_sm.h>
58#include <opensm/osm_multicast.h>
59#include <opensm/osm_node.h>
60#include <opensm/osm_switch.h>
61#include <opensm/osm_helper.h>
62#include <opensm/osm_msgdef.h>
63#include <opensm/osm_mcast_mgr.h>
64
65static osm_mcast_work_obj_t *mcast_work_obj_new(IN osm_port_t * p_port)
66{
67	osm_mcast_work_obj_t *p_obj;
68
69	/*
70	   clean allocated memory to avoid assertion when trying to insert to
71	   qlist.
72	   see cl_qlist_insert_tail(): CL_ASSERT(p_list_item->p_list != p_list)
73	 */
74	p_obj = malloc(sizeof(*p_obj));
75	if (p_obj) {
76		memset(p_obj, 0, sizeof(*p_obj));
77		p_obj->p_port = p_port;
78	}
79
80	return p_obj;
81}
82
/* Release a work object created by mcast_work_obj_new().
   free(NULL) is a no-op, so passing NULL is safe. */
static void mcast_work_obj_delete(IN osm_mcast_work_obj_t * p_wobj)
{
	free(p_wobj);
}
87
88int osm_mcast_make_port_list_and_map(cl_qlist_t * list, cl_qmap_t * map,
89				     osm_mgrp_box_t * mbox)
90{
91	cl_map_item_t *map_item;
92	cl_list_item_t *list_item;
93	osm_mgrp_t *mgrp;
94	osm_mcm_port_t *mcm_port;
95	osm_mcast_work_obj_t *wobj;
96
97	cl_qmap_init(map);
98	cl_qlist_init(list);
99
100	for (list_item = cl_qlist_head(&mbox->mgrp_list);
101	     list_item != cl_qlist_end(&mbox->mgrp_list);
102	     list_item = cl_qlist_next(list_item)) {
103		mgrp = cl_item_obj(list_item, mgrp, list_item);
104		for (map_item = cl_qmap_head(&mgrp->mcm_port_tbl);
105		     map_item != cl_qmap_end(&mgrp->mcm_port_tbl);
106		     map_item = cl_qmap_next(map_item)) {
107			/* Acquire the port object for this port guid, then
108			   create the new worker object to build the list. */
109			mcm_port = cl_item_obj(map_item, mcm_port, map_item);
110			if (cl_qmap_get(map, mcm_port->port->guid) !=
111			    cl_qmap_end(map))
112				continue;
113			wobj = mcast_work_obj_new(mcm_port->port);
114			if (!wobj)
115				return -1;
116			cl_qlist_insert_tail(list, &wobj->list_item);
117			cl_qmap_insert(map, mcm_port->port->guid,
118				       &wobj->map_item);
119		}
120	}
121	return 0;
122}
123
124void osm_mcast_drop_port_list(cl_qlist_t * list)
125{
126	while (cl_qlist_count(list))
127		mcast_work_obj_delete((osm_mcast_work_obj_t *)
128				      cl_qlist_remove_head(list));
129}
130
131void osm_purge_mtree(osm_sm_t * sm, IN osm_mgrp_box_t * mbox)
132{
133	OSM_LOG_ENTER(sm->p_log);
134
135	if (mbox->root)
136		osm_mtree_destroy(mbox->root);
137	mbox->root = NULL;
138
139	OSM_LOG_EXIT(sm->p_log);
140}
141
142static void create_mgrp_switch_map(cl_qmap_t * m, cl_qlist_t * port_list)
143{
144	osm_mcast_work_obj_t *wobj;
145	osm_port_t *port;
146	osm_switch_t *sw;
147	ib_net64_t guid;
148	cl_list_item_t *i;
149
150	cl_qmap_init(m);
151	for (i = cl_qlist_head(port_list); i != cl_qlist_end(port_list);
152	     i = cl_qlist_next(i)) {
153		wobj = cl_item_obj(i, wobj, list_item);
154		port = wobj->p_port;
155		if (port->p_node->sw) {
156			sw = port->p_node->sw;
157			sw->is_mc_member = 1;
158		} else if (port->p_physp->p_remote_physp) {
159			sw = port->p_physp->p_remote_physp->p_node->sw;
160			sw->num_of_mcm++;
161		} else
162			continue;
163		guid = osm_node_get_node_guid(sw->p_node);
164		if (cl_qmap_get(m, guid) == cl_qmap_end(m))
165			cl_qmap_insert(m, guid, &sw->mgrp_item);
166	}
167}
168
169static void destroy_mgrp_switch_map(cl_qmap_t * m)
170{
171	osm_switch_t *sw;
172	cl_map_item_t *i;
173
174	for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) {
175		sw = cl_item_obj(i, sw, mgrp_item);
176		sw->num_of_mcm = 0;
177		sw->is_mc_member = 0;
178	}
179	cl_qmap_remove_all(m);
180}
181
182/**********************************************************************
183 Calculate the maximal "min hops" from the given switch to any
184 of the group HCAs
185 **********************************************************************/
186#ifdef OSM_VENDOR_INTF_ANAFA
187static float mcast_mgr_compute_avg_hops(osm_sm_t * sm, cl_qmap_t * m,
188					const osm_switch_t * this_sw)
189{
190	float avg_hops = 0;
191	uint32_t hops = 0;
192	uint32_t num_ports = 0;
193	uint16_t lid;
194	uint32_t least_hops;
195	cl_map_item_t *i;
196	osm_switch_t *sw;
197
198	OSM_LOG_ENTER(sm->p_log);
199
200	for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) {
201		sw = cl_item_obj(i, sw, mcast_item);
202		lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
203		least_hops = osm_switch_get_least_hops(this_sw, lid);
204		/* for all host that are MC members and attached to the switch,
205		   we should add the (least_hops + 1) * number_of_such_hosts.
206		   If switch itself is in the MC, we should add the least_hops only */
207		hops += (least_hops + 1) * sw->num_of_mcm +
208		    least_hops * sw->is_mc_member;
209		num_ports += sw->num_of_mcm + sw->is_mc_member;
210	}
211
212	/* We shouldn't be here if there aren't any ports in the group. */
213	CL_ASSERT(num_ports);
214
215	avg_hops = (float)(hops / num_ports);
216
217	OSM_LOG_EXIT(sm->p_log);
218	return avg_hops;
219}
220#else
221static float mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qmap_t * m,
222					const osm_switch_t * this_sw)
223{
224	uint32_t max_hops = 0, hops;
225	uint16_t lid;
226	cl_map_item_t *i;
227	osm_switch_t *sw;
228
229	OSM_LOG_ENTER(sm->p_log);
230
231	/*
232	   For each member of the multicast group, compute the
233	   number of hops to its base LID.
234	 */
235	for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) {
236		sw = cl_item_obj(i, sw, mgrp_item);
237		lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
238		hops = osm_switch_get_least_hops(this_sw, lid);
239		if (!sw->is_mc_member)
240			hops += 1;
241		if (hops > max_hops)
242			max_hops = hops;
243	}
244
245	/* Note that at this point we might get (max_hops == 0),
246	   which means that there's only one member in the mcast
247	   group, and it's the current switch */
248
249	OSM_LOG_EXIT(sm->p_log);
250	return (float)max_hops;
251}
252#endif
253
254/**********************************************************************
255   This function attempts to locate the optimal switch for the
256   center of the spanning tree.  The current algorithm chooses
257   a switch with the lowest average hop count to the members
258   of the multicast group.
259**********************************************************************/
260static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm,
261						   cl_qlist_t * list)
262{
263	cl_qmap_t mgrp_sw_map;
264	cl_qmap_t *p_sw_tbl;
265	osm_switch_t *p_sw, *p_best_sw = NULL;
266	float hops = 0;
267	float best_hops = 10000;	/* any big # will do */
268
269	OSM_LOG_ENTER(sm->p_log);
270
271	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
272
273	create_mgrp_switch_map(&mgrp_sw_map, list);
274	for (p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
275	     p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl);
276	     p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) {
277		if (!osm_switch_supports_mcast(p_sw))
278			continue;
279
280#ifdef OSM_VENDOR_INTF_ANAFA
281		hops = mcast_mgr_compute_avg_hops(sm, &mgrp_sw_map, p_sw);
282#else
283		hops = mcast_mgr_compute_max_hops(sm, &mgrp_sw_map, p_sw);
284#endif
285
286		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
287			"Switch 0x%016" PRIx64 ", hops = %f\n",
288			cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)), hops);
289
290		if (hops < best_hops) {
291			p_best_sw = p_sw;
292			best_hops = hops;
293		}
294	}
295
296	if (p_best_sw)
297		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
298			"Best switch is 0x%" PRIx64 " (%s), hops = %f\n",
299			cl_ntoh64(osm_node_get_node_guid(p_best_sw->p_node)),
300			p_best_sw->p_node->print_desc, best_hops);
301	else
302		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
303			"No multicast capable switches detected\n");
304
305	destroy_mgrp_switch_map(&mgrp_sw_map);
306	OSM_LOG_EXIT(sm->p_log);
307	return p_best_sw;
308}
309
310/**********************************************************************
311   This function returns the existing or optimal root switch for the tree.
312**********************************************************************/
313osm_switch_t *osm_mcast_mgr_find_root_switch(osm_sm_t * sm, cl_qlist_t *list)
314{
315	osm_switch_t *p_sw = NULL;
316
317	OSM_LOG_ENTER(sm->p_log);
318
319	/*
320	   We always look for the best multicast tree root switch.
321	   Otherwise since we always start with a a single join
322	   the root will be always on the first switch attached to it.
323	   - Very bad ...
324	 */
325	p_sw = mcast_mgr_find_optimal_switch(sm, list);
326
327	OSM_LOG_EXIT(sm->p_log);
328	return p_sw;
329}
330
/*
 * Push one block of a switch's Multicast Forwarding Table to the device.
 *
 * Reads block (block_num, position) from the switch's in-memory mcast
 * table; if osm_mcast_tbl_get_block() reports the block needs to be
 * written, sends a SubnSet(MFT) MAD along the switch's directed route.
 *
 * Returns 0 when nothing needed sending or the MAD was queued
 * successfully, -1 when osm_req_set() failed.
 */
static int mcast_mgr_set_mft_block(osm_sm_t * sm, IN osm_switch_t * p_sw,
				   uint32_t block_num, uint32_t position)
{
	osm_node_t *p_node;
	osm_physp_t *p_physp;
	osm_dr_path_t *p_path;
	osm_madw_context_t context;
	ib_api_status_t status;
	uint32_t block_id_ho;
	osm_mcast_tbl_t *p_tbl;
	ib_net16_t block[IB_MCAST_BLOCK_SIZE];
	int ret = 0;

	CL_ASSERT(sm);

	OSM_LOG_ENTER(sm->p_log);

	CL_ASSERT(p_sw);

	p_node = p_sw->p_node;

	CL_ASSERT(p_node);

	/* The directed route to the switch goes via its management port 0. */
	p_physp = osm_node_get_physp_ptr(p_node, 0);
	p_path = osm_physp_get_dr_path_ptr(p_physp);

	/*
	   Send multicast forwarding table blocks to the switch
	   as long as the switch indicates it has blocks needing
	   configuration.
	 */

	context.mft_context.node_guid = osm_node_get_node_guid(p_node);
	context.mft_context.set_method = TRUE;

	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);

	if (osm_mcast_tbl_get_block(p_tbl, (uint16_t) block_num,
				    (uint8_t) position, block)) {
		/* MFT attribute modifier: block number in the low bits,
		   port-mask position in bits 28-31. */
		block_id_ho = block_num + (position << 28);

		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Writing MFT block %u position %u to switch 0x%" PRIx64
			"\n", block_num, position,
			cl_ntoh64(context.mft_context.node_guid));

		status = osm_req_set(sm, p_path, (void *)block, sizeof(block),
				     IB_MAD_ATTR_MCAST_FWD_TBL,
				     cl_hton32(block_id_ho), FALSE,
				     ib_port_info_get_m_key(&p_physp->port_info),
				     CL_DISP_MSGID_NONE, &context);
		if (status != IB_SUCCESS) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A02: "
				"Sending multicast fwd. tbl. block 0x%X to %s "
				"failed (%s)\n", block_id_ho,
				p_node->print_desc, ib_get_err_str(status));
			ret = -1;
		}
	}

	OSM_LOG_EXIT(sm->p_log);
	return ret;
}
394
395/**********************************************************************
396  This is part of the recursive function to compute the paths in the
397  spanning tree that emanate from this switch.  On input, the p_list
398  contains the group members that must be routed from this switch.
399**********************************************************************/
/*
 * Split p_list into per-egress-port sublists in list_array (one slot per
 * switch port, array_size entries).  Work objects whose recommended path
 * cannot be used are logged and freed; all others are moved into
 * list_array[port], leaving p_list empty on return.
 */
static void mcast_mgr_subdivide(osm_sm_t * sm, uint16_t mlid_ho,
				osm_switch_t * p_sw, cl_qlist_t * p_list,
				cl_qlist_t * list_array, uint8_t array_size)
{
	uint8_t port_num;
	boolean_t ignore_existing;
	osm_mcast_work_obj_t *p_wobj;

	OSM_LOG_ENTER(sm->p_log);

	/*
	   For Multicast Groups, we don't want to count on previous
	   configurations - since we can easily generate a storm
	   by loops.
	 */
	ignore_existing = TRUE;

	/*
	   Subdivide the set of ports into non-overlapping subsets
	   that will be routed to other switches.
	 */
	while ((p_wobj =
		(osm_mcast_work_obj_t *) cl_qlist_remove_head(p_list)) !=
	       (osm_mcast_work_obj_t *) cl_qlist_end(p_list)) {
		port_num =
		    osm_switch_recommend_mcast_path(p_sw, p_wobj->p_port,
						    mlid_ho, ignore_existing);
		if (port_num == OSM_NO_PATH) {
			/*
			   This typically occurs if the switch does not support
			   multicast and the multicast tree must branch at this
			   switch.
			 */
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A03: "
				"Error routing MLID 0x%X through switch 0x%"
				PRIx64 " %s\n"
				"\t\t\t\tNo multicast paths from this switch "
				"for port with LID %u\n", mlid_ho,
				cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)),
				p_sw->p_node->print_desc,
				cl_ntoh16(osm_port_get_base_lid
					  (p_wobj->p_port)));
			/* Work object cannot be routed; drop it here. */
			mcast_work_obj_delete(p_wobj);
			continue;
		}

		/* Defensive: a recommended port beyond the caller's array
		   would overrun list_array. */
		if (port_num >= array_size) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A04: "
				"Error routing MLID 0x%X through switch 0x%"
				PRIx64 " %s\n"
				"\t\t\t\tNo multicast paths from this switch "
				"to port with LID %u\n", mlid_ho,
				cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)),
				p_sw->p_node->print_desc,
				cl_ntoh16(osm_port_get_base_lid
					  (p_wobj->p_port)));
			mcast_work_obj_delete(p_wobj);
			continue;
		}

		cl_qlist_insert_tail(&list_array[port_num], &p_wobj->list_item);
	}

	OSM_LOG_EXIT(sm->p_log);
}
465
466static void mcast_mgr_purge_list(osm_sm_t * sm, uint16_t mlid, cl_qlist_t * list)
467{
468	if (OSM_LOG_IS_ACTIVE_V2(sm->p_log, OSM_LOG_ERROR)) {
469		osm_mcast_work_obj_t *wobj;
470		cl_list_item_t *i;
471		for (i = cl_qlist_head(list); i != cl_qlist_end(list);
472		     i = cl_qlist_next(i)) {
473			wobj = cl_item_obj(i, wobj, list_item);
474			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A06: "
475				"Unable to route MLID 0x%X for port 0x%" PRIx64 "\n",
476				mlid, cl_ntoh64(osm_port_get_guid(wobj->p_port)));
477		}
478	}
479	osm_mcast_drop_port_list(list);
480}
481
482/**********************************************************************
483  This is the recursive function to compute the paths in the spanning
484  tree that emanate from this switch.  On input, the p_list contains
485  the group members that must be routed from this switch.
486
487  The function returns the newly created mtree node element.
488**********************************************************************/
static osm_mtree_node_t *mcast_mgr_branch(osm_sm_t * sm, uint16_t mlid_ho,
					  osm_switch_t * p_sw,
					  cl_qlist_t * p_list, uint8_t depth,
					  uint8_t upstream_port,
					  uint8_t * p_max_depth)
{
	uint8_t max_children;
	osm_mtree_node_t *p_mtn = NULL;
	cl_qlist_t *list_array = NULL;
	uint8_t i;
	ib_net64_t node_guid;
	osm_mcast_work_obj_t *p_wobj;
	cl_qlist_t *p_port_list;
	size_t count;
	osm_mcast_tbl_t *p_tbl;

	OSM_LOG_ENTER(sm->p_log);

	CL_ASSERT(p_sw);
	CL_ASSERT(p_list);
	CL_ASSERT(p_max_depth);

	node_guid = osm_node_get_node_guid(p_sw->p_node);

	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
		"Routing MLID 0x%X through switch 0x%" PRIx64
		" %s, %u nodes at depth %u\n",
		mlid_ho, cl_ntoh64(node_guid), p_sw->p_node->print_desc,
		cl_qlist_count(p_list), depth);

	CL_ASSERT(cl_qlist_count(p_list) > 0);

	depth++;

	/* Subnet paths are spec-limited to 64 hops (see the comment in
	   mcast_mgr_build_spanning_tree); deeper recursion indicates a
	   routing loop, so abort this branch. */
	if (depth >= 64) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A21: "
			"Maximal hops number is reached for MLID 0x%x."
			" Break processing\n", mlid_ho);
		mcast_mgr_purge_list(sm, mlid_ho, p_list);
		goto Exit;
	}

	/* Track the deepest level reached, for the caller's summary log. */
	if (depth > *p_max_depth) {
		CL_ASSERT(depth == *p_max_depth + 1);
		*p_max_depth = depth;
	}

	if (osm_switch_supports_mcast(p_sw) == FALSE) {
		/*
		   This switch doesn't do multicast.  Clean-up.
		 */
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A14: "
			"Switch 0x%" PRIx64 " %s does not support multicast\n",
			cl_ntoh64(node_guid), p_sw->p_node->print_desc);

		/*
		   Deallocate all the work objects on this branch of the tree.
		 */
		mcast_mgr_purge_list(sm, mlid_ho, p_list);
		goto Exit;
	}

	p_mtn = osm_mtree_node_new(p_sw);
	if (p_mtn == NULL) {
		/*
		   We are unable to continue routing down this
		   leg of the tree.  Clean-up.
		 */
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A15: "
			"Insufficient memory to build multicast tree\n");

		/*
		   Deallocate all the work objects on this branch of the tree.
		 */
		mcast_mgr_purge_list(sm, mlid_ho, p_list);
		goto Exit;
	}

	max_children = osm_mtree_node_get_max_children(p_mtn);

	CL_ASSERT(max_children > 1);

	/*
	   Prepare an empty list for each port in the switch.
	   TO DO - this list array could probably be moved
	   inside the switch element to save on malloc thrashing.
	 */
	list_array = malloc(sizeof(cl_qlist_t) * max_children);
	if (list_array == NULL) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A16: "
			"Unable to allocate list array\n");
		mcast_mgr_purge_list(sm, mlid_ho, p_list);
		osm_mtree_destroy(p_mtn);
		p_mtn = NULL;
		goto Exit;
	}

	memset(list_array, 0, sizeof(cl_qlist_t) * max_children);

	for (i = 0; i < max_children; i++)
		cl_qlist_init(&list_array[i]);

	/* Split p_list into per-egress-port sublists; p_list is empty
	   after this call. */
	mcast_mgr_subdivide(sm, mlid_ho, p_sw, p_list, list_array, max_children);

	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);

	/*
	   Add the upstream port to the forwarding table unless
	   we're at the root of the spanning tree.
	 */
	if (depth > 1) {
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Adding upstream port %u\n", upstream_port);

		CL_ASSERT(upstream_port);
		osm_mcast_tbl_set(p_tbl, mlid_ho, upstream_port);
	}

	/*
	   For each port that was allocated some routes,
	   recurse into this function to continue building the tree
	   if the node on the other end of that port is another switch.
	   Otherwise, the node is an endpoint, and we've found a leaf
	   of the tree.  Mark leaves with our special pointer value.
	 */

	for (i = 0; i < max_children; i++) {
		const osm_physp_t *p_physp;
		const osm_physp_t *p_remote_physp;
		osm_node_t *p_node;
		const osm_node_t *p_remote_node;

		p_port_list = &list_array[i];

		count = cl_qlist_count(p_port_list);

		/*
		   There should be no children routed through the upstream port!
		 */
		CL_ASSERT(upstream_port == 0 || i != upstream_port ||
			  (i == upstream_port && count == 0));

		if (count == 0)
			continue;	/* No routes down this port. */

		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Routing %zu destinations via switch port %u\n",
			count, i);

		if (i == 0) {
			/* This means we are adding the switch to the MC group.
			   We do not need to continue looking at the remote
			   port, just needed to add the port to the table */
			CL_ASSERT(count == 1);

			osm_mcast_tbl_set(p_tbl, mlid_ho, i);

			p_wobj = (osm_mcast_work_obj_t *)
			    cl_qlist_remove_head(p_port_list);
			mcast_work_obj_delete(p_wobj);
			continue;
		}

		p_node = p_sw->p_node;
		p_remote_node = osm_node_get_remote_node(p_node, i, NULL);
		if (!p_remote_node) {
			/*
			 * If we reached here, it means the minhop table has
			 * invalid entries that leads to disconnected ports.
			 *
			 * A possible reason for the code to reach here is
			 * that ucast cache is enabled, and a leaf switch that
			 * is used as a non-leaf switch in a multicast has been
			 * removed from the fabric.
			 *
			 * When it happens, we should invalidate the cache
			 * and force rerouting of the fabric.
			 */

			OSM_LOG(sm->p_log, OSM_LOG_ERROR,
				"ERR 0A1E: Tried to route MLID 0x%X through "
				"disconnected switch 0x%" PRIx64 " port %d\n",
				mlid_ho, cl_ntoh64(node_guid), i);

			/* Free memory */
			mcast_mgr_purge_list(sm, mlid_ho, p_port_list);

			/* Invalidate ucast cache */
			if (sm->ucast_mgr.p_subn->opt.use_ucast_cache &&
			    sm->ucast_mgr.cache_valid) {
				OSM_LOG(sm->p_log, OSM_LOG_INFO,
					"Unicast Cache will be invalidated due "
					"to multicast routing errors\n");
				osm_ucast_cache_invalidate(&sm->ucast_mgr);
				sm->p_subn->force_heavy_sweep = TRUE;
			}

			continue;
		}

		/*
		   This port routes frames for this mcast group.  Therefore,
		   set the appropriate bit in the multicast forwarding
		   table for this switch.
		 */
		osm_mcast_tbl_set(p_tbl, mlid_ho, i);

		if (osm_node_get_type(p_remote_node) == IB_NODE_TYPE_SWITCH) {
			/*
			   Acquire a pointer to the remote switch then recurse.
			 */
			CL_ASSERT(p_remote_node->sw);

			p_physp = osm_node_get_physp_ptr(p_node, i);
			CL_ASSERT(p_physp);

			p_remote_physp = osm_physp_get_remote(p_physp);
			CL_ASSERT(p_remote_physp);

			p_mtn->child_array[i] =
			    mcast_mgr_branch(sm, mlid_ho, p_remote_node->sw,
					     p_port_list, depth,
					     osm_physp_get_port_num
					     (p_remote_physp), p_max_depth);
		} else {
			/*
			   The neighbor node is not a switch, so this
			   must be a leaf.
			 */
			CL_ASSERT(count == 1);

			p_mtn->child_array[i] = OSM_MTREE_LEAF;
			p_wobj = (osm_mcast_work_obj_t *)
			    cl_qlist_remove_head(p_port_list);

			CL_ASSERT(cl_is_qlist_empty(p_port_list));

			OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
				"Found leaf for port 0x%016" PRIx64
				" on switch port %u\n",
				cl_ntoh64(osm_port_get_guid(p_wobj->p_port)),
				i);
			mcast_work_obj_delete(p_wobj);
		}
	}

	free(list_array);
Exit:
	OSM_LOG_EXIT(sm->p_log);
	return p_mtn;
}
740
741static ib_api_status_t mcast_mgr_build_spanning_tree(osm_sm_t * sm,
742						     osm_mgrp_box_t * mbox)
743{
744	cl_qlist_t port_list;
745	cl_qmap_t port_map;
746	uint32_t num_ports;
747	osm_switch_t *p_sw;
748	ib_api_status_t status = IB_SUCCESS;
749	uint8_t max_depth = 0;
750
751	OSM_LOG_ENTER(sm->p_log);
752
753	/*
754	   TO DO - for now, just blow away the old tree.
755	   In the future we'll need to construct the tree based
756	   on multicast forwarding table information if the user wants to
757	   preserve existing multicast routes.
758	 */
759	osm_purge_mtree(sm, mbox);
760
761	/* build the first "subset" containing all member ports */
762	if (osm_mcast_make_port_list_and_map(&port_list, &port_map, mbox)) {
763		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A10: "
764			"Insufficient memory to make port list\n");
765		status = IB_ERROR;
766		goto Exit;
767	}
768
769	num_ports = cl_qlist_count(&port_list);
770	if (num_ports < 2) {
771		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
772			"MLID 0x%X has %u members - nothing to do\n",
773			mbox->mlid, num_ports);
774		osm_mcast_drop_port_list(&port_list);
775		goto Exit;
776	}
777
778	/*
779	   This function builds the single spanning tree recursively.
780	   At each stage, the ports to be reached are divided into
781	   non-overlapping subsets of member ports that can be reached through
782	   a given switch port.  Construction then moves down each
783	   branch, and the process starts again with each branch computing
784	   for its own subset of the member ports.
785
786	   The maximum recursion depth is at worst the maximum hop count in the
787	   subnet, which is spec limited to 64.
788	 */
789
790	/*
791	   Locate the switch around which to create the spanning
792	   tree for this multicast group.
793	 */
794	p_sw = osm_mcast_mgr_find_root_switch(sm, &port_list);
795	if (p_sw == NULL) {
796		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A08: "
797			"Unable to locate a suitable switch for group 0x%X\n",
798			mbox->mlid);
799		osm_mcast_drop_port_list(&port_list);
800		status = IB_ERROR;
801		goto Exit;
802	}
803
804	mbox->root = mcast_mgr_branch(sm, mbox->mlid, p_sw, &port_list, 0, 0,
805				      &max_depth);
806
807	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
808		"Configured MLID 0x%X for %u ports, max tree depth = %u\n",
809		mbox->mlid, num_ports, max_depth);
810Exit:
811	OSM_LOG_EXIT(sm->p_log);
812	return status;
813}
814
815#if 0
816/* unused */
817void osm_mcast_mgr_set_table(osm_sm_t * sm, IN const osm_mgrp_t * p_mgrp,
818			     IN const osm_mtree_node_t * p_mtn)
819{
820	uint8_t i;
821	uint8_t max_children;
822	osm_mtree_node_t *p_child_mtn;
823	uint16_t mlid_ho;
824	osm_mcast_tbl_t *p_tbl;
825	osm_switch_t *p_sw;
826
827	OSM_LOG_ENTER(sm->p_log);
828
829	mlid_ho = cl_ntoh16(osm_mgrp_get_mlid(p_mgrp));
830	p_sw = osm_mtree_node_get_switch_ptr(p_mtn);
831
832	CL_ASSERT(p_sw);
833
834	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
835		"Configuring MLID 0x%X on switch 0x%" PRIx64 "\n",
836		mlid_ho, osm_node_get_node_guid(p_sw->p_node));
837
838	/*
839	   For every child of this tree node, set the corresponding
840	   bit in the switch's mcast table.
841	 */
842	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
843	max_children = osm_mtree_node_get_max_children(p_mtn);
844
845	CL_ASSERT(max_children <= osm_switch_get_num_ports(p_sw));
846
847	osm_mcast_tbl_clear_mlid(p_tbl, mlid_ho);
848
849	for (i = 0; i < max_children; i++) {
850		p_child_mtn = osm_mtree_node_get_child(p_mtn, i);
851		if (p_child_mtn == NULL)
852			continue;
853
854		osm_mcast_tbl_set(p_tbl, mlid_ho, i);
855	}
856
857	OSM_LOG_EXIT(sm->p_log);
858}
859#endif
860
861static void mcast_mgr_clear(osm_sm_t * sm, uint16_t mlid)
862{
863	osm_switch_t *p_sw;
864	cl_qmap_t *p_sw_tbl;
865	osm_mcast_tbl_t *p_mcast_tbl;
866
867	OSM_LOG_ENTER(sm->p_log);
868
869	/* Walk the switches and clear the routing entries for this MLID. */
870	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
871	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
872	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
873		p_mcast_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
874		osm_mcast_tbl_clear_mlid(p_mcast_tbl, mlid);
875		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
876	}
877
878	OSM_LOG_EXIT(sm->p_log);
879}
880
881#if 0
882/* TO DO - make this real -- at least update spanning tree */
883/**********************************************************************
884   Lock must be held on entry.
885**********************************************************************/
886ib_api_status_t osm_mcast_mgr_process_single(osm_sm_t * sm,
887					     IN ib_net16_t const mlid,
888					     IN ib_net64_t const port_guid,
889					     IN uint8_t const join_state)
890{
891	uint8_t port_num;
892	uint16_t mlid_ho;
893	ib_net64_t sw_guid;
894	osm_port_t *p_port;
895	osm_physp_t *p_physp;
896	osm_physp_t *p_remote_physp;
897	osm_node_t *p_remote_node;
898	osm_mcast_tbl_t *p_mcast_tbl;
899	ib_api_status_t status = IB_SUCCESS;
900
901	OSM_LOG_ENTER(sm->p_log);
902
903	CL_ASSERT(mlid);
904	CL_ASSERT(port_guid);
905
906	mlid_ho = cl_ntoh16(mlid);
907
908	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
909		"Attempting to add port 0x%" PRIx64 " to MLID 0x%X, "
910		"\n\t\t\t\tjoin state = 0x%X\n",
911		cl_ntoh64(port_guid), mlid_ho, join_state);
912
913	/*
914	   Acquire the Port object.
915	 */
916	p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
917	if (!p_port) {
918		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A01: "
919			"Unable to acquire port object for 0x%" PRIx64 "\n",
920			cl_ntoh64(port_guid));
921		status = IB_ERROR;
922		goto Exit;
923	}
924
925	p_physp = p_port->p_physp;
926	if (p_physp == NULL) {
927		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A05: "
928			"Unable to acquire phsyical port object for 0x%" PRIx64
929			"\n", cl_ntoh64(port_guid));
930		status = IB_ERROR;
931		goto Exit;
932	}
933
934	p_remote_physp = osm_physp_get_remote(p_physp);
935	if (p_remote_physp == NULL) {
936		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A11: "
937			"Unable to acquire remote phsyical port object "
938			"for 0x%" PRIx64 "\n", cl_ntoh64(port_guid));
939		status = IB_ERROR;
940		goto Exit;
941	}
942
943	p_remote_node = osm_physp_get_node_ptr(p_remote_physp);
944
945	CL_ASSERT(p_remote_node);
946
947	sw_guid = osm_node_get_node_guid(p_remote_node);
948
949	if (osm_node_get_type(p_remote_node) != IB_NODE_TYPE_SWITCH) {
950		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A22: "
951			"Remote node not a switch node 0x%" PRIx64 "\n",
952			cl_ntoh64(sw_guid));
953		status = IB_ERROR;
954		goto Exit;
955	}
956
957	if (!p_remote_node->sw) {
958		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A12: "
959			"No switch object 0x%" PRIx64 "\n", cl_ntoh64(sw_guid));
960		status = IB_ERROR;
961		goto Exit;
962	}
963
964	if (osm_switch_is_in_mcast_tree(p_remote_node->sw, mlid_ho)) {
965		/*
966		   We're in luck. The switch attached to this port
967		   is already in the multicast group, so we can just
968		   add the specified port as a new leaf of the tree.
969		 */
970		if (join_state & (IB_JOIN_STATE_FULL | IB_JOIN_STATE_NON)) {
971			/*
972			   This node wants to receive multicast frames.
973			   Get the switch port number to which the new member port
974			   is attached, then configure this single mcast table.
975			 */
976			port_num = osm_physp_get_port_num(p_remote_physp);
977			CL_ASSERT(port_num);
978
979			p_mcast_tbl =
980			    osm_switch_get_mcast_tbl_ptr(p_remote_node->sw);
981			osm_mcast_tbl_set(p_mcast_tbl, mlid_ho, port_num);
982		} else {
983			if (join_state & IB_JOIN_STATE_SEND_ONLY)
984				OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
985					"Success.  Nothing to do for send"
986					"only member\n");
987			else {
988				OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A13: "
989					"Unknown join state 0x%X\n",
990					join_state);
991				status = IB_ERROR;
992				goto Exit;
993			}
994		}
995	} else
996		OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Unable to add port\n");
997
998Exit:
999	OSM_LOG_EXIT(sm->p_log);
1000	return status;
1001}
1002#endif
1003
1004/**********************************************************************
1005 Process the entire group.
1006 NOTE : The lock should be held externally!
1007 **********************************************************************/
1008static ib_api_status_t mcast_mgr_process_mlid(osm_sm_t * sm, uint16_t mlid)
1009{
1010	ib_api_status_t status = IB_SUCCESS;
1011	struct osm_routing_engine *re = sm->p_subn->p_osm->routing_engine_used;
1012	osm_mgrp_box_t *mbox;
1013
1014	OSM_LOG_ENTER(sm->p_log);
1015
1016	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1017		"Processing multicast group with mlid 0x%X\n", mlid);
1018
1019	/* Clear the multicast tables to start clean, then build
1020	   the spanning tree which sets the mcast table bits for each
1021	   port in the group. */
1022	mcast_mgr_clear(sm, mlid);
1023
1024	mbox = osm_get_mbox_by_mlid(sm->p_subn, cl_hton16(mlid));
1025	if (mbox) {
1026		if (re && re->mcast_build_stree)
1027			status = re->mcast_build_stree(re->context, mbox);
1028		else
1029			status = mcast_mgr_build_spanning_tree(sm, mbox);
1030
1031		if (status != IB_SUCCESS)
1032			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A17: "
1033				"Unable to create spanning tree (%s) for mlid "
1034				"0x%x\n", ib_get_err_str(status), mlid);
1035	}
1036
1037	OSM_LOG_EXIT(sm->p_log);
1038	return status;
1039}
1040
1041static void mcast_mgr_set_mfttop(IN osm_sm_t * sm, IN osm_switch_t * p_sw)
1042{
1043	osm_node_t *p_node;
1044	osm_dr_path_t *p_path;
1045	osm_physp_t *p_physp;
1046	osm_mcast_tbl_t *p_tbl;
1047	osm_madw_context_t context;
1048	ib_api_status_t status;
1049	ib_switch_info_t si;
1050	ib_net16_t mcast_top;
1051
1052	OSM_LOG_ENTER(sm->p_log);
1053
1054	CL_ASSERT(p_sw);
1055
1056	p_node = p_sw->p_node;
1057
1058	CL_ASSERT(p_node);
1059
1060	p_physp = osm_node_get_physp_ptr(p_node, 0);
1061	p_path = osm_physp_get_dr_path_ptr(p_physp);
1062	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
1063
1064	if (sm->p_subn->opt.use_mfttop &&
1065	    p_physp->port_info.capability_mask & IB_PORT_CAP_HAS_MCAST_FDB_TOP) {
1066		/*
1067		   Set the top of the multicast forwarding table.
1068		 */
1069		si = p_sw->switch_info;
1070		if (sm->p_subn->first_time_master_sweep == TRUE)
1071			mcast_top = cl_hton16(sm->mlids_init_max);
1072		else {
1073			if (p_tbl->max_block_in_use == -1)
1074				mcast_top = cl_hton16(IB_LID_MCAST_START_HO - 1);
1075			else
1076				mcast_top = cl_hton16(IB_LID_MCAST_START_HO +
1077						      (p_tbl->max_block_in_use + 1) * IB_MCAST_BLOCK_SIZE - 1);
1078		}
1079		if (mcast_top == si.mcast_top)
1080			return;
1081
1082		si.mcast_top = mcast_top;
1083
1084		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1085			"Setting switch MFT top to MLID 0x%x\n",
1086			cl_ntoh16(si.mcast_top));
1087
1088		context.si_context.light_sweep = FALSE;
1089		context.si_context.node_guid = osm_node_get_node_guid(p_node);
1090		context.si_context.set_method = TRUE;
1091		context.si_context.lft_top_change = FALSE;
1092
1093		status = osm_req_set(sm, p_path, (uint8_t *) & si,
1094				     sizeof(si), IB_MAD_ATTR_SWITCH_INFO,
1095				     0, FALSE,
1096				     ib_port_info_get_m_key(&p_physp->port_info),
1097				     CL_DISP_MSGID_NONE, &context);
1098
1099		if (status != IB_SUCCESS)
1100			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A1B: "
1101				"Sending SwitchInfo attribute failed (%s)\n",
1102				ib_get_err_str(status));
1103	}
1104}
1105
1106static int mcast_mgr_set_mftables(osm_sm_t * sm)
1107{
1108	cl_qmap_t *p_sw_tbl = &sm->p_subn->sw_guid_tbl;
1109	osm_switch_t *p_sw;
1110	osm_mcast_tbl_t *p_tbl;
1111	int block_notdone, ret = 0;
1112	int16_t block_num, max_block = -1;
1113
1114	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
1115	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
1116		p_sw->mft_block_num = 0;
1117		p_sw->mft_position = 0;
1118		p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
1119		if (osm_mcast_tbl_get_max_block_in_use(p_tbl) > max_block)
1120			max_block = osm_mcast_tbl_get_max_block_in_use(p_tbl);
1121		mcast_mgr_set_mfttop(sm, p_sw);
1122		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
1123	}
1124
1125	/* Stripe the MFT blocks across the switches */
1126	for (block_num = 0; block_num <= max_block; block_num++) {
1127		block_notdone = 1;
1128		while (block_notdone) {
1129			block_notdone = 0;
1130			p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
1131			while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
1132				if (p_sw->mft_block_num == block_num) {
1133					block_notdone = 1;
1134					if (mcast_mgr_set_mft_block(sm, p_sw,
1135								    p_sw->mft_block_num,
1136								    p_sw->mft_position))
1137						ret = -1;
1138					p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
1139					if (++p_sw->mft_position > p_tbl->max_position) {
1140						p_sw->mft_position = 0;
1141						p_sw->mft_block_num++;
1142					}
1143				}
1144				p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
1145			}
1146		}
1147	}
1148
1149	return ret;
1150}
1151
1152static int alloc_mfts(osm_sm_t * sm)
1153{
1154	int i;
1155	cl_map_item_t *item;
1156	osm_switch_t *p_sw;
1157
1158	for (i = sm->p_subn->max_mcast_lid_ho - IB_LID_MCAST_START_HO; i >= 0;
1159	     i--)
1160		if (sm->p_subn->mboxes[i])
1161			break;
1162	if (i < 0)
1163		return 0;
1164
1165	/* Now, walk switches and (re)allocate multicast tables */
1166	for (item = cl_qmap_head(&sm->p_subn->sw_guid_tbl);
1167	     item != cl_qmap_end(&sm->p_subn->sw_guid_tbl);
1168	     item = cl_qmap_next(item)) {
1169		p_sw = (osm_switch_t *) item;
1170		if (osm_mcast_tbl_realloc(&p_sw->mcast_tbl, i))
1171			return -1;
1172	}
1173	return 0;
1174}
1175
1176/**********************************************************************
1177  This is the function that is invoked during idle time and sweep to
1178  handle the process request for mcast groups where join/leave/delete
1179  was required.
1180 **********************************************************************/
1181int osm_mcast_mgr_process(osm_sm_t * sm, boolean_t config_all)
1182{
1183	int ret = 0;
1184	unsigned i;
1185	unsigned max_mlid;
1186
1187	OSM_LOG_ENTER(sm->p_log);
1188
1189	CL_PLOCK_EXCL_ACQUIRE(sm->p_lock);
1190
1191	/* If there are no switches in the subnet we have nothing to do. */
1192	if (cl_qmap_count(&sm->p_subn->sw_guid_tbl) == 0) {
1193		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1194			"No switches in subnet. Nothing to do\n");
1195		goto exit;
1196	}
1197
1198	if (alloc_mfts(sm)) {
1199		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
1200			"ERR 0A09: alloc_mfts failed\n");
1201		ret = -1;
1202		goto exit;
1203	}
1204
1205	max_mlid = config_all ? sm->p_subn->max_mcast_lid_ho
1206			- IB_LID_MCAST_START_HO : sm->mlids_req_max;
1207	for (i = 0; i <= max_mlid; i++) {
1208		if (sm->mlids_req[i] ||
1209		    (config_all && sm->p_subn->mboxes[i])) {
1210			sm->mlids_req[i] = 0;
1211			mcast_mgr_process_mlid(sm, i + IB_LID_MCAST_START_HO);
1212		}
1213	}
1214
1215	sm->mlids_req_max = 0;
1216
1217	ret = mcast_mgr_set_mftables(sm);
1218
1219	osm_dump_mcast_routes(sm->p_subn->p_osm);
1220
1221exit:
1222	CL_PLOCK_RELEASE(sm->p_lock);
1223	OSM_LOG_EXIT(sm->p_log);
1224	return ret;
1225}
1226