1/*
2 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5 * Copyright (c) 2009 HNR Consulting. All rights reserved.
6 *
7 * This software is available to you under a choice of one of two
8 * licenses.  You may choose to be licensed under the terms of the GNU
9 * General Public License (GPL) Version 2, available from the file
10 * COPYING in the main directory of this source tree, or the
11 * OpenIB.org BSD license below:
12 *
13 *     Redistribution and use in source and binary forms, with or
14 *     without modification, are permitted provided that the following
15 *     conditions are met:
16 *
17 *      - Redistributions of source code must retain the above
18 *        copyright notice, this list of conditions and the following
19 *        disclaimer.
20 *
21 *      - Redistributions in binary form must reproduce the above
22 *        copyright notice, this list of conditions and the following
23 *        disclaimer in the documentation and/or other materials
24 *        provided with the distribution.
25 *
26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
30 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 * SOFTWARE.
34 *
35 */
36
37/*
38 * Abstract:
39 *    Implementation of osm_switch_t.
40 * This object represents an Infiniband switch.
41 * This object is part of the opensm family of objects.
42 */
43
44#if HAVE_CONFIG_H
45#  include <config.h>
46#endif				/* HAVE_CONFIG_H */
47
48#include <stdlib.h>
49#include <string.h>
50#include <complib/cl_math.h>
51#include <iba/ib_types.h>
52#include <opensm/osm_file_ids.h>
53#define FILE_ID OSM_FILE_SWITCH_C
54#include <opensm/osm_switch.h>
55
/*
 * One candidate egress port recorded by osm_switch_recommend_path() while it
 * scans the least-hop ports of a switch.  The records are revisited by the
 * port-shifting pass, which re-runs the selection in a rotated order.
 */
struct switch_port_path {
	/* switch port number of the candidate */
	uint8_t port_num;

	/* path count read from the port profile when the port was scanned */
	uint32_t path_count;

	/* LMC tracking only: non-zero if the remote system GUID was found */
	int found_sys_guid;

	/* LMC tracking only: non-zero if the remote node GUID was found */
	int found_node_guid;

	/* forwarded_to counter of the matched remote entry, 0 if none */
	uint32_t forwarded_to;
};
63
64cl_status_t osm_switch_set_hops(IN osm_switch_t * p_sw, IN uint16_t lid_ho,
65				IN uint8_t port_num, IN uint8_t num_hops)
66{
67	if (!lid_ho || lid_ho > p_sw->max_lid_ho)
68		return -1;
69	if (port_num >= p_sw->num_ports)
70		return -1;
71	if (!p_sw->hops[lid_ho]) {
72		p_sw->hops[lid_ho] = malloc(p_sw->num_ports);
73		if (!p_sw->hops[lid_ho])
74			return -1;
75		memset(p_sw->hops[lid_ho], OSM_NO_PATH, p_sw->num_ports);
76	}
77
78	p_sw->hops[lid_ho][port_num] = num_hops;
79	if (p_sw->hops[lid_ho][0] > num_hops)
80		p_sw->hops[lid_ho][0] = num_hops;
81
82	return 0;
83}
84
85void osm_switch_delete(IN OUT osm_switch_t ** pp_sw)
86{
87	osm_switch_t *p_sw = *pp_sw;
88	unsigned i;
89
90	osm_mcast_tbl_destroy(&p_sw->mcast_tbl);
91	if (p_sw->p_prof)
92		free(p_sw->p_prof);
93	if (p_sw->search_ordering_ports)
94		free(p_sw->search_ordering_ports);
95	if (p_sw->lft)
96		free(p_sw->lft);
97	if (p_sw->new_lft)
98		free(p_sw->new_lft);
99	if (p_sw->hops) {
100		for (i = 0; i < p_sw->num_hops; i++)
101			if (p_sw->hops[i])
102				free(p_sw->hops[i]);
103		free(p_sw->hops);
104	}
105	free(*pp_sw);
106	*pp_sw = NULL;
107}
108
109osm_switch_t *osm_switch_new(IN osm_node_t * p_node,
110			     IN const osm_madw_t * p_madw)
111{
112	osm_switch_t *p_sw;
113	ib_switch_info_t *p_si;
114	ib_smp_t *p_smp;
115	uint8_t num_ports;
116	uint32_t port_num;
117
118	CL_ASSERT(p_madw);
119	CL_ASSERT(p_node);
120
121	p_smp = osm_madw_get_smp_ptr(p_madw);
122	p_si = ib_smp_get_payload_ptr(p_smp);
123	num_ports = osm_node_get_num_physp(p_node);
124
125	CL_ASSERT(p_smp->attr_id == IB_MAD_ATTR_SWITCH_INFO);
126
127	if (!p_si->lin_cap) /* The switch doesn't support LFT */
128		return NULL;
129
130	p_sw = malloc(sizeof(*p_sw));
131	if (!p_sw)
132		return NULL;
133
134	memset(p_sw, 0, sizeof(*p_sw));
135
136	p_sw->p_node = p_node;
137	p_sw->switch_info = *p_si;
138	p_sw->num_ports = num_ports;
139	p_sw->need_update = 2;
140
141	p_sw->p_prof = malloc(sizeof(*p_sw->p_prof) * num_ports);
142	if (!p_sw->p_prof)
143		goto err;
144
145	memset(p_sw->p_prof, 0, sizeof(*p_sw->p_prof) * num_ports);
146
147	osm_mcast_tbl_init(&p_sw->mcast_tbl, osm_node_get_num_physp(p_node),
148			   cl_ntoh16(p_si->mcast_cap));
149
150	for (port_num = 0; port_num < num_ports; port_num++)
151		osm_port_prof_construct(&p_sw->p_prof[port_num]);
152
153	return p_sw;
154
155err:
156	osm_switch_delete(&p_sw);
157	return NULL;
158}
159
160boolean_t osm_switch_get_lft_block(IN const osm_switch_t * p_sw,
161				   IN uint16_t block_id, OUT uint8_t * p_block)
162{
163	uint16_t base_lid_ho = block_id * IB_SMP_DATA_SIZE;
164
165	CL_ASSERT(p_sw);
166	CL_ASSERT(p_block);
167
168	if (base_lid_ho > p_sw->max_lid_ho)
169		return FALSE;
170
171	CL_ASSERT(base_lid_ho + IB_SMP_DATA_SIZE - 1 <= IB_LID_UCAST_END_HO);
172	memcpy(p_block, &(p_sw->new_lft[base_lid_ho]), IB_SMP_DATA_SIZE);
173	return TRUE;
174}
175
176static struct osm_remote_node *
177switch_find_guid_common(IN const osm_switch_t * p_sw,
178			IN struct osm_remote_guids_count *r,
179			IN uint8_t port_num, IN int find_sys_guid,
180			IN int find_node_guid)
181{
182	struct osm_remote_node *p_remote_guid = NULL;
183	osm_physp_t *p_physp;
184	osm_physp_t *p_rem_physp;
185	osm_node_t *p_rem_node;
186	uint64_t sys_guid;
187	uint64_t node_guid;
188	unsigned int i;
189
190	CL_ASSERT(p_sw);
191
192	if (!r)
193		goto out;
194
195	p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
196	if (!p_physp)
197		goto out;
198
199	p_rem_physp = osm_physp_get_remote(p_physp);
200	p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
201	sys_guid = p_rem_node->node_info.sys_guid;
202	node_guid = p_rem_node->node_info.node_guid;
203
204	for (i = 0; i < r->count; i++) {
205		if ((!find_sys_guid
206		     || r->guids[i].node->node_info.sys_guid == sys_guid)
207		    && (!find_node_guid
208			|| r->guids[i].node->node_info.node_guid == node_guid)) {
209			p_remote_guid = &r->guids[i];
210			break;
211		}
212	}
213
214out:
215	return p_remote_guid;
216}
217
218static struct osm_remote_node *
219switch_find_sys_guid_count(IN const osm_switch_t * p_sw,
220			   IN struct osm_remote_guids_count *r,
221			   IN uint8_t port_num)
222{
223	return switch_find_guid_common(p_sw, r, port_num, 1, 0);
224}
225
226static struct osm_remote_node *
227switch_find_node_guid_count(IN const osm_switch_t * p_sw,
228			    IN struct osm_remote_guids_count *r,
229			    IN uint8_t port_num)
230{
231	return switch_find_guid_common(p_sw, r, port_num, 0, 1);
232}
233
234uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
235				  IN osm_port_t * p_port, IN uint16_t lid_ho,
236				  IN unsigned start_from,
237				  IN boolean_t ignore_existing,
238				  IN boolean_t routing_for_lmc,
239				  IN boolean_t dor,
240				  IN boolean_t port_shifting,
241				  IN uint32_t scatter_ports,
242				  IN osm_lft_type_enum lft_enum)
243{
244	/*
245	   We support an enhanced LMC aware routing mode:
246	   In the case of LMC > 0, we can track the remote side
247	   system and node for all of the lids of the target
248	   and try and avoid routing again through the same
249	   system / node.
250
251	   Assume if routing_for_lmc is true that this procedure was
252	   provided the tracking array and counter via p_port->priv,
253	   and we can conduct this algorithm.
254	 */
255	uint16_t base_lid;
256	uint8_t hops;
257	uint8_t least_hops;
258	uint8_t port_num;
259	uint8_t num_ports;
260	uint32_t least_paths = 0xFFFFFFFF;
261	unsigned i;
262	/*
263	   The following will track the least paths if the
264	   route should go through a new system/node
265	 */
266	uint32_t least_paths_other_sys = 0xFFFFFFFF;
267	uint32_t least_paths_other_nodes = 0xFFFFFFFF;
268	uint32_t least_forwarded_to = 0xFFFFFFFF;
269	uint32_t check_count;
270	uint8_t best_port = 0;
271	/*
272	   These vars track the best port if it connects to
273	   not used system/node.
274	 */
275	uint8_t best_port_other_sys = 0;
276	uint8_t best_port_other_node = 0;
277	boolean_t port_found = FALSE;
278	osm_physp_t *p_physp;
279	osm_physp_t *p_rem_physp;
280	osm_node_t *p_rem_node;
281	osm_node_t *p_rem_node_first = NULL;
282	struct osm_remote_node *p_remote_guid = NULL;
283	struct osm_remote_node null_remote_node = {NULL, 0, 0};
284	struct switch_port_path port_paths[IB_NODE_NUM_PORTS_MAX];
285	unsigned int port_paths_total_paths = 0;
286	unsigned int port_paths_count = 0;
287	uint8_t scatter_possible_ports[IB_NODE_NUM_PORTS_MAX];
288	unsigned int scatter_possible_ports_count = 0;
289	int found_sys_guid = 0;
290	int found_node_guid = 0;
291
292	CL_ASSERT(lid_ho > 0);
293
294	if (p_port->p_node->sw) {
295		if (p_port->p_node->sw == p_sw)
296			return 0;
297		base_lid = osm_port_get_base_lid(p_port);
298	} else {
299		p_physp = p_port->p_physp;
300		if (!p_physp || !p_physp->p_remote_physp ||
301		    !p_physp->p_remote_physp->p_node->sw)
302			return OSM_NO_PATH;
303
304		if (p_physp->p_remote_physp->p_node->sw == p_sw)
305			return p_physp->p_remote_physp->port_num;
306		base_lid =
307		    osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
308	}
309	base_lid = cl_ntoh16(base_lid);
310
311	num_ports = p_sw->num_ports;
312
313	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
314	if (least_hops == OSM_NO_PATH)
315		return OSM_NO_PATH;
316
317	/*
318	   First, inquire with the forwarding table for an existing
319	   route.  If one is found, honor it unless:
320	   1. the ignore existing flag is set.
321	   2. the physical port is not a valid one or not healthy
322	   3. the physical port has a remote port (the link is up)
323	   4. the port has min-hops to the target (avoid loops)
324	 */
325	if (!ignore_existing) {
326		port_num = osm_switch_get_port_by_lid(p_sw, lid_ho, lft_enum);
327
328		if (port_num != OSM_NO_PATH) {
329			CL_ASSERT(port_num < num_ports);
330
331			p_physp =
332			    osm_node_get_physp_ptr(p_sw->p_node, port_num);
333			/*
334			   Don't be too trusting of the current forwarding table!
335			   Verify that the port number is legal and that the
336			   LID is reachable through this port.
337			 */
338			if (p_physp && osm_physp_is_healthy(p_physp) &&
339			    osm_physp_get_remote(p_physp)) {
340				hops =
341				    osm_switch_get_hop_count(p_sw, base_lid,
342							     port_num);
343				/*
344				   If we aren't using pre-defined user routes
345				   function, then we need to make sure that the
346				   current path is the minimum one. In case of
347				   having such a user function - this check will
348				   not be done, and the old routing will be used.
349				   Note: This means that it is the user's job to
350				   clean all data in the forwarding tables that
351				   he wants to be overridden by the minimum
352				   hop function.
353				 */
354				if (hops == least_hops)
355					return port_num;
356			}
357		}
358	}
359
360	/*
361	   This algorithm selects a port based on a static load balanced
362	   selection across equal hop-count ports.
363	   There is lots of room for improved sophistication here,
364	   possibly guided by user configuration info.
365	 */
366
367	/*
368	   OpenSM routing is "local" - not considering a full lid to lid
369	   path. As such we can not guarantee a path will not loop if we
370	   do not always follow least hops.
371	   So we must abort if not least hops.
372	 */
373
374	/* port number starts with one and num_ports is 1 + num phys ports */
375	for (i = start_from; i < start_from + num_ports; i++) {
376		port_num = osm_switch_get_dimn_port(p_sw, i % num_ports);
377		if (!port_num ||
378		    osm_switch_get_hop_count(p_sw, base_lid, port_num) !=
379		    least_hops)
380			continue;
381
382		/* let us make sure it is not down or unhealthy */
383		p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
384		if (!p_physp || !osm_physp_is_healthy(p_physp) ||
385		    /*
386		       we require all - non sma ports to be linked
387		       to be routed through
388		     */
389		    !osm_physp_get_remote(p_physp))
390			continue;
391
392		/*
393		   We located a least-hop port, possibly one of many.
394		   For this port, check the running total count of
395		   the number of paths through this port.  Select
396		   the port routing the least number of paths.
397		 */
398		check_count =
399		    osm_port_prof_path_count_get(&p_sw->p_prof[port_num]);
400
401
402		if (dor) {
403			/* Get the Remote Node */
404			p_rem_physp = osm_physp_get_remote(p_physp);
405			p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
406			/* use the first dimension, but spread traffic
407			 * out among the group of ports representing
408			 * that dimension */
409			if (!p_rem_node_first)
410				p_rem_node_first = p_rem_node;
411			else if (p_rem_node != p_rem_node_first)
412				continue;
413			if (routing_for_lmc) {
414				struct osm_remote_guids_count *r = p_port->priv;
415				uint8_t rem_port = osm_physp_get_port_num(p_rem_physp);
416				unsigned int j;
417
418				for (j = 0; j < r->count; j++) {
419					p_remote_guid = &r->guids[j];
420					if ((p_remote_guid->node == p_rem_node)
421					    && (p_remote_guid->port == rem_port))
422						break;
423				}
424				if (j == r->count)
425					p_remote_guid = &null_remote_node;
426			}
427		/*
428		   Advanced LMC routing requires tracking of the
429		   best port by the node connected to the other side of
430		   it.
431		 */
432		} else if (routing_for_lmc) {
433			/* Is the sys guid already used ? */
434			p_remote_guid = switch_find_sys_guid_count(p_sw,
435								   p_port->priv,
436								   port_num);
437
438			/* If not update the least hops for this case */
439			if (!p_remote_guid) {
440				if (check_count < least_paths_other_sys) {
441					least_paths_other_sys = check_count;
442					best_port_other_sys = port_num;
443					least_forwarded_to = 0;
444				}
445				found_sys_guid = 0;
446			} else {	/* same sys found - try node */
447
448
449				/* Else is the node guid already used ? */
450				p_remote_guid = switch_find_node_guid_count(p_sw,
451									    p_port->priv,
452									    port_num);
453
454				/* If not update the least hops for this case */
455				if (!p_remote_guid
456				    && check_count < least_paths_other_nodes) {
457					least_paths_other_nodes = check_count;
458					best_port_other_node = port_num;
459					least_forwarded_to = 0;
460				}
461				/* else prior sys and node guid already used */
462
463				if (!p_remote_guid)
464					found_node_guid = 0;
465				else
466					found_node_guid = 1;
467				found_sys_guid = 1;
468			}	/* same sys found */
469		}
470
471		port_paths[port_paths_count].port_num = port_num;
472		port_paths[port_paths_count].path_count = check_count;
473		if (routing_for_lmc) {
474			port_paths[port_paths_count].found_sys_guid = found_sys_guid;
475			port_paths[port_paths_count].found_node_guid = found_node_guid;
476		}
477		if (routing_for_lmc && p_remote_guid)
478			port_paths[port_paths_count].forwarded_to = p_remote_guid->forwarded_to;
479		else
480			port_paths[port_paths_count].forwarded_to = 0;
481		port_paths_total_paths += check_count;
482		port_paths_count++;
483
484		/* routing for LMC mode */
485		/*
486		   the count is min but also lower then the max subscribed
487		 */
488		if (check_count < least_paths) {
489			port_found = TRUE;
490			best_port = port_num;
491			least_paths = check_count;
492			scatter_possible_ports_count = 0;
493			scatter_possible_ports[scatter_possible_ports_count++] = port_num;
494			if (routing_for_lmc
495			    && p_remote_guid
496			    && p_remote_guid->forwarded_to < least_forwarded_to)
497				least_forwarded_to = p_remote_guid->forwarded_to;
498		} else if (scatter_ports
499			   && check_count == least_paths) {
500			scatter_possible_ports[scatter_possible_ports_count++] = port_num;
501		} else if (routing_for_lmc
502			   && p_remote_guid
503			   && check_count == least_paths
504			   && p_remote_guid->forwarded_to < least_forwarded_to) {
505			least_forwarded_to = p_remote_guid->forwarded_to;
506			best_port = port_num;
507		}
508	}
509
510	if (port_found == FALSE)
511		return OSM_NO_PATH;
512
513	if (port_shifting && port_paths_count) {
514		/* In the port_paths[] array, we now have all the ports that we
515		 * can route out of.  Using some shifting math below, possibly
516		 * select a different one so that lids won't align in LFTs
517		 *
518		 * If lmc > 0, we need to loop through these ports to find the
519		 * least_forwarded_to port, best_port_other_sys, and
520		 * best_port_other_node just like before but through the different
521		 * ordering.
522		 */
523
524		least_paths = 0xFFFFFFFF;
525		least_paths_other_sys = 0xFFFFFFFF;
526		least_paths_other_nodes = 0xFFFFFFFF;
527	        least_forwarded_to = 0xFFFFFFFF;
528		best_port = 0;
529		best_port_other_sys = 0;
530		best_port_other_node = 0;
531
532		for (i = 0; i < port_paths_count; i++) {
533			unsigned int idx;
534
535			idx = (port_paths_total_paths/port_paths_count + i) % port_paths_count;
536
537			if (routing_for_lmc) {
538				if (!port_paths[idx].found_sys_guid
539				    && port_paths[idx].path_count < least_paths_other_sys) {
540					least_paths_other_sys = port_paths[idx].path_count;
541					best_port_other_sys = port_paths[idx].port_num;
542					least_forwarded_to = 0;
543				}
544				else if (!port_paths[idx].found_node_guid
545					 && port_paths[idx].path_count < least_paths_other_nodes) {
546					least_paths_other_nodes = port_paths[idx].path_count;
547					best_port_other_node = port_paths[idx].port_num;
548					least_forwarded_to = 0;
549				}
550			}
551
552			if (port_paths[idx].path_count < least_paths) {
553				best_port = port_paths[idx].port_num;
554				least_paths = port_paths[idx].path_count;
555				if (routing_for_lmc
556				    && (port_paths[idx].found_sys_guid
557					|| port_paths[idx].found_node_guid)
558				    && port_paths[idx].forwarded_to < least_forwarded_to)
559					least_forwarded_to = port_paths[idx].forwarded_to;
560			}
561			else if (routing_for_lmc
562				 && (port_paths[idx].found_sys_guid
563				     || port_paths[idx].found_node_guid)
564				 && port_paths[idx].path_count == least_paths
565				 && port_paths[idx].forwarded_to < least_forwarded_to) {
566				least_forwarded_to = port_paths[idx].forwarded_to;
567				best_port = port_paths[idx].port_num;
568			}
569
570		}
571	}
572
573	/*
574	   if we are in enhanced routing mode and the best port is not
575	   the local port 0
576	 */
577	if (routing_for_lmc && best_port && !scatter_ports) {
578		/* Select the least hop port of the non used sys first */
579		if (best_port_other_sys)
580			best_port = best_port_other_sys;
581		else if (best_port_other_node)
582			best_port = best_port_other_node;
583	} else if (scatter_ports) {
584		/*
585		 * There is some danger that this random could "rebalance" the routes
586		 * every time, to combat this there is a global srandom that
587		 * occurs at the start of every sweep.
588		 */
589		unsigned int idx = random() % scatter_possible_ports_count;
590		best_port = scatter_possible_ports[idx];
591	}
592	return best_port;
593}
594
595void osm_switch_clear_hops(IN osm_switch_t * p_sw)
596{
597	unsigned i;
598
599	for (i = 0; i < p_sw->num_hops; i++)
600		if (p_sw->hops[i])
601			memset(p_sw->hops[i], OSM_NO_PATH, p_sw->num_ports);
602}
603
604static int alloc_lft(IN osm_switch_t * p_sw, uint16_t lids)
605{
606	uint16_t lft_size;
607
608	/* Ensure LFT is in units of LFT block size */
609	lft_size = (lids / IB_SMP_DATA_SIZE + 1) * IB_SMP_DATA_SIZE;
610	if (lft_size > p_sw->lft_size) {
611		uint8_t *new_lft = realloc(p_sw->lft, lft_size);
612		if (!new_lft)
613			return -1;
614		memset(new_lft + p_sw->lft_size, OSM_NO_PATH,
615		       lft_size - p_sw->lft_size);
616		p_sw->lft = new_lft;
617		p_sw->lft_size = lft_size;
618	}
619
620	return 0;
621}
622
623int osm_switch_prepare_path_rebuild(IN osm_switch_t * p_sw, IN uint16_t max_lids)
624{
625	uint8_t **hops;
626	uint8_t *new_lft;
627	unsigned i;
628
629	if (alloc_lft(p_sw, max_lids))
630		return -1;
631
632	for (i = 0; i < p_sw->num_ports; i++)
633		osm_port_prof_construct(&p_sw->p_prof[i]);
634
635	osm_switch_clear_hops(p_sw);
636
637	if (!(new_lft = realloc(p_sw->new_lft, p_sw->lft_size)))
638		return -1;
639
640	p_sw->new_lft = new_lft;
641
642	memset(p_sw->new_lft, OSM_NO_PATH, p_sw->lft_size);
643
644	if (!p_sw->hops) {
645		hops = malloc((max_lids + 1) * sizeof(hops[0]));
646		if (!hops)
647			return -1;
648		memset(hops, 0, (max_lids + 1) * sizeof(hops[0]));
649		p_sw->hops = hops;
650		p_sw->num_hops = max_lids + 1;
651	} else if (max_lids + 1 > p_sw->num_hops) {
652		hops = realloc(p_sw->hops, (max_lids + 1) * sizeof(hops[0]));
653		if (!hops)
654			return -1;
655		memset(hops + p_sw->num_hops, 0,
656		       (max_lids + 1 - p_sw->num_hops) * sizeof(hops[0]));
657		p_sw->hops = hops;
658		p_sw->num_hops = max_lids + 1;
659	}
660	p_sw->max_lid_ho = max_lids;
661
662	return 0;
663}
664
665uint8_t osm_switch_get_port_least_hops(IN const osm_switch_t * p_sw,
666				       IN const osm_port_t * p_port)
667{
668	uint16_t lid;
669
670	if (p_port->p_node->sw) {
671		if (p_port->p_node->sw == p_sw)
672			return 0;
673		lid = osm_node_get_base_lid(p_port->p_node, 0);
674		return osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
675	} else {
676		osm_physp_t *p = p_port->p_physp;
677		uint8_t hops;
678
679		if (!p || !p->p_remote_physp || !p->p_remote_physp->p_node->sw)
680			return OSM_NO_PATH;
681		if (p->p_remote_physp->p_node->sw == p_sw)
682			return 1;
683		lid = osm_node_get_base_lid(p->p_remote_physp->p_node, 0);
684		hops = osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
685		return hops != OSM_NO_PATH ? hops + 1 : OSM_NO_PATH;
686	}
687}
688
689uint8_t osm_switch_recommend_mcast_path(IN osm_switch_t * p_sw,
690					IN osm_port_t * p_port,
691					IN uint16_t mlid_ho,
692					IN boolean_t ignore_existing)
693{
694	uint16_t base_lid;
695	uint8_t hops;
696	uint8_t port_num;
697	uint8_t num_ports;
698	uint8_t least_hops;
699
700	CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);
701
702	if (p_port->p_node->sw) {
703		if (p_port->p_node->sw == p_sw)
704			return 0;
705		base_lid = osm_port_get_base_lid(p_port);
706	} else {
707		osm_physp_t *p_physp = p_port->p_physp;
708		if (!p_physp || !p_physp->p_remote_physp ||
709		    !p_physp->p_remote_physp->p_node->sw)
710			return OSM_NO_PATH;
711		if (p_physp->p_remote_physp->p_node->sw == p_sw)
712			return p_physp->p_remote_physp->port_num;
713		base_lid =
714		    osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
715	}
716	base_lid = cl_ntoh16(base_lid);
717	num_ports = p_sw->num_ports;
718
719	/*
720	   If the user wants us to ignore existing multicast routes,
721	   then simply return the shortest hop count path to the
722	   target port.
723
724	   Otherwise, return the first port that has a path to the target,
725	   picking from the ports that are already in the multicast group.
726	 */
727	if (!ignore_existing) {
728		for (port_num = 1; port_num < num_ports; port_num++) {
729			if (!osm_mcast_tbl_is_port
730			    (&p_sw->mcast_tbl, mlid_ho, port_num))
731				continue;
732			/*
733			   Don't be too trusting of the current forwarding table!
734			   Verify that the LID is reachable through this port.
735			 */
736			hops =
737			    osm_switch_get_hop_count(p_sw, base_lid, port_num);
738			if (hops != OSM_NO_PATH)
739				return port_num;
740		}
741	}
742
743	/*
744	   Either no existing mcast paths reach this port or we are
745	   ignoring existing paths.
746
747	   Determine the best multicast path to the target.  Note that this
748	   algorithm is slightly different from the one used for unicast route
749	   recommendation.  In this case (multicast), we must NOT
750	   perform any sort of load balancing.  We MUST take the FIRST
751	   port found that has <= the lowest hop count path.  This prevents
752	   more than one multicast path to the same remote switch which
753	   prevents a multicast loop.  Multicast loops are bad since the same
754	   multicast packet will go around and around, inevitably creating
755	   a black hole that will destroy the Earth in a firey conflagration.
756	 */
757	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
758	if (least_hops == OSM_NO_PATH)
759		return OSM_NO_PATH;
760	for (port_num = 1; port_num < num_ports; port_num++)
761		if (osm_switch_get_hop_count(p_sw, base_lid, port_num) ==
762		    least_hops)
763			break;
764
765	CL_ASSERT(port_num < num_ports);
766	return port_num;
767}
768