1219820Sjeff/*
2219820Sjeff * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3219820Sjeff * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
4219820Sjeff * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5219820Sjeff *
6219820Sjeff * This software is available to you under a choice of one of two
7219820Sjeff * licenses.  You may choose to be licensed under the terms of the GNU
8219820Sjeff * General Public License (GPL) Version 2, available from the file
9219820Sjeff * COPYING in the main directory of this source tree, or the
10219820Sjeff * OpenIB.org BSD license below:
11219820Sjeff *
12219820Sjeff *     Redistribution and use in source and binary forms, with or
13219820Sjeff *     without modification, are permitted provided that the following
14219820Sjeff *     conditions are met:
15219820Sjeff *
16219820Sjeff *      - Redistributions of source code must retain the above
17219820Sjeff *        copyright notice, this list of conditions and the following
18219820Sjeff *        disclaimer.
19219820Sjeff *
20219820Sjeff *      - Redistributions in binary form must reproduce the above
21219820Sjeff *        copyright notice, this list of conditions and the following
22219820Sjeff *        disclaimer in the documentation and/or other materials
23219820Sjeff *        provided with the distribution.
24219820Sjeff *
25219820Sjeff * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26219820Sjeff * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27219820Sjeff * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28219820Sjeff * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29219820Sjeff * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30219820Sjeff * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31219820Sjeff * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32219820Sjeff * SOFTWARE.
33219820Sjeff *
34219820Sjeff */
35219820Sjeff
36219820Sjeff/*
37219820Sjeff * Abstract:
38219820Sjeff *    Implementation of osm_switch_t.
39219820Sjeff * This object represents an Infiniband switch.
40219820Sjeff * This object is part of the opensm family of objects.
41219820Sjeff */
42219820Sjeff
43219820Sjeff#if HAVE_CONFIG_H
44219820Sjeff#  include <config.h>
45219820Sjeff#endif				/* HAVE_CONFIG_H */
46219820Sjeff
47219820Sjeff#include <stdlib.h>
48219820Sjeff#include <string.h>
49219820Sjeff#include <complib/cl_math.h>
50219820Sjeff#include <iba/ib_types.h>
51219820Sjeff#include <opensm/osm_switch.h>
52219820Sjeff
53219820Sjeff/**********************************************************************
54219820Sjeff **********************************************************************/
55219820Sjeffcl_status_t
56219820Sjeffosm_switch_set_hops(IN osm_switch_t * const p_sw,
57219820Sjeff		    IN const uint16_t lid_ho,
58219820Sjeff		    IN const uint8_t port_num, IN const uint8_t num_hops)
59219820Sjeff{
60219820Sjeff	if (lid_ho > p_sw->max_lid_ho)
61219820Sjeff		return -1;
62219820Sjeff	if (!p_sw->hops[lid_ho]) {
63219820Sjeff		p_sw->hops[lid_ho] = malloc(p_sw->num_ports);
64219820Sjeff		if (!p_sw->hops[lid_ho])
65219820Sjeff			return -1;
66219820Sjeff		memset(p_sw->hops[lid_ho], OSM_NO_PATH, p_sw->num_ports);
67219820Sjeff	}
68219820Sjeff
69219820Sjeff	p_sw->hops[lid_ho][port_num] = num_hops;
70219820Sjeff	if (p_sw->hops[lid_ho][0] > num_hops)
71219820Sjeff		p_sw->hops[lid_ho][0] = num_hops;
72219820Sjeff
73219820Sjeff	return 0;
74219820Sjeff}
75219820Sjeff
76219820Sjeff/**********************************************************************
77219820Sjeff **********************************************************************/
78219820Sjeffstatic ib_api_status_t
79219820Sjeffosm_switch_init(IN osm_switch_t * const p_sw,
80219820Sjeff		IN osm_node_t * const p_node,
81219820Sjeff		IN const osm_madw_t * const p_madw)
82219820Sjeff{
83219820Sjeff	ib_api_status_t status = IB_SUCCESS;
84219820Sjeff	ib_switch_info_t *p_si;
85219820Sjeff	ib_smp_t *p_smp;
86219820Sjeff	uint8_t num_ports;
87219820Sjeff	uint32_t port_num;
88219820Sjeff
89219820Sjeff	p_smp = osm_madw_get_smp_ptr(p_madw);
90219820Sjeff	p_si = (ib_switch_info_t *) ib_smp_get_payload_ptr(p_smp);
91219820Sjeff	num_ports = osm_node_get_num_physp(p_node);
92219820Sjeff
93219820Sjeff	CL_ASSERT(p_smp->attr_id == IB_MAD_ATTR_SWITCH_INFO);
94219820Sjeff
95219820Sjeff	p_sw->p_node = p_node;
96219820Sjeff	p_sw->switch_info = *p_si;
97219820Sjeff	p_sw->num_ports = num_ports;
98219820Sjeff	p_sw->need_update = 2;
99219820Sjeff
100219820Sjeff	/* Initiate the linear forwarding table */
101219820Sjeff
102219820Sjeff	if (!p_si->lin_cap) {
103219820Sjeff		/* This switch does not support linear forwarding tables */
104219820Sjeff		status = IB_UNSUPPORTED;
105219820Sjeff		goto Exit;
106219820Sjeff	}
107219820Sjeff
108219820Sjeff	p_sw->lft = malloc(IB_LID_UCAST_END_HO + 1);
109219820Sjeff	if (!p_sw->lft) {
110219820Sjeff		status = IB_INSUFFICIENT_MEMORY;
111219820Sjeff		goto Exit;
112219820Sjeff	}
113219820Sjeff
114219820Sjeff	/* Initialize the table to OSM_NO_PATH, which is "invalid port" */
115219820Sjeff	memset(p_sw->lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
116219820Sjeff
117219820Sjeff	p_sw->p_prof = malloc(sizeof(*p_sw->p_prof) * num_ports);
118219820Sjeff	if (p_sw->p_prof == NULL) {
119219820Sjeff		status = IB_INSUFFICIENT_MEMORY;
120219820Sjeff		goto Exit;
121219820Sjeff	}
122219820Sjeff
123219820Sjeff	memset(p_sw->p_prof, 0, sizeof(*p_sw->p_prof) * num_ports);
124219820Sjeff
125219820Sjeff	status = osm_mcast_tbl_init(&p_sw->mcast_tbl,
126219820Sjeff				    osm_node_get_num_physp(p_node),
127219820Sjeff				    cl_ntoh16(p_si->mcast_cap));
128219820Sjeff	if (status != IB_SUCCESS)
129219820Sjeff		goto Exit;
130219820Sjeff
131219820Sjeff	for (port_num = 0; port_num < num_ports; port_num++)
132219820Sjeff		osm_port_prof_construct(&p_sw->p_prof[port_num]);
133219820Sjeff
134219820SjeffExit:
135219820Sjeff	return (status);
136219820Sjeff}
137219820Sjeff
138219820Sjeff/**********************************************************************
139219820Sjeff **********************************************************************/
140219820Sjeffvoid osm_switch_delete(IN OUT osm_switch_t ** const pp_sw)
141219820Sjeff{
142219820Sjeff	osm_switch_t *p_sw = *pp_sw;
143219820Sjeff	unsigned i;
144219820Sjeff
145219820Sjeff	osm_mcast_tbl_destroy(&p_sw->mcast_tbl);
146219820Sjeff	free(p_sw->p_prof);
147219820Sjeff	if (p_sw->lft)
148219820Sjeff		free(p_sw->lft);
149219820Sjeff	if (p_sw->new_lft)
150219820Sjeff		free(p_sw->new_lft);
151219820Sjeff	if (p_sw->hops) {
152219820Sjeff		for (i = 0; i < p_sw->num_hops; i++)
153219820Sjeff			if (p_sw->hops[i])
154219820Sjeff				free(p_sw->hops[i]);
155219820Sjeff		free(p_sw->hops);
156219820Sjeff	}
157219820Sjeff	free(*pp_sw);
158219820Sjeff	*pp_sw = NULL;
159219820Sjeff}
160219820Sjeff
161219820Sjeff/**********************************************************************
162219820Sjeff **********************************************************************/
163219820Sjeffosm_switch_t *osm_switch_new(IN osm_node_t * const p_node,
164219820Sjeff			     IN const osm_madw_t * const p_madw)
165219820Sjeff{
166219820Sjeff	ib_api_status_t status;
167219820Sjeff	osm_switch_t *p_sw;
168219820Sjeff
169219820Sjeff	CL_ASSERT(p_madw);
170219820Sjeff	CL_ASSERT(p_node);
171219820Sjeff
172219820Sjeff	p_sw = (osm_switch_t *) malloc(sizeof(*p_sw));
173219820Sjeff	if (p_sw) {
174219820Sjeff		memset(p_sw, 0, sizeof(*p_sw));
175219820Sjeff		status = osm_switch_init(p_sw, p_node, p_madw);
176219820Sjeff		if (status != IB_SUCCESS)
177219820Sjeff			osm_switch_delete(&p_sw);
178219820Sjeff	}
179219820Sjeff
180219820Sjeff	return (p_sw);
181219820Sjeff}
182219820Sjeff
183219820Sjeff/**********************************************************************
184219820Sjeff **********************************************************************/
185219820Sjeffboolean_t
186219820Sjeffosm_switch_get_lft_block(IN const osm_switch_t * const p_sw,
187219820Sjeff			 IN const uint16_t block_id,
188219820Sjeff			 OUT uint8_t * const p_block)
189219820Sjeff{
190219820Sjeff	uint16_t base_lid_ho = block_id * IB_SMP_DATA_SIZE;
191219820Sjeff
192219820Sjeff	CL_ASSERT(p_sw);
193219820Sjeff	CL_ASSERT(p_block);
194219820Sjeff
195219820Sjeff	if (base_lid_ho > p_sw->max_lid_ho)
196219820Sjeff		return FALSE;
197219820Sjeff
198219820Sjeff	CL_ASSERT(base_lid_ho + IB_SMP_DATA_SIZE <= IB_LID_UCAST_END_HO);
199219820Sjeff	memcpy(p_block, &(p_sw->lft[base_lid_ho]), IB_SMP_DATA_SIZE);
200219820Sjeff	return TRUE;
201219820Sjeff}
202219820Sjeff
203219820Sjeff/**********************************************************************
204219820Sjeff **********************************************************************/
205219820Sjeffstatic struct osm_remote_node *
206219820Sjeffosm_switch_find_guid_common(IN const osm_switch_t * const p_sw,
207219820Sjeff			    IN struct osm_remote_guids_count *r,
208219820Sjeff			    IN uint8_t port_num,
209219820Sjeff			    IN int find_sys_guid,
210219820Sjeff			    IN int find_node_guid)
211219820Sjeff{
212219820Sjeff	struct osm_remote_node *p_remote_guid = NULL;
213219820Sjeff	osm_physp_t *p_physp;
214219820Sjeff	osm_physp_t *p_rem_physp;
215219820Sjeff	osm_node_t *p_rem_node;
216219820Sjeff	uint64_t sys_guid;
217219820Sjeff	uint64_t node_guid;
218219820Sjeff	int i;
219219820Sjeff
220219820Sjeff	CL_ASSERT(p_sw);
221219820Sjeff
222219820Sjeff	p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
223219820Sjeff	p_rem_physp = osm_physp_get_remote(p_physp);
224219820Sjeff	p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
225219820Sjeff	sys_guid = p_rem_node->node_info.sys_guid;
226219820Sjeff	node_guid = p_rem_node->node_info.node_guid;
227219820Sjeff
228219820Sjeff	for (i = 0; i < r->count; i++) {
229219820Sjeff		if ((!find_sys_guid
230219820Sjeff		     || r->guids[i].node->node_info.sys_guid == sys_guid)
231219820Sjeff		    && (!find_node_guid
232219820Sjeff			|| r->guids[i].node->node_info.node_guid == node_guid)) {
233219820Sjeff			p_remote_guid = &r->guids[i];
234219820Sjeff			break;
235219820Sjeff		}
236219820Sjeff	}
237219820Sjeff
238219820Sjeff	return p_remote_guid;
239219820Sjeff}
240219820Sjeff
241219820Sjeffstatic struct osm_remote_node *
242219820Sjeffosm_switch_find_sys_guid_count(IN const osm_switch_t * const p_sw,
243219820Sjeff			       IN struct osm_remote_guids_count *r,
244219820Sjeff			       IN uint8_t port_num)
245219820Sjeff{
246219820Sjeff	return osm_switch_find_guid_common(p_sw, r, port_num, 1, 0);
247219820Sjeff}
248219820Sjeff
249219820Sjeffstatic struct osm_remote_node *
250219820Sjeffosm_switch_find_node_guid_count(IN const osm_switch_t * const p_sw,
251219820Sjeff				IN struct osm_remote_guids_count *r,
252219820Sjeff				IN uint8_t port_num)
253219820Sjeff{
254219820Sjeff	return osm_switch_find_guid_common(p_sw, r, port_num, 0, 1);
255219820Sjeff}
256219820Sjeff
257219820Sjeff/**********************************************************************
258219820Sjeff **********************************************************************/
/*
 * Recommend the port of switch p_sw through which lid_ho, a LID of p_port,
 * should be routed.
 *
 * Shortcuts taken before any table lookup:
 *  - p_port sits on p_sw itself: returns 0.
 *  - p_port sits on a non-switch node that has no physical port, no remote
 *    port, or whose remote node is not a switch: returns OSM_NO_PATH.
 *  - p_port sits on a non-switch node attached directly to p_sw: returns
 *    the port number of that remote (switch side) physical port.
 *
 * Otherwise the hop counts towards the base LID of the destination switch
 * are used.  Unless ignore_existing is set, the port currently returned by
 * osm_switch_get_port_by_lid() for lid_ho is kept when it is healthy,
 * linked and at the least hop count.  Failing that, num_ports ports are
 * scanned starting at start_from (wrapping around, port 0 skipped) and,
 * among healthy linked ports at the least hop count, the one with the
 * lowest path count in p_sw->p_prof is chosen.
 *
 * When p_port->priv is non-NULL it is passed to the remote GUID lookups
 * and a port whose remote system GUID is not found there is preferred,
 * then one whose remote node GUID is not found there.
 *
 * When dor is set, once a port has been chosen a later candidate is only
 * accepted if it leads to the same remote node as the first chosen port.
 *
 * Returns the chosen port number, or OSM_NO_PATH when the least hop count
 * is OSM_NO_PATH or no eligible port was found.
 */
259219820Sjeffuint8_t
260219820Sjeffosm_switch_recommend_path(IN const osm_switch_t * const p_sw,
261219820Sjeff			  IN osm_port_t * p_port,
262219820Sjeff			  IN const uint16_t lid_ho,
263219820Sjeff			  IN unsigned start_from,
264219820Sjeff			  IN const boolean_t ignore_existing,
265219820Sjeff			  IN const boolean_t dor)
266219820Sjeff{
267219820Sjeff	/*
268219820Sjeff	   We support an enhanced LMC aware routing mode:
269219820Sjeff	   In the case of LMC > 0, we can track the remote side
270219820Sjeff	   system and node for all of the lids of the target
271219820Sjeff	   and try and avoid routing again through the same
272219820Sjeff	   system / node.
273219820Sjeff
274219820Sjeff	   If this procedure is provided with the tracking array
275219820Sjeff	   and counter we can conduct this algorithm.
276219820Sjeff	 */
277219820Sjeff	boolean_t routing_for_lmc = (p_port->priv != NULL);
278219820Sjeff	uint16_t base_lid;
279219820Sjeff	uint8_t hops;
280219820Sjeff	uint8_t least_hops;
281219820Sjeff	uint8_t port_num;
282219820Sjeff	uint8_t num_ports;
283219820Sjeff	uint32_t least_paths = 0xFFFFFFFF;
284219820Sjeff	unsigned i;
285219820Sjeff	/*
286219820Sjeff	   The following will track the least paths if the
287219820Sjeff	   route should go through a new system/node
288219820Sjeff	 */
289219820Sjeff	uint32_t least_paths_other_sys = 0xFFFFFFFF;
290219820Sjeff	uint32_t least_paths_other_nodes = 0xFFFFFFFF;
291219820Sjeff	uint32_t least_forwarded_to = 0xFFFFFFFF;
292219820Sjeff	uint32_t check_count;
293219820Sjeff	uint8_t best_port = 0;
294219820Sjeff	/*
295219820Sjeff	   These vars track the best port if it connects to
296219820Sjeff	   not used system/node.
297219820Sjeff	 */
298219820Sjeff	uint8_t best_port_other_sys = 0;
299219820Sjeff	uint8_t best_port_other_node = 0;
300219820Sjeff	boolean_t port_found = FALSE;
301219820Sjeff	osm_physp_t *p_physp;
302219820Sjeff	osm_physp_t *p_rem_physp;
303219820Sjeff	osm_node_t *p_rem_node;
304219820Sjeff	osm_node_t *p_rem_node_first = NULL;
305219820Sjeff	struct osm_remote_node *p_remote_guid = NULL;
306219820Sjeff
307219820Sjeff	CL_ASSERT(lid_ho > 0);
308219820Sjeff
	/*
	   Pick the base LID used for hop lookups: the port's own base LID
	   when it belongs to a switch, otherwise the port 0 base LID of
	   the switch the port is attached to.
	 */
309219820Sjeff	if (p_port->p_node->sw) {
310219820Sjeff		if (p_port->p_node->sw == p_sw)
311219820Sjeff			return 0;
312219820Sjeff		base_lid = osm_port_get_base_lid(p_port);
313219820Sjeff	} else {
314219820Sjeff		p_physp = p_port->p_physp;
315219820Sjeff		if (!p_physp || !p_physp->p_remote_physp ||
316219820Sjeff		    !p_physp->p_remote_physp->p_node->sw)
317219820Sjeff			return OSM_NO_PATH;
318219820Sjeff
319219820Sjeff		if (p_physp->p_remote_physp->p_node->sw == p_sw)
320219820Sjeff			return p_physp->p_remote_physp->port_num;
321219820Sjeff		base_lid =
322219820Sjeff		    osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
323219820Sjeff	}
324219820Sjeff	base_lid = cl_ntoh16(base_lid);
325219820Sjeff
326219820Sjeff	num_ports = p_sw->num_ports;
327219820Sjeff
328219820Sjeff	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
329219820Sjeff	if (least_hops == OSM_NO_PATH)
330219820Sjeff		return (OSM_NO_PATH);
331219820Sjeff
332219820Sjeff	/*
333219820Sjeff	   First, inquire with the forwarding table for an existing
334219820Sjeff	   route.  If one is found, honor it only when all of these hold:
335219820Sjeff	   1. the ignore existing flag is not set.
336219820Sjeff	   2. the physical port is a valid one and is healthy
337219820Sjeff	   3. the physical port has a remote port (the link is up)
338219820Sjeff	   4. the port has min-hops to the target (avoid loops)
339219820Sjeff	 */
340219820Sjeff	if (!ignore_existing) {
341219820Sjeff		port_num = osm_switch_get_port_by_lid(p_sw, lid_ho);
342219820Sjeff
343219820Sjeff		if (port_num != OSM_NO_PATH) {
344219820Sjeff			CL_ASSERT(port_num < num_ports);
345219820Sjeff
346219820Sjeff			p_physp =
347219820Sjeff			    osm_node_get_physp_ptr(p_sw->p_node, port_num);
348219820Sjeff			/*
349219820Sjeff			   Don't be too trusting of the current forwarding table!
350219820Sjeff			   Verify that the port number is legal and that the
351219820Sjeff			   LID is reachable through this port.
352219820Sjeff			 */
353219820Sjeff			if (p_physp && osm_physp_is_healthy(p_physp) &&
354219820Sjeff			    osm_physp_get_remote(p_physp)) {
355219820Sjeff				hops =
356219820Sjeff				    osm_switch_get_hop_count(p_sw, base_lid,
357219820Sjeff							     port_num);
358219820Sjeff				/*
359219820Sjeff				   If we aren't using pre-defined user routes
360219820Sjeff				   function, then we need to make sure that the
361219820Sjeff				   current path is the minimum one. In case of
362219820Sjeff				   having such a user function - this check will
363219820Sjeff				   not be done, and the old routing will be used.
364219820Sjeff				   Note: This means that it is the user's job to
365219820Sjeff				   clean all data in the forwarding tables that
366219820Sjeff				   he wants to be overridden by the minimum
367219820Sjeff				   hop function.
368219820Sjeff				 */
369219820Sjeff				if (hops == least_hops)
370219820Sjeff					return (port_num);
371219820Sjeff			}
372219820Sjeff		}
373219820Sjeff	}
374219820Sjeff
375219820Sjeff	/*
376219820Sjeff	   This algorithm selects a port based on a static load balanced
377219820Sjeff	   selection across equal hop-count ports.
378219820Sjeff	   There is lots of room for improved sophistication here,
379219820Sjeff	   possibly guided by user configuration info.
380219820Sjeff	 */
381219820Sjeff
382219820Sjeff	/*
383219820Sjeff	   OpenSM routing is "local" - not considering a full lid to lid
384219820Sjeff	   path. As such we can not guarantee a path will not loop if we
385219820Sjeff	   do not always follow least hops.
386219820Sjeff	   So we must abort if not least hops.
387219820Sjeff	 */
388219820Sjeff
389219820Sjeff	/* port number starts with one and num_ports is 1 + num phys ports */
390219820Sjeff	for (i = start_from; i < start_from + num_ports; i++) {
391219820Sjeff		port_num = i%num_ports;
392219820Sjeff		if (!port_num ||
393219820Sjeff		    osm_switch_get_hop_count(p_sw, base_lid, port_num) !=
394219820Sjeff		    least_hops)
395219820Sjeff			continue;
396219820Sjeff
397219820Sjeff		/* let us make sure it is not down or unhealthy */
398219820Sjeff		p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
399219820Sjeff		if (!p_physp || !osm_physp_is_healthy(p_physp) ||
400219820Sjeff		    /*
401219820Sjeff		       we require all - non sma ports to be linked
402219820Sjeff		       to be routed through
403219820Sjeff		     */
404219820Sjeff		    !osm_physp_get_remote(p_physp))
405219820Sjeff			continue;
406219820Sjeff
407219820Sjeff		/*
408219820Sjeff		   We located a least-hop port, possibly one of many.
409219820Sjeff		   For this port, check the running total count of
410219820Sjeff		   the number of paths through this port.  Select
411219820Sjeff		   the port routing the least number of paths.
412219820Sjeff		 */
413219820Sjeff		check_count =
414219820Sjeff		    osm_port_prof_path_count_get(&p_sw->p_prof[port_num]);
415219820Sjeff
416219820Sjeff		/*
417219820Sjeff		   Advanced LMC routing requires tracking of the
418219820Sjeff		   best port by the node connected to the other side of
419219820Sjeff		   it.
420219820Sjeff		 */
421219820Sjeff		if (routing_for_lmc) {
422219820Sjeff			/* Is the sys guid already used ? */
423219820Sjeff			p_remote_guid = osm_switch_find_sys_guid_count(p_sw,
424219820Sjeff								       p_port->priv,
425219820Sjeff								       port_num);
426219820Sjeff
427219820Sjeff			/* If not update the least hops for this case */
428219820Sjeff			if (!p_remote_guid) {
429219820Sjeff				if (check_count < least_paths_other_sys) {
430219820Sjeff					least_paths_other_sys = check_count;
431219820Sjeff					best_port_other_sys = port_num;
					/*
					   NOTE(review): once this is 0 the
					   "forwarded_to < least_forwarded_to"
					   tests below look unable to succeed
					   again - confirm this is intended.
					 */
432219820Sjeff					least_forwarded_to = 0;
433219820Sjeff				}
434219820Sjeff			} else {	/* same sys found - try node */
435219820Sjeff				/* Else is the node guid already used ? */
436219820Sjeff				p_remote_guid = osm_switch_find_node_guid_count(p_sw,
437219820Sjeff										p_port->priv,
438219820Sjeff										port_num);
439219820Sjeff
440219820Sjeff				/* If not update the least hops for this case */
441219820Sjeff				if (!p_remote_guid
442219820Sjeff				    && check_count < least_paths_other_nodes) {
443219820Sjeff					least_paths_other_nodes = check_count;
444219820Sjeff					best_port_other_node = port_num;
445219820Sjeff					least_forwarded_to = 0;
446219820Sjeff				}
447219820Sjeff				/* else prior sys and node guid already used */
448219820Sjeff
449219820Sjeff			}	/* same sys found */
450219820Sjeff		}
451219820Sjeff
452219820Sjeff		/* routing for LMC mode */
453219820Sjeff		/*
454219820Sjeff		   the count is min but also lower than the max subscribed
455219820Sjeff		 */
456219820Sjeff		if (check_count < least_paths) {
457219820Sjeff			if (dor) {
458219820Sjeff				/* Get the Remote Node */
459219820Sjeff				p_rem_physp = osm_physp_get_remote(p_physp);
460219820Sjeff				p_rem_node =
461219820Sjeff				    osm_physp_get_node_ptr(p_rem_physp);
462219820Sjeff				/* use the first dimension, but spread
463219820Sjeff				 * traffic out among the group of ports
464219820Sjeff				 * representing that dimension */
465219820Sjeff				if (port_found) {
466219820Sjeff					if (p_rem_node != p_rem_node_first)
467219820Sjeff						continue;
468219820Sjeff				} else
469219820Sjeff					p_rem_node_first = p_rem_node;
470219820Sjeff			}
471219820Sjeff			port_found = TRUE;
472219820Sjeff			best_port = port_num;
473219820Sjeff			least_paths = check_count;
474219820Sjeff			if (routing_for_lmc
475219820Sjeff			    && p_remote_guid
476219820Sjeff			    && p_remote_guid->forwarded_to < least_forwarded_to)
477219820Sjeff				least_forwarded_to = p_remote_guid->forwarded_to;
478219820Sjeff		} else if (routing_for_lmc
479219820Sjeff			   && p_remote_guid
480219820Sjeff			   && check_count == least_paths
481219820Sjeff			   && p_remote_guid->forwarded_to < least_forwarded_to) {
482219820Sjeff			least_forwarded_to = p_remote_guid->forwarded_to;
483219820Sjeff			best_port = port_num;
484219820Sjeff		}
485219820Sjeff	}
486219820Sjeff
487219820Sjeff	if (port_found == FALSE)
488219820Sjeff		return (OSM_NO_PATH);
489219820Sjeff
490219820Sjeff	/*
491219820Sjeff	   if we are in enhanced routing mode and the best port is not
492219820Sjeff	   the local port 0
493219820Sjeff	 */
494219820Sjeff	if (routing_for_lmc && best_port) {
495219820Sjeff		/* Select the least hop port of the non used sys first */
496219820Sjeff		if (best_port_other_sys)
497219820Sjeff			best_port = best_port_other_sys;
498219820Sjeff		else if (best_port_other_node)
499219820Sjeff			best_port = best_port_other_node;
500219820Sjeff	}
501219820Sjeff
502219820Sjeff	return (best_port);
503219820Sjeff}
504219820Sjeff
505219820Sjeff/**********************************************************************
506219820Sjeff **********************************************************************/
507219820Sjeffvoid osm_switch_clear_hops(IN osm_switch_t * p_sw)
508219820Sjeff{
509219820Sjeff	unsigned i;
510219820Sjeff
511219820Sjeff	for (i = 0; i < p_sw->num_hops; i++)
512219820Sjeff		if (p_sw->hops[i])
513219820Sjeff			memset(p_sw->hops[i], OSM_NO_PATH, p_sw->num_ports);
514219820Sjeff}
515219820Sjeff
516219820Sjeff/**********************************************************************
517219820Sjeff **********************************************************************/
518219820Sjeffint
519219820Sjeffosm_switch_prepare_path_rebuild(IN osm_switch_t * p_sw, IN uint16_t max_lids)
520219820Sjeff{
521219820Sjeff	uint8_t **hops;
522219820Sjeff	unsigned i;
523219820Sjeff
524219820Sjeff	for (i = 0; i < p_sw->num_ports; i++)
525219820Sjeff		osm_port_prof_construct(&p_sw->p_prof[i]);
526219820Sjeff
527219820Sjeff	osm_switch_clear_hops(p_sw);
528219820Sjeff
529219820Sjeff	if (!p_sw->new_lft &&
530219820Sjeff	    !(p_sw->new_lft = malloc(IB_LID_UCAST_END_HO + 1)))
531219820Sjeff		return IB_INSUFFICIENT_MEMORY;
532219820Sjeff
533219820Sjeff	memset(p_sw->new_lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
534219820Sjeff
535219820Sjeff	if (!p_sw->hops) {
536219820Sjeff		hops = malloc((max_lids + 1) * sizeof(hops[0]));
537219820Sjeff		if (!hops)
538219820Sjeff			return -1;
539219820Sjeff		memset(hops, 0, (max_lids + 1) * sizeof(hops[0]));
540219820Sjeff		p_sw->hops = hops;
541219820Sjeff		p_sw->num_hops = max_lids + 1;
542219820Sjeff	} else if (max_lids + 1 > p_sw->num_hops) {
543219820Sjeff		uint8_t **old_hops;
544219820Sjeff
545219820Sjeff		hops = malloc((max_lids + 1) * sizeof(hops[0]));
546219820Sjeff		if (!hops)
547219820Sjeff			return -1;
548219820Sjeff		memcpy(hops, p_sw->hops, p_sw->num_hops * sizeof(hops[0]));
549219820Sjeff		memset(hops + p_sw->num_hops, 0,
550219820Sjeff		       (max_lids + 1 - p_sw->num_hops) * sizeof(hops[0]));
551219820Sjeff		old_hops = p_sw->hops;
552219820Sjeff		p_sw->hops = hops;
553219820Sjeff		p_sw->num_hops = max_lids + 1;
554219820Sjeff		free(old_hops);
555219820Sjeff	}
556219820Sjeff	p_sw->max_lid_ho = max_lids;
557219820Sjeff
558219820Sjeff	return 0;
559219820Sjeff}
560219820Sjeff
561219820Sjeff/**********************************************************************
562219820Sjeff **********************************************************************/
563219820Sjeffuint8_t
564219820Sjeffosm_switch_get_port_least_hops(IN const osm_switch_t * const p_sw,
565219820Sjeff			       IN const osm_port_t * p_port)
566219820Sjeff{
567219820Sjeff	uint16_t lid;
568219820Sjeff
569219820Sjeff	if (p_port->p_node->sw) {
570219820Sjeff		if (p_port->p_node->sw == p_sw)
571219820Sjeff			return 0;
572219820Sjeff		lid = osm_node_get_base_lid(p_port->p_node, 0);
573219820Sjeff		return osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
574219820Sjeff	} else {
575219820Sjeff		osm_physp_t *p = p_port->p_physp;
576219820Sjeff		uint8_t hops;
577219820Sjeff
578219820Sjeff		if (!p || !p->p_remote_physp || !p->p_remote_physp->p_node->sw)
579219820Sjeff			return OSM_NO_PATH;
580219820Sjeff		if (p->p_remote_physp->p_node->sw == p_sw)
581219820Sjeff			return 1;
582219820Sjeff		lid = osm_node_get_base_lid(p->p_remote_physp->p_node, 0);
583219820Sjeff		hops = osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
584219820Sjeff		return hops != OSM_NO_PATH ? hops + 1 : OSM_NO_PATH;
585219820Sjeff	}
586219820Sjeff}
587219820Sjeff
588219820Sjeff/**********************************************************************
589219820Sjeff **********************************************************************/
590219820Sjeffuint8_t
591219820Sjeffosm_switch_recommend_mcast_path(IN osm_switch_t * const p_sw,
592219820Sjeff				IN osm_port_t * p_port,
593219820Sjeff				IN uint16_t const mlid_ho,
594219820Sjeff				IN boolean_t const ignore_existing)
595219820Sjeff{
596219820Sjeff	uint16_t base_lid;
597219820Sjeff	uint8_t hops;
598219820Sjeff	uint8_t port_num;
599219820Sjeff	uint8_t num_ports;
600219820Sjeff	uint8_t least_hops;
601219820Sjeff
602219820Sjeff	CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);
603219820Sjeff
604219820Sjeff	if (p_port->p_node->sw) {
605219820Sjeff		if (p_port->p_node->sw == p_sw)
606219820Sjeff			return 0;
607219820Sjeff		base_lid = osm_port_get_base_lid(p_port);
608219820Sjeff	} else {
609219820Sjeff		osm_physp_t *p_physp = p_port->p_physp;
610219820Sjeff		if (!p_physp || !p_physp->p_remote_physp ||
611219820Sjeff		    !p_physp->p_remote_physp->p_node->sw)
612219820Sjeff			return OSM_NO_PATH;
613219820Sjeff		if (p_physp->p_remote_physp->p_node->sw == p_sw)
614219820Sjeff			return p_physp->p_remote_physp->port_num;
615219820Sjeff		base_lid =
616219820Sjeff		    osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
617219820Sjeff	}
618219820Sjeff	base_lid = cl_ntoh16(base_lid);
619219820Sjeff	num_ports = p_sw->num_ports;
620219820Sjeff
621219820Sjeff	/*
622219820Sjeff	   If the user wants us to ignore existing multicast routes,
623219820Sjeff	   then simply return the shortest hop count path to the
624219820Sjeff	   target port.
625219820Sjeff
626219820Sjeff	   Otherwise, return the first port that has a path to the target,
627219820Sjeff	   picking from the ports that are already in the multicast group.
628219820Sjeff	 */
629219820Sjeff	if (!ignore_existing) {
630219820Sjeff		for (port_num = 1; port_num < num_ports; port_num++) {
631219820Sjeff			if (!osm_mcast_tbl_is_port
632219820Sjeff			    (&p_sw->mcast_tbl, mlid_ho, port_num))
633219820Sjeff				continue;
634219820Sjeff			/*
635219820Sjeff			   Don't be too trusting of the current forwarding table!
636219820Sjeff			   Verify that the LID is reachable through this port.
637219820Sjeff			 */
638219820Sjeff			hops =
639219820Sjeff			    osm_switch_get_hop_count(p_sw, base_lid, port_num);
640219820Sjeff			if (hops != OSM_NO_PATH)
641219820Sjeff				return (port_num);
642219820Sjeff		}
643219820Sjeff	}
644219820Sjeff
645219820Sjeff	/*
646219820Sjeff	   Either no existing mcast paths reach this port or we are
647219820Sjeff	   ignoring existing paths.
648219820Sjeff
649219820Sjeff	   Determine the best multicast path to the target.  Note that this
650219820Sjeff	   algorithm is slightly different from the one used for unicast route
651219820Sjeff	   recommendation.  In this case (multicast), we must NOT
652219820Sjeff	   perform any sort of load balancing.  We MUST take the FIRST
653219820Sjeff	   port found that has <= the lowest hop count path.  This prevents
654219820Sjeff	   more than one multicast path to the same remote switch which
655219820Sjeff	   prevents a multicast loop.  Multicast loops are bad since the same
656219820Sjeff	   multicast packet will go around and around, inevitably creating
657219820Sjeff	   a black hole that will destroy the Earth in a firey conflagration.
658219820Sjeff	 */
659219820Sjeff	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
660219820Sjeff	for (port_num = 1; port_num < num_ports; port_num++)
661219820Sjeff		if (osm_switch_get_hop_count(p_sw, base_lid, port_num) ==
662219820Sjeff		    least_hops)
663219820Sjeff			break;
664219820Sjeff
665219820Sjeff	CL_ASSERT(port_num < num_ports);
666219820Sjeff	return (port_num);
667219820Sjeff}
668