1321936Shselasky/*
2321936Shselasky * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
3321936Shselasky * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4321936Shselasky * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5321936Shselasky * Copyright (c) 2009 HNR Consulting. All rights reserved.
6321936Shselasky *
7321936Shselasky * This software is available to you under a choice of one of two
8321936Shselasky * licenses.  You may choose to be licensed under the terms of the GNU
9321936Shselasky * General Public License (GPL) Version 2, available from the file
10321936Shselasky * COPYING in the main directory of this source tree, or the
11321936Shselasky * OpenIB.org BSD license below:
12321936Shselasky *
13321936Shselasky *     Redistribution and use in source and binary forms, with or
14321936Shselasky *     without modification, are permitted provided that the following
15321936Shselasky *     conditions are met:
16321936Shselasky *
17321936Shselasky *      - Redistributions of source code must retain the above
18321936Shselasky *        copyright notice, this list of conditions and the following
19321936Shselasky *        disclaimer.
20321936Shselasky *
21321936Shselasky *      - Redistributions in binary form must reproduce the above
22321936Shselasky *        copyright notice, this list of conditions and the following
23321936Shselasky *        disclaimer in the documentation and/or other materials
24321936Shselasky *        provided with the distribution.
25321936Shselasky *
26321936Shselasky * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27321936Shselasky * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28321936Shselasky * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29321936Shselasky * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
30321936Shselasky * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31321936Shselasky * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32321936Shselasky * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33321936Shselasky * SOFTWARE.
34321936Shselasky *
35321936Shselasky */
36321936Shselasky
/*
 * Abstract:
 *    Implementation of osm_switch_t.
 * This object represents an InfiniBand switch.
 * This object is part of the opensm family of objects.
 */
43321936Shselasky
44321936Shselasky#if HAVE_CONFIG_H
45321936Shselasky#  include <config.h>
46321936Shselasky#endif				/* HAVE_CONFIG_H */
47321936Shselasky
48321936Shselasky#include <stdlib.h>
49321936Shselasky#include <string.h>
50321936Shselasky#include <complib/cl_math.h>
51321936Shselasky#include <iba/ib_types.h>
52321936Shselasky#include <opensm/osm_file_ids.h>
53321936Shselasky#define FILE_ID OSM_FILE_SWITCH_C
54321936Shselasky#include <opensm/osm_switch.h>
55321936Shselasky
/*
 * Per-candidate record collected by osm_switch_recommend_path() for every
 * least-hop egress port it considers.  The records are re-scanned in a
 * shifted order when port shifting is requested.
 */
struct switch_port_path {
	/* Egress port number on the local switch. */
	uint8_t port_num;
	/* Path count read from the port profile when the port was examined. */
	uint32_t path_count;
	/* Snapshot of the "sys guid already used" flag (LMC routing only). */
	int found_sys_guid;
	/* Snapshot of the "node guid already used" flag (LMC routing only). */
	int found_node_guid;
	/* forwarded_to of the matching remote-node entry, or 0 if none. */
	uint32_t forwarded_to;
};
63321936Shselasky
64321936Shselaskycl_status_t osm_switch_set_hops(IN osm_switch_t * p_sw, IN uint16_t lid_ho,
65321936Shselasky				IN uint8_t port_num, IN uint8_t num_hops)
66321936Shselasky{
67321936Shselasky	if (!lid_ho || lid_ho > p_sw->max_lid_ho)
68321936Shselasky		return -1;
69321936Shselasky	if (port_num >= p_sw->num_ports)
70321936Shselasky		return -1;
71321936Shselasky	if (!p_sw->hops[lid_ho]) {
72321936Shselasky		p_sw->hops[lid_ho] = malloc(p_sw->num_ports);
73321936Shselasky		if (!p_sw->hops[lid_ho])
74321936Shselasky			return -1;
75321936Shselasky		memset(p_sw->hops[lid_ho], OSM_NO_PATH, p_sw->num_ports);
76321936Shselasky	}
77321936Shselasky
78321936Shselasky	p_sw->hops[lid_ho][port_num] = num_hops;
79321936Shselasky	if (p_sw->hops[lid_ho][0] > num_hops)
80321936Shselasky		p_sw->hops[lid_ho][0] = num_hops;
81321936Shselasky
82321936Shselasky	return 0;
83321936Shselasky}
84321936Shselasky
85321936Shselaskyvoid osm_switch_delete(IN OUT osm_switch_t ** pp_sw)
86321936Shselasky{
87321936Shselasky	osm_switch_t *p_sw = *pp_sw;
88321936Shselasky	unsigned i;
89321936Shselasky
90321936Shselasky	osm_mcast_tbl_destroy(&p_sw->mcast_tbl);
91321936Shselasky	if (p_sw->p_prof)
92321936Shselasky		free(p_sw->p_prof);
93321936Shselasky	if (p_sw->search_ordering_ports)
94321936Shselasky		free(p_sw->search_ordering_ports);
95321936Shselasky	if (p_sw->lft)
96321936Shselasky		free(p_sw->lft);
97321936Shselasky	if (p_sw->new_lft)
98321936Shselasky		free(p_sw->new_lft);
99321936Shselasky	if (p_sw->hops) {
100321936Shselasky		for (i = 0; i < p_sw->num_hops; i++)
101321936Shselasky			if (p_sw->hops[i])
102321936Shselasky				free(p_sw->hops[i]);
103321936Shselasky		free(p_sw->hops);
104321936Shselasky	}
105321936Shselasky	free(*pp_sw);
106321936Shselasky	*pp_sw = NULL;
107321936Shselasky}
108321936Shselasky
109321936Shselaskyosm_switch_t *osm_switch_new(IN osm_node_t * p_node,
110321936Shselasky			     IN const osm_madw_t * p_madw)
111321936Shselasky{
112321936Shselasky	osm_switch_t *p_sw;
113321936Shselasky	ib_switch_info_t *p_si;
114321936Shselasky	ib_smp_t *p_smp;
115321936Shselasky	uint8_t num_ports;
116321936Shselasky	uint32_t port_num;
117321936Shselasky
118321936Shselasky	CL_ASSERT(p_madw);
119321936Shselasky	CL_ASSERT(p_node);
120321936Shselasky
121321936Shselasky	p_smp = osm_madw_get_smp_ptr(p_madw);
122321936Shselasky	p_si = ib_smp_get_payload_ptr(p_smp);
123321936Shselasky	num_ports = osm_node_get_num_physp(p_node);
124321936Shselasky
125321936Shselasky	CL_ASSERT(p_smp->attr_id == IB_MAD_ATTR_SWITCH_INFO);
126321936Shselasky
127321936Shselasky	if (!p_si->lin_cap) /* The switch doesn't support LFT */
128321936Shselasky		return NULL;
129321936Shselasky
130321936Shselasky	p_sw = malloc(sizeof(*p_sw));
131321936Shselasky	if (!p_sw)
132321936Shselasky		return NULL;
133321936Shselasky
134321936Shselasky	memset(p_sw, 0, sizeof(*p_sw));
135321936Shselasky
136321936Shselasky	p_sw->p_node = p_node;
137321936Shselasky	p_sw->switch_info = *p_si;
138321936Shselasky	p_sw->num_ports = num_ports;
139321936Shselasky	p_sw->need_update = 2;
140321936Shselasky
141321936Shselasky	p_sw->p_prof = malloc(sizeof(*p_sw->p_prof) * num_ports);
142321936Shselasky	if (!p_sw->p_prof)
143321936Shselasky		goto err;
144321936Shselasky
145321936Shselasky	memset(p_sw->p_prof, 0, sizeof(*p_sw->p_prof) * num_ports);
146321936Shselasky
147321936Shselasky	osm_mcast_tbl_init(&p_sw->mcast_tbl, osm_node_get_num_physp(p_node),
148321936Shselasky			   cl_ntoh16(p_si->mcast_cap));
149321936Shselasky
150321936Shselasky	for (port_num = 0; port_num < num_ports; port_num++)
151321936Shselasky		osm_port_prof_construct(&p_sw->p_prof[port_num]);
152321936Shselasky
153321936Shselasky	return p_sw;
154321936Shselasky
155321936Shselaskyerr:
156321936Shselasky	osm_switch_delete(&p_sw);
157321936Shselasky	return NULL;
158321936Shselasky}
159321936Shselasky
160321936Shselaskyboolean_t osm_switch_get_lft_block(IN const osm_switch_t * p_sw,
161321936Shselasky				   IN uint16_t block_id, OUT uint8_t * p_block)
162321936Shselasky{
163321936Shselasky	uint16_t base_lid_ho = block_id * IB_SMP_DATA_SIZE;
164321936Shselasky
165321936Shselasky	CL_ASSERT(p_sw);
166321936Shselasky	CL_ASSERT(p_block);
167321936Shselasky
168321936Shselasky	if (base_lid_ho > p_sw->max_lid_ho)
169321936Shselasky		return FALSE;
170321936Shselasky
171321936Shselasky	CL_ASSERT(base_lid_ho + IB_SMP_DATA_SIZE - 1 <= IB_LID_UCAST_END_HO);
172321936Shselasky	memcpy(p_block, &(p_sw->new_lft[base_lid_ho]), IB_SMP_DATA_SIZE);
173321936Shselasky	return TRUE;
174321936Shselasky}
175321936Shselasky
176321936Shselaskystatic struct osm_remote_node *
177321936Shselaskyswitch_find_guid_common(IN const osm_switch_t * p_sw,
178321936Shselasky			IN struct osm_remote_guids_count *r,
179321936Shselasky			IN uint8_t port_num, IN int find_sys_guid,
180321936Shselasky			IN int find_node_guid)
181321936Shselasky{
182321936Shselasky	struct osm_remote_node *p_remote_guid = NULL;
183321936Shselasky	osm_physp_t *p_physp;
184321936Shselasky	osm_physp_t *p_rem_physp;
185321936Shselasky	osm_node_t *p_rem_node;
186321936Shselasky	uint64_t sys_guid;
187321936Shselasky	uint64_t node_guid;
188321936Shselasky	unsigned int i;
189321936Shselasky
190321936Shselasky	CL_ASSERT(p_sw);
191321936Shselasky
192321936Shselasky	if (!r)
193321936Shselasky		goto out;
194321936Shselasky
195321936Shselasky	p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
196321936Shselasky	if (!p_physp)
197321936Shselasky		goto out;
198321936Shselasky
199321936Shselasky	p_rem_physp = osm_physp_get_remote(p_physp);
200321936Shselasky	p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
201321936Shselasky	sys_guid = p_rem_node->node_info.sys_guid;
202321936Shselasky	node_guid = p_rem_node->node_info.node_guid;
203321936Shselasky
204321936Shselasky	for (i = 0; i < r->count; i++) {
205321936Shselasky		if ((!find_sys_guid
206321936Shselasky		     || r->guids[i].node->node_info.sys_guid == sys_guid)
207321936Shselasky		    && (!find_node_guid
208321936Shselasky			|| r->guids[i].node->node_info.node_guid == node_guid)) {
209321936Shselasky			p_remote_guid = &r->guids[i];
210321936Shselasky			break;
211321936Shselasky		}
212321936Shselasky	}
213321936Shselasky
214321936Shselaskyout:
215321936Shselasky	return p_remote_guid;
216321936Shselasky}
217321936Shselasky
218321936Shselaskystatic struct osm_remote_node *
219321936Shselaskyswitch_find_sys_guid_count(IN const osm_switch_t * p_sw,
220321936Shselasky			   IN struct osm_remote_guids_count *r,
221321936Shselasky			   IN uint8_t port_num)
222321936Shselasky{
223321936Shselasky	return switch_find_guid_common(p_sw, r, port_num, 1, 0);
224321936Shselasky}
225321936Shselasky
226321936Shselaskystatic struct osm_remote_node *
227321936Shselaskyswitch_find_node_guid_count(IN const osm_switch_t * p_sw,
228321936Shselasky			    IN struct osm_remote_guids_count *r,
229321936Shselasky			    IN uint8_t port_num)
230321936Shselasky{
231321936Shselasky	return switch_find_guid_common(p_sw, r, port_num, 0, 1);
232321936Shselasky}
233321936Shselasky
234321936Shselaskyuint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
235321936Shselasky				  IN osm_port_t * p_port, IN uint16_t lid_ho,
236321936Shselasky				  IN unsigned start_from,
237321936Shselasky				  IN boolean_t ignore_existing,
238321936Shselasky				  IN boolean_t routing_for_lmc,
239321936Shselasky				  IN boolean_t dor,
240321936Shselasky				  IN boolean_t port_shifting,
241321936Shselasky				  IN uint32_t scatter_ports,
242321936Shselasky				  IN osm_lft_type_enum lft_enum)
243321936Shselasky{
244321936Shselasky	/*
245321936Shselasky	   We support an enhanced LMC aware routing mode:
246321936Shselasky	   In the case of LMC > 0, we can track the remote side
247321936Shselasky	   system and node for all of the lids of the target
248321936Shselasky	   and try and avoid routing again through the same
249321936Shselasky	   system / node.
250321936Shselasky
251321936Shselasky	   Assume if routing_for_lmc is true that this procedure was
252321936Shselasky	   provided the tracking array and counter via p_port->priv,
253321936Shselasky	   and we can conduct this algorithm.
254321936Shselasky	 */
255321936Shselasky	uint16_t base_lid;
256321936Shselasky	uint8_t hops;
257321936Shselasky	uint8_t least_hops;
258321936Shselasky	uint8_t port_num;
259321936Shselasky	uint8_t num_ports;
260321936Shselasky	uint32_t least_paths = 0xFFFFFFFF;
261321936Shselasky	unsigned i;
262321936Shselasky	/*
263321936Shselasky	   The following will track the least paths if the
264321936Shselasky	   route should go through a new system/node
265321936Shselasky	 */
266321936Shselasky	uint32_t least_paths_other_sys = 0xFFFFFFFF;
267321936Shselasky	uint32_t least_paths_other_nodes = 0xFFFFFFFF;
268321936Shselasky	uint32_t least_forwarded_to = 0xFFFFFFFF;
269321936Shselasky	uint32_t check_count;
270321936Shselasky	uint8_t best_port = 0;
271321936Shselasky	/*
272321936Shselasky	   These vars track the best port if it connects to
273321936Shselasky	   not used system/node.
274321936Shselasky	 */
275321936Shselasky	uint8_t best_port_other_sys = 0;
276321936Shselasky	uint8_t best_port_other_node = 0;
277321936Shselasky	boolean_t port_found = FALSE;
278321936Shselasky	osm_physp_t *p_physp;
279321936Shselasky	osm_physp_t *p_rem_physp;
280321936Shselasky	osm_node_t *p_rem_node;
281321936Shselasky	osm_node_t *p_rem_node_first = NULL;
282321936Shselasky	struct osm_remote_node *p_remote_guid = NULL;
283321936Shselasky	struct osm_remote_node null_remote_node = {NULL, 0, 0};
284321936Shselasky	struct switch_port_path port_paths[IB_NODE_NUM_PORTS_MAX];
285321936Shselasky	unsigned int port_paths_total_paths = 0;
286321936Shselasky	unsigned int port_paths_count = 0;
287321936Shselasky	uint8_t scatter_possible_ports[IB_NODE_NUM_PORTS_MAX];
288321936Shselasky	unsigned int scatter_possible_ports_count = 0;
289321936Shselasky	int found_sys_guid = 0;
290321936Shselasky	int found_node_guid = 0;
291321936Shselasky
292321936Shselasky	CL_ASSERT(lid_ho > 0);
293321936Shselasky
294321936Shselasky	if (p_port->p_node->sw) {
295321936Shselasky		if (p_port->p_node->sw == p_sw)
296321936Shselasky			return 0;
297321936Shselasky		base_lid = osm_port_get_base_lid(p_port);
298321936Shselasky	} else {
299321936Shselasky		p_physp = p_port->p_physp;
300321936Shselasky		if (!p_physp || !p_physp->p_remote_physp ||
301321936Shselasky		    !p_physp->p_remote_physp->p_node->sw)
302321936Shselasky			return OSM_NO_PATH;
303321936Shselasky
304321936Shselasky		if (p_physp->p_remote_physp->p_node->sw == p_sw)
305321936Shselasky			return p_physp->p_remote_physp->port_num;
306321936Shselasky		base_lid =
307321936Shselasky		    osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
308321936Shselasky	}
309321936Shselasky	base_lid = cl_ntoh16(base_lid);
310321936Shselasky
311321936Shselasky	num_ports = p_sw->num_ports;
312321936Shselasky
313321936Shselasky	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
314321936Shselasky	if (least_hops == OSM_NO_PATH)
315321936Shselasky		return OSM_NO_PATH;
316321936Shselasky
317321936Shselasky	/*
318321936Shselasky	   First, inquire with the forwarding table for an existing
319321936Shselasky	   route.  If one is found, honor it unless:
320321936Shselasky	   1. the ignore existing flag is set.
321321936Shselasky	   2. the physical port is not a valid one or not healthy
322321936Shselasky	   3. the physical port has a remote port (the link is up)
323321936Shselasky	   4. the port has min-hops to the target (avoid loops)
324321936Shselasky	 */
325321936Shselasky	if (!ignore_existing) {
326321936Shselasky		port_num = osm_switch_get_port_by_lid(p_sw, lid_ho, lft_enum);
327321936Shselasky
328321936Shselasky		if (port_num != OSM_NO_PATH) {
329321936Shselasky			CL_ASSERT(port_num < num_ports);
330321936Shselasky
331321936Shselasky			p_physp =
332321936Shselasky			    osm_node_get_physp_ptr(p_sw->p_node, port_num);
333321936Shselasky			/*
334321936Shselasky			   Don't be too trusting of the current forwarding table!
335321936Shselasky			   Verify that the port number is legal and that the
336321936Shselasky			   LID is reachable through this port.
337321936Shselasky			 */
338321936Shselasky			if (p_physp && osm_physp_is_healthy(p_physp) &&
339321936Shselasky			    osm_physp_get_remote(p_physp)) {
340321936Shselasky				hops =
341321936Shselasky				    osm_switch_get_hop_count(p_sw, base_lid,
342321936Shselasky							     port_num);
343321936Shselasky				/*
344321936Shselasky				   If we aren't using pre-defined user routes
345321936Shselasky				   function, then we need to make sure that the
346321936Shselasky				   current path is the minimum one. In case of
347321936Shselasky				   having such a user function - this check will
348321936Shselasky				   not be done, and the old routing will be used.
349321936Shselasky				   Note: This means that it is the user's job to
350321936Shselasky				   clean all data in the forwarding tables that
351321936Shselasky				   he wants to be overridden by the minimum
352321936Shselasky				   hop function.
353321936Shselasky				 */
354321936Shselasky				if (hops == least_hops)
355321936Shselasky					return port_num;
356321936Shselasky			}
357321936Shselasky		}
358321936Shselasky	}
359321936Shselasky
360321936Shselasky	/*
361321936Shselasky	   This algorithm selects a port based on a static load balanced
362321936Shselasky	   selection across equal hop-count ports.
363321936Shselasky	   There is lots of room for improved sophistication here,
364321936Shselasky	   possibly guided by user configuration info.
365321936Shselasky	 */
366321936Shselasky
367321936Shselasky	/*
368321936Shselasky	   OpenSM routing is "local" - not considering a full lid to lid
369321936Shselasky	   path. As such we can not guarantee a path will not loop if we
370321936Shselasky	   do not always follow least hops.
371321936Shselasky	   So we must abort if not least hops.
372321936Shselasky	 */
373321936Shselasky
374321936Shselasky	/* port number starts with one and num_ports is 1 + num phys ports */
375321936Shselasky	for (i = start_from; i < start_from + num_ports; i++) {
376321936Shselasky		port_num = osm_switch_get_dimn_port(p_sw, i % num_ports);
377321936Shselasky		if (!port_num ||
378321936Shselasky		    osm_switch_get_hop_count(p_sw, base_lid, port_num) !=
379321936Shselasky		    least_hops)
380321936Shselasky			continue;
381321936Shselasky
382321936Shselasky		/* let us make sure it is not down or unhealthy */
383321936Shselasky		p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
384321936Shselasky		if (!p_physp || !osm_physp_is_healthy(p_physp) ||
385321936Shselasky		    /*
386321936Shselasky		       we require all - non sma ports to be linked
387321936Shselasky		       to be routed through
388321936Shselasky		     */
389321936Shselasky		    !osm_physp_get_remote(p_physp))
390321936Shselasky			continue;
391321936Shselasky
392321936Shselasky		/*
393321936Shselasky		   We located a least-hop port, possibly one of many.
394321936Shselasky		   For this port, check the running total count of
395321936Shselasky		   the number of paths through this port.  Select
396321936Shselasky		   the port routing the least number of paths.
397321936Shselasky		 */
398321936Shselasky		check_count =
399321936Shselasky		    osm_port_prof_path_count_get(&p_sw->p_prof[port_num]);
400321936Shselasky
401321936Shselasky
402321936Shselasky		if (dor) {
403321936Shselasky			/* Get the Remote Node */
404321936Shselasky			p_rem_physp = osm_physp_get_remote(p_physp);
405321936Shselasky			p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
406321936Shselasky			/* use the first dimension, but spread traffic
407321936Shselasky			 * out among the group of ports representing
408321936Shselasky			 * that dimension */
409321936Shselasky			if (!p_rem_node_first)
410321936Shselasky				p_rem_node_first = p_rem_node;
411321936Shselasky			else if (p_rem_node != p_rem_node_first)
412321936Shselasky				continue;
413321936Shselasky			if (routing_for_lmc) {
414321936Shselasky				struct osm_remote_guids_count *r = p_port->priv;
415321936Shselasky				uint8_t rem_port = osm_physp_get_port_num(p_rem_physp);
416321936Shselasky				unsigned int j;
417321936Shselasky
418321936Shselasky				for (j = 0; j < r->count; j++) {
419321936Shselasky					p_remote_guid = &r->guids[j];
420321936Shselasky					if ((p_remote_guid->node == p_rem_node)
421321936Shselasky					    && (p_remote_guid->port == rem_port))
422321936Shselasky						break;
423321936Shselasky				}
424321936Shselasky				if (j == r->count)
425321936Shselasky					p_remote_guid = &null_remote_node;
426321936Shselasky			}
427321936Shselasky		/*
428321936Shselasky		   Advanced LMC routing requires tracking of the
429321936Shselasky		   best port by the node connected to the other side of
430321936Shselasky		   it.
431321936Shselasky		 */
432321936Shselasky		} else if (routing_for_lmc) {
433321936Shselasky			/* Is the sys guid already used ? */
434321936Shselasky			p_remote_guid = switch_find_sys_guid_count(p_sw,
435321936Shselasky								   p_port->priv,
436321936Shselasky								   port_num);
437321936Shselasky
438321936Shselasky			/* If not update the least hops for this case */
439321936Shselasky			if (!p_remote_guid) {
440321936Shselasky				if (check_count < least_paths_other_sys) {
441321936Shselasky					least_paths_other_sys = check_count;
442321936Shselasky					best_port_other_sys = port_num;
443321936Shselasky					least_forwarded_to = 0;
444321936Shselasky				}
445321936Shselasky				found_sys_guid = 0;
446321936Shselasky			} else {	/* same sys found - try node */
447321936Shselasky
448321936Shselasky
449321936Shselasky				/* Else is the node guid already used ? */
450321936Shselasky				p_remote_guid = switch_find_node_guid_count(p_sw,
451321936Shselasky									    p_port->priv,
452321936Shselasky									    port_num);
453321936Shselasky
454321936Shselasky				/* If not update the least hops for this case */
455321936Shselasky				if (!p_remote_guid
456321936Shselasky				    && check_count < least_paths_other_nodes) {
457321936Shselasky					least_paths_other_nodes = check_count;
458321936Shselasky					best_port_other_node = port_num;
459321936Shselasky					least_forwarded_to = 0;
460321936Shselasky				}
461321936Shselasky				/* else prior sys and node guid already used */
462321936Shselasky
463321936Shselasky				if (!p_remote_guid)
464321936Shselasky					found_node_guid = 0;
465321936Shselasky				else
466321936Shselasky					found_node_guid = 1;
467321936Shselasky				found_sys_guid = 1;
468321936Shselasky			}	/* same sys found */
469321936Shselasky		}
470321936Shselasky
471321936Shselasky		port_paths[port_paths_count].port_num = port_num;
472321936Shselasky		port_paths[port_paths_count].path_count = check_count;
473321936Shselasky		if (routing_for_lmc) {
474321936Shselasky			port_paths[port_paths_count].found_sys_guid = found_sys_guid;
475321936Shselasky			port_paths[port_paths_count].found_node_guid = found_node_guid;
476321936Shselasky		}
477321936Shselasky		if (routing_for_lmc && p_remote_guid)
478321936Shselasky			port_paths[port_paths_count].forwarded_to = p_remote_guid->forwarded_to;
479321936Shselasky		else
480321936Shselasky			port_paths[port_paths_count].forwarded_to = 0;
481321936Shselasky		port_paths_total_paths += check_count;
482321936Shselasky		port_paths_count++;
483321936Shselasky
484321936Shselasky		/* routing for LMC mode */
485321936Shselasky		/*
486321936Shselasky		   the count is min but also lower then the max subscribed
487321936Shselasky		 */
488321936Shselasky		if (check_count < least_paths) {
489321936Shselasky			port_found = TRUE;
490321936Shselasky			best_port = port_num;
491321936Shselasky			least_paths = check_count;
492321936Shselasky			scatter_possible_ports_count = 0;
493321936Shselasky			scatter_possible_ports[scatter_possible_ports_count++] = port_num;
494321936Shselasky			if (routing_for_lmc
495321936Shselasky			    && p_remote_guid
496321936Shselasky			    && p_remote_guid->forwarded_to < least_forwarded_to)
497321936Shselasky				least_forwarded_to = p_remote_guid->forwarded_to;
498321936Shselasky		} else if (scatter_ports
499321936Shselasky			   && check_count == least_paths) {
500321936Shselasky			scatter_possible_ports[scatter_possible_ports_count++] = port_num;
501321936Shselasky		} else if (routing_for_lmc
502321936Shselasky			   && p_remote_guid
503321936Shselasky			   && check_count == least_paths
504321936Shselasky			   && p_remote_guid->forwarded_to < least_forwarded_to) {
505321936Shselasky			least_forwarded_to = p_remote_guid->forwarded_to;
506321936Shselasky			best_port = port_num;
507321936Shselasky		}
508321936Shselasky	}
509321936Shselasky
510321936Shselasky	if (port_found == FALSE)
511321936Shselasky		return OSM_NO_PATH;
512321936Shselasky
513321936Shselasky	if (port_shifting && port_paths_count) {
514321936Shselasky		/* In the port_paths[] array, we now have all the ports that we
515321936Shselasky		 * can route out of.  Using some shifting math below, possibly
516321936Shselasky		 * select a different one so that lids won't align in LFTs
517321936Shselasky		 *
518321936Shselasky		 * If lmc > 0, we need to loop through these ports to find the
519321936Shselasky		 * least_forwarded_to port, best_port_other_sys, and
520321936Shselasky		 * best_port_other_node just like before but through the different
521321936Shselasky		 * ordering.
522321936Shselasky		 */
523321936Shselasky
524321936Shselasky		least_paths = 0xFFFFFFFF;
525321936Shselasky		least_paths_other_sys = 0xFFFFFFFF;
526321936Shselasky		least_paths_other_nodes = 0xFFFFFFFF;
527321936Shselasky	        least_forwarded_to = 0xFFFFFFFF;
528321936Shselasky		best_port = 0;
529321936Shselasky		best_port_other_sys = 0;
530321936Shselasky		best_port_other_node = 0;
531321936Shselasky
532321936Shselasky		for (i = 0; i < port_paths_count; i++) {
533321936Shselasky			unsigned int idx;
534321936Shselasky
535321936Shselasky			idx = (port_paths_total_paths/port_paths_count + i) % port_paths_count;
536321936Shselasky
537321936Shselasky			if (routing_for_lmc) {
538321936Shselasky				if (!port_paths[idx].found_sys_guid
539321936Shselasky				    && port_paths[idx].path_count < least_paths_other_sys) {
540321936Shselasky					least_paths_other_sys = port_paths[idx].path_count;
541321936Shselasky					best_port_other_sys = port_paths[idx].port_num;
542321936Shselasky					least_forwarded_to = 0;
543321936Shselasky				}
544321936Shselasky				else if (!port_paths[idx].found_node_guid
545321936Shselasky					 && port_paths[idx].path_count < least_paths_other_nodes) {
546321936Shselasky					least_paths_other_nodes = port_paths[idx].path_count;
547321936Shselasky					best_port_other_node = port_paths[idx].port_num;
548321936Shselasky					least_forwarded_to = 0;
549321936Shselasky				}
550321936Shselasky			}
551321936Shselasky
552321936Shselasky			if (port_paths[idx].path_count < least_paths) {
553321936Shselasky				best_port = port_paths[idx].port_num;
554321936Shselasky				least_paths = port_paths[idx].path_count;
555321936Shselasky				if (routing_for_lmc
556321936Shselasky				    && (port_paths[idx].found_sys_guid
557321936Shselasky					|| port_paths[idx].found_node_guid)
558321936Shselasky				    && port_paths[idx].forwarded_to < least_forwarded_to)
559321936Shselasky					least_forwarded_to = port_paths[idx].forwarded_to;
560321936Shselasky			}
561321936Shselasky			else if (routing_for_lmc
562321936Shselasky				 && (port_paths[idx].found_sys_guid
563321936Shselasky				     || port_paths[idx].found_node_guid)
564321936Shselasky				 && port_paths[idx].path_count == least_paths
565321936Shselasky				 && port_paths[idx].forwarded_to < least_forwarded_to) {
566321936Shselasky				least_forwarded_to = port_paths[idx].forwarded_to;
567321936Shselasky				best_port = port_paths[idx].port_num;
568321936Shselasky			}
569321936Shselasky
570321936Shselasky		}
571321936Shselasky	}
572321936Shselasky
573321936Shselasky	/*
574321936Shselasky	   if we are in enhanced routing mode and the best port is not
575321936Shselasky	   the local port 0
576321936Shselasky	 */
577321936Shselasky	if (routing_for_lmc && best_port && !scatter_ports) {
578321936Shselasky		/* Select the least hop port of the non used sys first */
579321936Shselasky		if (best_port_other_sys)
580321936Shselasky			best_port = best_port_other_sys;
581321936Shselasky		else if (best_port_other_node)
582321936Shselasky			best_port = best_port_other_node;
583321936Shselasky	} else if (scatter_ports) {
584321936Shselasky		/*
585321936Shselasky		 * There is some danger that this random could "rebalance" the routes
586321936Shselasky		 * every time, to combat this there is a global srandom that
587321936Shselasky		 * occurs at the start of every sweep.
588321936Shselasky		 */
589321936Shselasky		unsigned int idx = random() % scatter_possible_ports_count;
590321936Shselasky		best_port = scatter_possible_ports[idx];
591321936Shselasky	}
592321936Shselasky	return best_port;
593321936Shselasky}
594321936Shselasky
595321936Shselaskyvoid osm_switch_clear_hops(IN osm_switch_t * p_sw)
596321936Shselasky{
597321936Shselasky	unsigned i;
598321936Shselasky
599321936Shselasky	for (i = 0; i < p_sw->num_hops; i++)
600321936Shselasky		if (p_sw->hops[i])
601321936Shselasky			memset(p_sw->hops[i], OSM_NO_PATH, p_sw->num_ports);
602321936Shselasky}
603321936Shselasky
604321936Shselaskystatic int alloc_lft(IN osm_switch_t * p_sw, uint16_t lids)
605321936Shselasky{
606321936Shselasky	uint16_t lft_size;
607321936Shselasky
608321936Shselasky	/* Ensure LFT is in units of LFT block size */
609321936Shselasky	lft_size = (lids / IB_SMP_DATA_SIZE + 1) * IB_SMP_DATA_SIZE;
610321936Shselasky	if (lft_size > p_sw->lft_size) {
611321936Shselasky		uint8_t *new_lft = realloc(p_sw->lft, lft_size);
612321936Shselasky		if (!new_lft)
613321936Shselasky			return -1;
614321936Shselasky		memset(new_lft + p_sw->lft_size, OSM_NO_PATH,
615321936Shselasky		       lft_size - p_sw->lft_size);
616321936Shselasky		p_sw->lft = new_lft;
617321936Shselasky		p_sw->lft_size = lft_size;
618321936Shselasky	}
619321936Shselasky
620321936Shselasky	return 0;
621321936Shselasky}
622321936Shselasky
623321936Shselaskyint osm_switch_prepare_path_rebuild(IN osm_switch_t * p_sw, IN uint16_t max_lids)
624321936Shselasky{
625321936Shselasky	uint8_t **hops;
626321936Shselasky	uint8_t *new_lft;
627321936Shselasky	unsigned i;
628321936Shselasky
629321936Shselasky	if (alloc_lft(p_sw, max_lids))
630321936Shselasky		return -1;
631321936Shselasky
632321936Shselasky	for (i = 0; i < p_sw->num_ports; i++)
633321936Shselasky		osm_port_prof_construct(&p_sw->p_prof[i]);
634321936Shselasky
635321936Shselasky	osm_switch_clear_hops(p_sw);
636321936Shselasky
637321936Shselasky	if (!(new_lft = realloc(p_sw->new_lft, p_sw->lft_size)))
638321936Shselasky		return -1;
639321936Shselasky
640321936Shselasky	p_sw->new_lft = new_lft;
641321936Shselasky
642321936Shselasky	memset(p_sw->new_lft, OSM_NO_PATH, p_sw->lft_size);
643321936Shselasky
644321936Shselasky	if (!p_sw->hops) {
645321936Shselasky		hops = malloc((max_lids + 1) * sizeof(hops[0]));
646321936Shselasky		if (!hops)
647321936Shselasky			return -1;
648321936Shselasky		memset(hops, 0, (max_lids + 1) * sizeof(hops[0]));
649321936Shselasky		p_sw->hops = hops;
650321936Shselasky		p_sw->num_hops = max_lids + 1;
651321936Shselasky	} else if (max_lids + 1 > p_sw->num_hops) {
652321936Shselasky		hops = realloc(p_sw->hops, (max_lids + 1) * sizeof(hops[0]));
653321936Shselasky		if (!hops)
654321936Shselasky			return -1;
655321936Shselasky		memset(hops + p_sw->num_hops, 0,
656321936Shselasky		       (max_lids + 1 - p_sw->num_hops) * sizeof(hops[0]));
657321936Shselasky		p_sw->hops = hops;
658321936Shselasky		p_sw->num_hops = max_lids + 1;
659321936Shselasky	}
660321936Shselasky	p_sw->max_lid_ho = max_lids;
661321936Shselasky
662321936Shselasky	return 0;
663321936Shselasky}
664321936Shselasky
665321936Shselaskyuint8_t osm_switch_get_port_least_hops(IN const osm_switch_t * p_sw,
666321936Shselasky				       IN const osm_port_t * p_port)
667321936Shselasky{
668321936Shselasky	uint16_t lid;
669321936Shselasky
670321936Shselasky	if (p_port->p_node->sw) {
671321936Shselasky		if (p_port->p_node->sw == p_sw)
672321936Shselasky			return 0;
673321936Shselasky		lid = osm_node_get_base_lid(p_port->p_node, 0);
674321936Shselasky		return osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
675321936Shselasky	} else {
676321936Shselasky		osm_physp_t *p = p_port->p_physp;
677321936Shselasky		uint8_t hops;
678321936Shselasky
679321936Shselasky		if (!p || !p->p_remote_physp || !p->p_remote_physp->p_node->sw)
680321936Shselasky			return OSM_NO_PATH;
681321936Shselasky		if (p->p_remote_physp->p_node->sw == p_sw)
682321936Shselasky			return 1;
683321936Shselasky		lid = osm_node_get_base_lid(p->p_remote_physp->p_node, 0);
684321936Shselasky		hops = osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
685321936Shselasky		return hops != OSM_NO_PATH ? hops + 1 : OSM_NO_PATH;
686321936Shselasky	}
687321936Shselasky}
688321936Shselasky
689321936Shselaskyuint8_t osm_switch_recommend_mcast_path(IN osm_switch_t * p_sw,
690321936Shselasky					IN osm_port_t * p_port,
691321936Shselasky					IN uint16_t mlid_ho,
692321936Shselasky					IN boolean_t ignore_existing)
693321936Shselasky{
694321936Shselasky	uint16_t base_lid;
695321936Shselasky	uint8_t hops;
696321936Shselasky	uint8_t port_num;
697321936Shselasky	uint8_t num_ports;
698321936Shselasky	uint8_t least_hops;
699321936Shselasky
700321936Shselasky	CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);
701321936Shselasky
702321936Shselasky	if (p_port->p_node->sw) {
703321936Shselasky		if (p_port->p_node->sw == p_sw)
704321936Shselasky			return 0;
705321936Shselasky		base_lid = osm_port_get_base_lid(p_port);
706321936Shselasky	} else {
707321936Shselasky		osm_physp_t *p_physp = p_port->p_physp;
708321936Shselasky		if (!p_physp || !p_physp->p_remote_physp ||
709321936Shselasky		    !p_physp->p_remote_physp->p_node->sw)
710321936Shselasky			return OSM_NO_PATH;
711321936Shselasky		if (p_physp->p_remote_physp->p_node->sw == p_sw)
712321936Shselasky			return p_physp->p_remote_physp->port_num;
713321936Shselasky		base_lid =
714321936Shselasky		    osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
715321936Shselasky	}
716321936Shselasky	base_lid = cl_ntoh16(base_lid);
717321936Shselasky	num_ports = p_sw->num_ports;
718321936Shselasky
719321936Shselasky	/*
720321936Shselasky	   If the user wants us to ignore existing multicast routes,
721321936Shselasky	   then simply return the shortest hop count path to the
722321936Shselasky	   target port.
723321936Shselasky
724321936Shselasky	   Otherwise, return the first port that has a path to the target,
725321936Shselasky	   picking from the ports that are already in the multicast group.
726321936Shselasky	 */
727321936Shselasky	if (!ignore_existing) {
728321936Shselasky		for (port_num = 1; port_num < num_ports; port_num++) {
729321936Shselasky			if (!osm_mcast_tbl_is_port
730321936Shselasky			    (&p_sw->mcast_tbl, mlid_ho, port_num))
731321936Shselasky				continue;
732321936Shselasky			/*
733321936Shselasky			   Don't be too trusting of the current forwarding table!
734321936Shselasky			   Verify that the LID is reachable through this port.
735321936Shselasky			 */
736321936Shselasky			hops =
737321936Shselasky			    osm_switch_get_hop_count(p_sw, base_lid, port_num);
738321936Shselasky			if (hops != OSM_NO_PATH)
739321936Shselasky				return port_num;
740321936Shselasky		}
741321936Shselasky	}
742321936Shselasky
743321936Shselasky	/*
744321936Shselasky	   Either no existing mcast paths reach this port or we are
745321936Shselasky	   ignoring existing paths.
746321936Shselasky
747321936Shselasky	   Determine the best multicast path to the target.  Note that this
748321936Shselasky	   algorithm is slightly different from the one used for unicast route
749321936Shselasky	   recommendation.  In this case (multicast), we must NOT
750321936Shselasky	   perform any sort of load balancing.  We MUST take the FIRST
751321936Shselasky	   port found that has <= the lowest hop count path.  This prevents
752321936Shselasky	   more than one multicast path to the same remote switch which
753321936Shselasky	   prevents a multicast loop.  Multicast loops are bad since the same
754321936Shselasky	   multicast packet will go around and around, inevitably creating
755321936Shselasky	   a black hole that will destroy the Earth in a firey conflagration.
756321936Shselasky	 */
757321936Shselasky	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
758321936Shselasky	if (least_hops == OSM_NO_PATH)
759321936Shselasky		return OSM_NO_PATH;
760321936Shselasky	for (port_num = 1; port_num < num_ports; port_num++)
761321936Shselasky		if (osm_switch_get_hop_count(p_sw, base_lid, port_num) ==
762321936Shselasky		    least_hops)
763321936Shselasky			break;
764321936Shselasky
765321936Shselasky	CL_ASSERT(port_num < num_ports);
766321936Shselasky	return port_num;
767321936Shselasky}
768