1/*
2 * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2007 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5 *
6 * This software is available to you under a choice of one of two
7 * licenses.  You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
11 *
12 *     Redistribution and use in source and binary forms, with or
13 *     without modification, are permitted provided that the following
14 *     conditions are met:
15 *
16 *      - Redistributions of source code must retain the above
17 *        copyright notice, this list of conditions and the following
18 *        disclaimer.
19 *
20 *      - Redistributions in binary form must reproduce the above
21 *        copyright notice, this list of conditions and the following
22 *        disclaimer in the documentation and/or other materials
23 *        provided with the distribution.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 * SOFTWARE.
33 *
34 */
35
36/*
37 * Abstract:
38 *    Implementation of OpenSM FatTree routing
39 */
40
41#if HAVE_CONFIG_H
42#  include <config.h>
43#endif
44
45#include <stdlib.h>
46#include <string.h>
47#include <ctype.h>
48#include <errno.h>
49#include <iba/ib_types.h>
50#include <complib/cl_qmap.h>
51#include <complib/cl_debug.h>
52#include <opensm/osm_opensm.h>
53#include <opensm/osm_switch.h>
54
55/*
56 * FatTree rank is bounded between 2 and 8:
57 *  - Tree of rank 1 has only trivial routing paths,
58 *    so no need to use FatTree routing.
59 *  - Why maximum rank is 8:
60 *    Each node (switch) is assigned a unique tuple.
61 *    Switches are stored in two cl_qmaps - one is
62 *    ordered by guid, and the other by a key that is
63 *    generated from tuple. Since cl_qmap supports only
 *    a 64-bit key, the maximal tuple length is 8 bytes,
 *    which means that maximal tree rank is 8.
66 * Note that the above also implies that each switch
67 * can have at max 255 up/down ports.
68 */
69
/* Lowest tree rank handled here; a rank-1 tree has only trivial paths */
#define FAT_TREE_MIN_RANK 2
/* Highest tree rank: a tuple must fit into a 64-bit cl_qmap key (8 bytes) */
#define FAT_TREE_MAX_RANK 8
72
/*
 * Direction tag handed to the port group helpers, e.g. to choose between
 * the up and the down port groups of a switch.
 */
typedef enum {
	FTREE_DIRECTION_DOWN = -1,
	FTREE_DIRECTION_SAME,	/* implicitly 0 */
	FTREE_DIRECTION_UP	/* implicitly 1 */
} ftree_direction_t;
78
79/***************************************************
80 **
81 **  Forward references
82 **
83 ***************************************************/
84
85struct ftree_sw_t_;
86struct ftree_hca_t_;
87struct ftree_port_t_;
88struct ftree_port_group_t_;
89struct ftree_fabric_t_;
90
91/***************************************************
92 **
93 **  ftree_tuple_t definition
94 **
95 ***************************************************/
96
/* Size of each static string buffer used when printing a tuple */
#define FTREE_TUPLE_BUFF_LEN 1024
/* Number of one-byte elements in a tuple */
#define FTREE_TUPLE_LEN 8

/* Fixed-size byte array; an element equal to 0xFF is treated as unused */
typedef uint8_t ftree_tuple_t[FTREE_TUPLE_LEN];
/* The bytes of a tuple copied into a 64-bit integer (used as a cl_qmap key) */
typedef uint64_t ftree_tuple_key_t;
102
/* List node that carries a single GUID */
struct guid_list_item {
	cl_list_item_t list;	/* list linkage */
	uint64_t guid;		/* the GUID stored in this node */
};
107
108/***************************************************
109 **
110 **  ftree_sw_table_element_t definition
111 **
112 ***************************************************/
113
/*
 * Map element that refers to a switch.
 * Destroying the element frees only the element itself, not the switch.
 */
typedef struct {
	cl_map_item_t map_item;	/* map linkage - kept as the first member */
	struct ftree_sw_t_ *p_sw;	/* the switch this element refers to */
} ftree_sw_tbl_element_t;
118
119/***************************************************
120 **
121 **  ftree_port_t definition
122 **
123 ***************************************************/
124
/* A single physical port that belongs to a port group */
typedef struct ftree_port_t_ {
	cl_map_item_t map_item;	/* map linkage - kept as the first member */
	uint8_t port_num;	/* port number on the current node */
	uint8_t remote_port_num;	/* port number on the remote node */
	uint32_t counter_up;	/* number of allocated routes upwards */
	uint32_t counter_down;	/* number of allocated routes downwards */
} ftree_port_t;
132
133/***************************************************
134 **
135 **  ftree_port_group_t definition
136 **
137 ***************************************************/
138
/*
 * Pointer to either an HCA or a switch object.
 * Which member is valid is recorded separately by the owner
 * (see the node_type / remote_node_type fields of a port group).
 */
typedef union ftree_hca_or_sw_ {
	struct ftree_hca_t_ *p_hca;
	struct ftree_sw_t_ *p_sw;
} ftree_hca_or_sw;
143
/*
 * A group of ports of one node that all lead to the same remote LID.
 * The group owns the ftree_port_t objects stored in the 'ports' vector.
 */
typedef struct ftree_port_group_t_ {
	cl_map_item_t map_item;	/* map linkage - kept as the first member */
	ib_net16_t base_lid;	/* base lid of the current node */
	ib_net16_t remote_base_lid;	/* base lid of the remote node */
	ib_net64_t port_guid;	/* port guid of this port */
	ib_net64_t node_guid;	/* this node's guid */
	uint8_t node_type;	/* this node's type */
	ib_net64_t remote_port_guid;	/* port guid of the remote port */
	ib_net64_t remote_node_guid;	/* node guid of the remote node */
	uint8_t remote_node_type;	/* IB_NODE_TYPE_{CA,SWITCH,ROUTER,...} */
	ftree_hca_or_sw hca_or_sw;	/* pointer to this hca/switch */
	ftree_hca_or_sw remote_hca_or_sw;	/* pointer to remote hca/switch */
	cl_ptr_vector_t ports;	/* vector of ports to the same lid */
	boolean_t is_cn;	/* whether this port is a compute node */
	uint32_t counter_down;	/* number of allocated routes downwards */
} ftree_port_group_t;
160
161/***************************************************
162 **
163 **  ftree_sw_t definition
164 **
165 ***************************************************/
166
/* FatTree view of a single switch */
typedef struct ftree_sw_t_ {
	cl_map_item_t map_item;	/* map linkage - kept as the first member */
	osm_switch_t *p_osm_sw;	/* the underlying OpenSM switch object */
	uint32_t rank;		/* 0xFFFFFFFF while the switch is not ranked */
	ftree_tuple_t tuple;	/* all 0xFF bytes until a tuple is assigned */
	ib_net16_t base_lid;	/* base lid of the switch (port 0) */
	ftree_port_group_t **down_port_groups;	/* array of owned groups */
	uint8_t down_port_groups_num;	/* used entries in down_port_groups */
	ftree_port_group_t **up_port_groups;	/* array of owned groups */
	uint8_t up_port_groups_num;	/* used entries in up_port_groups */
	boolean_t is_leaf;
	int down_port_groups_idx;	/* initialized to -1 on creation */
} ftree_sw_t;
180
181/***************************************************
182 **
183 **  ftree_hca_t definition
184 **
185 ***************************************************/
186
/* FatTree view of a single channel adapter (CA) node */
typedef struct ftree_hca_t_ {
	cl_map_item_t map_item;	/* map linkage - kept as the first member */
	osm_node_t *p_osm_node;	/* the underlying OpenSM node object */
	ftree_port_group_t **up_port_groups;	/* array of owned groups */
	uint16_t up_port_groups_num;	/* used entries in up_port_groups */
	unsigned cn_num;
} ftree_hca_t;
194
195/***************************************************
196 **
197 **  ftree_fabric_t definition
198 **
199 ***************************************************/
200
/* Top level FatTree routing state */
typedef struct ftree_fabric_t_ {
	osm_opensm_t *p_osm;	/* owning OpenSM object (used for logging) */
	cl_qmap_t hca_tbl;	/* ftree_hca_t objects, keyed by node guid */
	cl_qmap_t sw_tbl;	/* ftree_sw_t objects, keyed by node guid */
	cl_qmap_t sw_by_tuple_tbl;	/* ftree_sw_tbl_element_t, keyed by tuple key */
	cl_qlist_t root_guid_list;	/* heap-allocated list items */
	cl_qmap_t cn_guid_tbl;	/* heap-allocated name_map_item_t elements */
	unsigned cn_num;
	uint8_t leaf_switch_rank;	/* fabric rank is leaf_switch_rank + 1 */
	uint8_t max_switch_rank;
	ftree_sw_t **leaf_switches;	/* heap-allocated array */
	uint32_t leaf_switches_num;
	uint16_t max_cn_per_leaf;
	uint16_t lft_max_lid_ho;	/* max lid seen in the fabric, host order */
	boolean_t fabric_built;
} ftree_fabric_t;
217
218/***************************************************
219 **
220 ** comparators
221 **
222 ***************************************************/
223
224static int OSM_CDECL __osm_ftree_compare_switches_by_index(IN const void *p1,
225							   IN const void *p2)
226{
227	ftree_sw_t **pp_sw1 = (ftree_sw_t **) p1;
228	ftree_sw_t **pp_sw2 = (ftree_sw_t **) p2;
229
230	uint16_t i;
231	for (i = 0; i < FTREE_TUPLE_LEN; i++) {
232		if ((*pp_sw1)->tuple[i] > (*pp_sw2)->tuple[i])
233			return 1;
234		if ((*pp_sw1)->tuple[i] < (*pp_sw2)->tuple[i])
235			return -1;
236	}
237	return 0;
238}
239
240/***************************************************/
241
242static int OSM_CDECL
243__osm_ftree_compare_port_groups_by_remote_switch_index(IN const void *p1,
244						       IN const void *p2)
245{
246	ftree_port_group_t **pp_g1 = (ftree_port_group_t **) p1;
247	ftree_port_group_t **pp_g2 = (ftree_port_group_t **) p2;
248
249	return
250	    __osm_ftree_compare_switches_by_index(&
251						  ((*pp_g1)->remote_hca_or_sw.
252						   p_sw),
253						  &((*pp_g2)->remote_hca_or_sw.
254						    p_sw));
255}
256
257/***************************************************
258 **
259 ** ftree_tuple_t functions
260 **
261 ***************************************************/
262
263static void __osm_ftree_tuple_init(IN ftree_tuple_t tuple)
264{
265	memset(tuple, 0xFF, FTREE_TUPLE_LEN);
266}
267
268/***************************************************/
269
270static inline boolean_t __osm_ftree_tuple_assigned(IN ftree_tuple_t tuple)
271{
272	return (tuple[0] != 0xFF);
273}
274
275/***************************************************/
276
277#define FTREE_TUPLE_BUFFERS_NUM 6
278
279static char *__osm_ftree_tuple_to_str(IN ftree_tuple_t tuple)
280{
281	static char buffer[FTREE_TUPLE_BUFFERS_NUM][FTREE_TUPLE_BUFF_LEN];
282	static uint8_t ind = 0;
283	char *ret_buffer;
284	uint32_t i;
285
286	if (!__osm_ftree_tuple_assigned(tuple))
287		return "INDEX.NOT.ASSIGNED";
288
289	buffer[ind][0] = '\0';
290
291	for (i = 0; (i < FTREE_TUPLE_LEN) && (tuple[i] != 0xFF); i++) {
292		if ((strlen(buffer[ind]) + 10) > FTREE_TUPLE_BUFF_LEN)
293			return "INDEX.TOO.LONG";
294		if (i != 0)
295			strcat(buffer[ind], ".");
296		sprintf(&buffer[ind][strlen(buffer[ind])], "%u", tuple[i]);
297	}
298
299	ret_buffer = buffer[ind];
300	ind = (ind + 1) % FTREE_TUPLE_BUFFERS_NUM;
301	return ret_buffer;
302}				/* __osm_ftree_tuple_to_str() */
303
304/***************************************************/
305
306static inline ftree_tuple_key_t __osm_ftree_tuple_to_key(IN ftree_tuple_t tuple)
307{
308	ftree_tuple_key_t key;
309	memcpy(&key, tuple, FTREE_TUPLE_LEN);
310	return key;
311}
312
313/***************************************************/
314
315static inline void __osm_ftree_tuple_from_key(IN ftree_tuple_t tuple,
316					      IN ftree_tuple_key_t key)
317{
318	memcpy(tuple, &key, FTREE_TUPLE_LEN);
319}
320
321/***************************************************
322 **
323 ** ftree_sw_tbl_element_t functions
324 **
325 ***************************************************/
326
327static ftree_sw_tbl_element_t *__osm_ftree_sw_tbl_element_create(IN ftree_sw_t *
328								 p_sw)
329{
330	ftree_sw_tbl_element_t *p_element =
331	    (ftree_sw_tbl_element_t *) malloc(sizeof(ftree_sw_tbl_element_t));
332	if (!p_element)
333		return NULL;
334	memset(p_element, 0, sizeof(ftree_sw_tbl_element_t));
335
336	p_element->p_sw = p_sw;
337	return p_element;
338}
339
340/***************************************************/
341
342static void __osm_ftree_sw_tbl_element_destroy(IN ftree_sw_tbl_element_t *
343					       p_element)
344{
345	if (!p_element)
346		return;
347	free(p_element);
348}
349
350/***************************************************
351 **
352 ** ftree_port_t functions
353 **
354 ***************************************************/
355
356static ftree_port_t *__osm_ftree_port_create(IN uint8_t port_num,
357					     IN uint8_t remote_port_num)
358{
359	ftree_port_t *p_port = (ftree_port_t *) malloc(sizeof(ftree_port_t));
360	if (!p_port)
361		return NULL;
362	memset(p_port, 0, sizeof(ftree_port_t));
363
364	p_port->port_num = port_num;
365	p_port->remote_port_num = remote_port_num;
366
367	return p_port;
368}
369
370/***************************************************/
371
372static void __osm_ftree_port_destroy(IN ftree_port_t * p_port)
373{
374	if (p_port)
375		free(p_port);
376}
377
378/***************************************************
379 **
380 ** ftree_port_group_t functions
381 **
382 ***************************************************/
383
384static ftree_port_group_t *
385__osm_ftree_port_group_create(IN ib_net16_t base_lid,
386			      IN ib_net16_t remote_base_lid,
387			      IN ib_net64_t port_guid,
388			      IN ib_net64_t node_guid,
389			      IN uint8_t node_type,
390		              IN void *p_hca_or_sw,
391			      IN ib_net64_t remote_port_guid,
392			      IN ib_net64_t remote_node_guid,
393			      IN uint8_t remote_node_type,
394			      IN void *p_remote_hca_or_sw,
395			      IN boolean_t is_cn)
396{
397	ftree_port_group_t *p_group =
398	    (ftree_port_group_t *) malloc(sizeof(ftree_port_group_t));
399	if (p_group == NULL)
400		return NULL;
401	memset(p_group, 0, sizeof(ftree_port_group_t));
402
403	p_group->base_lid = base_lid;
404	p_group->remote_base_lid = remote_base_lid;
405	memcpy(&p_group->port_guid, &port_guid, sizeof(ib_net64_t));
406	memcpy(&p_group->node_guid, &node_guid, sizeof(ib_net64_t));
407	memcpy(&p_group->remote_port_guid, &remote_port_guid,
408	       sizeof(ib_net64_t));
409	memcpy(&p_group->remote_node_guid, &remote_node_guid,
410	       sizeof(ib_net64_t));
411
412	p_group->node_type = node_type;
413	switch (node_type) {
414	case IB_NODE_TYPE_CA:
415		p_group->hca_or_sw.p_hca = (ftree_hca_t *) p_hca_or_sw;
416		break;
417	case IB_NODE_TYPE_SWITCH:
418		p_group->hca_or_sw.p_sw = (ftree_sw_t *) p_hca_or_sw;
419		break;
420	default:
421		/* we shouldn't get here - port is created only in hca or switch */
422		CL_ASSERT(0);
423	}
424
425	p_group->remote_node_type = remote_node_type;
426	switch (remote_node_type) {
427	case IB_NODE_TYPE_CA:
428		p_group->remote_hca_or_sw.p_hca =
429		    (ftree_hca_t *) p_remote_hca_or_sw;
430		break;
431	case IB_NODE_TYPE_SWITCH:
432		p_group->remote_hca_or_sw.p_sw =
433		    (ftree_sw_t *) p_remote_hca_or_sw;
434		break;
435	default:
436		/* we shouldn't get here - port is created only in hca or switch */
437		CL_ASSERT(0);
438	}
439
440	cl_ptr_vector_init(&p_group->ports, 0,	/* min size */
441			   8);	/* grow size */
442	p_group->is_cn = is_cn;
443	return p_group;
444}				/* __osm_ftree_port_group_create() */
445
446/***************************************************/
447
448static void __osm_ftree_port_group_destroy(IN ftree_port_group_t * p_group)
449{
450	uint32_t i;
451	uint32_t size;
452	ftree_port_t *p_port;
453
454	if (!p_group)
455		return;
456
457	/* remove all the elements of p_group->ports vector */
458	size = cl_ptr_vector_get_size(&p_group->ports);
459	for (i = 0; i < size; i++) {
460		cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
461		__osm_ftree_port_destroy(p_port);
462	}
463	cl_ptr_vector_destroy(&p_group->ports);
464	free(p_group);
465}				/* __osm_ftree_port_group_destroy() */
466
467/***************************************************/
468
469static void
470__osm_ftree_port_group_dump(IN ftree_fabric_t * p_ftree,
471			    IN ftree_port_group_t * p_group,
472			    IN ftree_direction_t direction)
473{
474	ftree_port_t *p_port;
475	uint32_t size;
476	uint32_t i;
477	char buff[10 * 1024];
478
479	if (!p_group)
480		return;
481
482	if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
483		return;
484
485	size = cl_ptr_vector_get_size(&p_group->ports);
486	buff[0] = '\0';
487
488	for (i = 0; i < size; i++) {
489		cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
490		CL_ASSERT(p_port);
491
492		if (i != 0)
493			strcat(buff, ", ");
494		sprintf(buff + strlen(buff), "%u", p_port->port_num);
495	}
496
497	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
498		"    Port Group of size %u, port(s): %s, direction: %s\n"
499		"                  Local <--> Remote GUID (LID):"
500		"0x%016" PRIx64 " (0x%04x) <--> 0x%016" PRIx64 " (0x%04x)\n",
501		size,
502		buff,
503		(direction == FTREE_DIRECTION_DOWN) ? "DOWN" : "UP",
504		cl_ntoh64(p_group->port_guid),
505		cl_ntoh16(p_group->base_lid),
506		cl_ntoh64(p_group->remote_port_guid),
507		cl_ntoh16(p_group->remote_base_lid));
508
509}				/* __osm_ftree_port_group_dump() */
510
511/***************************************************/
512
513static void
514__osm_ftree_port_group_add_port(IN ftree_port_group_t * p_group,
515				IN uint8_t port_num, IN uint8_t remote_port_num)
516{
517	uint16_t i;
518	ftree_port_t *p_port;
519
520	for (i = 0; i < cl_ptr_vector_get_size(&p_group->ports); i++) {
521		cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
522		if (p_port->port_num == port_num)
523			return;
524	}
525
526	p_port = __osm_ftree_port_create(port_num, remote_port_num);
527	cl_ptr_vector_insert(&p_group->ports, p_port, NULL);
528}
529
530/***************************************************
531 **
532 ** ftree_sw_t functions
533 **
534 ***************************************************/
535
536static ftree_sw_t *__osm_ftree_sw_create(IN ftree_fabric_t * p_ftree,
537					 IN osm_switch_t * p_osm_sw)
538{
539	ftree_sw_t *p_sw;
540	uint8_t ports_num;
541
542	/* make sure that the switch has ports */
543	if (p_osm_sw->num_ports == 1)
544		return NULL;
545
546	p_sw = (ftree_sw_t *) malloc(sizeof(ftree_sw_t));
547	if (p_sw == NULL)
548		return NULL;
549	memset(p_sw, 0, sizeof(ftree_sw_t));
550
551	p_sw->p_osm_sw = p_osm_sw;
552	p_sw->rank = 0xFFFFFFFF;
553	__osm_ftree_tuple_init(p_sw->tuple);
554
555	p_sw->base_lid = osm_node_get_base_lid(p_sw->p_osm_sw->p_node, 0);
556
557	ports_num = osm_node_get_num_physp(p_sw->p_osm_sw->p_node);
558	p_sw->down_port_groups =
559	    (ftree_port_group_t **) malloc(ports_num *
560					   sizeof(ftree_port_group_t *));
561	p_sw->up_port_groups =
562	    (ftree_port_group_t **) malloc(ports_num *
563					   sizeof(ftree_port_group_t *));
564	if (!p_sw->down_port_groups || !p_sw->up_port_groups)
565		return NULL;
566	p_sw->down_port_groups_num = 0;
567	p_sw->up_port_groups_num = 0;
568
569	/* initialize lft buffer */
570	memset(p_osm_sw->new_lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
571
572	p_sw->down_port_groups_idx = -1;
573
574	return p_sw;
575}				/* __osm_ftree_sw_create() */
576
577/***************************************************/
578
579static void __osm_ftree_sw_destroy(IN ftree_fabric_t * p_ftree,
580				   IN ftree_sw_t * p_sw)
581{
582	uint8_t i;
583
584	if (!p_sw)
585		return;
586
587	for (i = 0; i < p_sw->down_port_groups_num; i++)
588		__osm_ftree_port_group_destroy(p_sw->down_port_groups[i]);
589	for (i = 0; i < p_sw->up_port_groups_num; i++)
590		__osm_ftree_port_group_destroy(p_sw->up_port_groups[i]);
591	if (p_sw->down_port_groups)
592		free(p_sw->down_port_groups);
593	if (p_sw->up_port_groups)
594		free(p_sw->up_port_groups);
595
596	free(p_sw);
597}				/* __osm_ftree_sw_destroy() */
598
599/***************************************************/
600
601static uint64_t __osm_ftree_sw_get_guid_no(IN ftree_sw_t * p_sw)
602{
603	if (!p_sw)
604		return 0;
605	return osm_node_get_node_guid(p_sw->p_osm_sw->p_node);
606}
607
608/***************************************************/
609
610static uint64_t __osm_ftree_sw_get_guid_ho(IN ftree_sw_t * p_sw)
611{
612	return cl_ntoh64(__osm_ftree_sw_get_guid_no(p_sw));
613}
614
615/***************************************************/
616
617static void __osm_ftree_sw_dump(IN ftree_fabric_t * p_ftree,
618				IN ftree_sw_t * p_sw)
619{
620	uint32_t i;
621
622	if (!p_sw)
623		return;
624
625	if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
626		return;
627
628	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
629		"Switch index: %s, GUID: 0x%016" PRIx64
630		", Ports: %u DOWN, %u UP\n",
631		__osm_ftree_tuple_to_str(p_sw->tuple),
632		__osm_ftree_sw_get_guid_ho(p_sw), p_sw->down_port_groups_num,
633		p_sw->up_port_groups_num);
634
635	for (i = 0; i < p_sw->down_port_groups_num; i++)
636		__osm_ftree_port_group_dump(p_ftree,
637					    p_sw->down_port_groups[i],
638					    FTREE_DIRECTION_DOWN);
639	for (i = 0; i < p_sw->up_port_groups_num; i++)
640		__osm_ftree_port_group_dump(p_ftree, p_sw->up_port_groups[i],
641					    FTREE_DIRECTION_UP);
642
643}				/* __osm_ftree_sw_dump() */
644
645/***************************************************/
646
647static boolean_t __osm_ftree_sw_ranked(IN ftree_sw_t * p_sw)
648{
649	return (p_sw->rank != 0xFFFFFFFF);
650}
651
652/***************************************************/
653
654static ftree_port_group_t *
655__osm_ftree_sw_get_port_group_by_remote_lid(IN ftree_sw_t * p_sw,
656					    IN ib_net16_t remote_base_lid,
657					    IN ftree_direction_t direction)
658{
659	uint32_t i;
660	uint32_t size;
661	ftree_port_group_t **port_groups;
662
663	if (direction == FTREE_DIRECTION_UP) {
664		port_groups = p_sw->up_port_groups;
665		size = p_sw->up_port_groups_num;
666	} else {
667		port_groups = p_sw->down_port_groups;
668		size = p_sw->down_port_groups_num;
669	}
670
671	for (i = 0; i < size; i++)
672		if (remote_base_lid == port_groups[i]->remote_base_lid)
673			return port_groups[i];
674
675	return NULL;
676}				/* __osm_ftree_sw_get_port_group_by_remote_lid() */
677
678/***************************************************/
679
680static void
681__osm_ftree_sw_add_port(IN ftree_sw_t * p_sw,
682			IN uint8_t port_num,
683			IN uint8_t remote_port_num,
684			IN ib_net16_t base_lid,
685			IN ib_net16_t remote_base_lid,
686			IN ib_net64_t port_guid,
687			IN ib_net64_t remote_port_guid,
688			IN ib_net64_t remote_node_guid,
689			IN uint8_t remote_node_type,
690			IN void *p_remote_hca_or_sw,
691			IN ftree_direction_t direction)
692{
693	ftree_port_group_t *p_group =
694	    __osm_ftree_sw_get_port_group_by_remote_lid(p_sw, remote_base_lid,
695							direction);
696
697	if (!p_group) {
698		p_group = __osm_ftree_port_group_create(base_lid,
699							remote_base_lid,
700							port_guid,
701							__osm_ftree_sw_get_guid_no
702							(p_sw),
703							IB_NODE_TYPE_SWITCH,
704							p_sw, remote_port_guid,
705							remote_node_guid,
706							remote_node_type,
707							p_remote_hca_or_sw,
708							FALSE);
709		CL_ASSERT(p_group);
710
711		if (direction == FTREE_DIRECTION_UP)
712			p_sw->up_port_groups[p_sw->up_port_groups_num++] =
713			    p_group;
714		else
715			p_sw->down_port_groups[p_sw->down_port_groups_num++] =
716			    p_group;
717	}
718	__osm_ftree_port_group_add_port(p_group, port_num, remote_port_num);
719
720}				/* __osm_ftree_sw_add_port() */
721
722/***************************************************/
723
724static inline cl_status_t
725__osm_ftree_sw_set_hops(IN ftree_sw_t * p_sw,
726			IN uint16_t lid_ho, IN uint8_t port_num,
727			IN uint8_t hops)
728{
729	/* set local min hop table(LID) */
730	return osm_switch_set_hops(p_sw->p_osm_sw, lid_ho, port_num, hops);
731}
732
733/***************************************************
734 **
735 ** ftree_hca_t functions
736 **
737 ***************************************************/
738
739static ftree_hca_t *__osm_ftree_hca_create(IN osm_node_t * p_osm_node)
740{
741	ftree_hca_t *p_hca = (ftree_hca_t *) malloc(sizeof(ftree_hca_t));
742	if (p_hca == NULL)
743		return NULL;
744	memset(p_hca, 0, sizeof(ftree_hca_t));
745
746	p_hca->p_osm_node = p_osm_node;
747	p_hca->up_port_groups = (ftree_port_group_t **)
748	    malloc(osm_node_get_num_physp(p_hca->p_osm_node) *
749		   sizeof(ftree_port_group_t *));
750	if (!p_hca->up_port_groups)
751		return NULL;
752	p_hca->up_port_groups_num = 0;
753	return p_hca;
754}
755
756/***************************************************/
757
758static void __osm_ftree_hca_destroy(IN ftree_hca_t * p_hca)
759{
760	uint32_t i;
761
762	if (!p_hca)
763		return;
764
765	for (i = 0; i < p_hca->up_port_groups_num; i++)
766		__osm_ftree_port_group_destroy(p_hca->up_port_groups[i]);
767
768	if (p_hca->up_port_groups)
769		free(p_hca->up_port_groups);
770
771	free(p_hca);
772}
773
774/***************************************************/
775
776static uint64_t __osm_ftree_hca_get_guid_no(IN ftree_hca_t * p_hca)
777{
778	if (!p_hca)
779		return 0;
780	return osm_node_get_node_guid(p_hca->p_osm_node);
781}
782
783/***************************************************/
784
785static uint64_t __osm_ftree_hca_get_guid_ho(IN ftree_hca_t * p_hca)
786{
787	return cl_ntoh64(__osm_ftree_hca_get_guid_no(p_hca));
788}
789
790/***************************************************/
791
792static void __osm_ftree_hca_dump(IN ftree_fabric_t * p_ftree,
793				 IN ftree_hca_t * p_hca)
794{
795	uint32_t i;
796
797	if (!p_hca)
798		return;
799
800	if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
801		return;
802
803	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
804		"CA GUID: 0x%016" PRIx64 ", Ports: %u UP\n",
805		__osm_ftree_hca_get_guid_ho(p_hca), p_hca->up_port_groups_num);
806
807	for (i = 0; i < p_hca->up_port_groups_num; i++)
808		__osm_ftree_port_group_dump(p_ftree, p_hca->up_port_groups[i],
809					    FTREE_DIRECTION_UP);
810}
811
812/***************************************************/
813
814static ftree_port_group_t *
815__osm_ftree_hca_get_port_group_by_remote_lid(IN ftree_hca_t * p_hca,
816					     IN ib_net16_t remote_base_lid)
817{
818	uint32_t i;
819	for (i = 0; i < p_hca->up_port_groups_num; i++)
820		if (remote_base_lid ==
821		    p_hca->up_port_groups[i]->remote_base_lid)
822			return p_hca->up_port_groups[i];
823
824	return NULL;
825}
826
827/***************************************************/
828
829static void
830__osm_ftree_hca_add_port(IN ftree_hca_t * p_hca,
831			 IN uint8_t port_num,
832			 IN uint8_t remote_port_num,
833			 IN ib_net16_t base_lid,
834			 IN ib_net16_t remote_base_lid,
835			 IN ib_net64_t port_guid,
836			 IN ib_net64_t remote_port_guid,
837			 IN ib_net64_t remote_node_guid,
838			 IN uint8_t remote_node_type,
839			 IN void *p_remote_hca_or_sw, IN boolean_t is_cn)
840{
841	ftree_port_group_t *p_group;
842
843	/* this function is supposed to be called only for adding ports
844	   in hca's that lead to switches */
845	CL_ASSERT(remote_node_type == IB_NODE_TYPE_SWITCH);
846
847	p_group =
848	    __osm_ftree_hca_get_port_group_by_remote_lid(p_hca,
849							 remote_base_lid);
850
851	if (!p_group) {
852		p_group = __osm_ftree_port_group_create(base_lid,
853							remote_base_lid,
854							port_guid,
855							__osm_ftree_hca_get_guid_no
856							(p_hca),
857							IB_NODE_TYPE_CA, p_hca,
858							remote_port_guid,
859							remote_node_guid,
860							remote_node_type,
861							p_remote_hca_or_sw,
862							is_cn);
863		p_hca->up_port_groups[p_hca->up_port_groups_num++] = p_group;
864	}
865	__osm_ftree_port_group_add_port(p_group, port_num, remote_port_num);
866
867}				/* __osm_ftree_hca_add_port() */
868
869/***************************************************
870 **
871 ** ftree_fabric_t functions
872 **
873 ***************************************************/
874
875static ftree_fabric_t *__osm_ftree_fabric_create()
876{
877	ftree_fabric_t *p_ftree =
878	    (ftree_fabric_t *) malloc(sizeof(ftree_fabric_t));
879	if (p_ftree == NULL)
880		return NULL;
881
882	memset(p_ftree, 0, sizeof(ftree_fabric_t));
883
884	cl_qmap_init(&p_ftree->hca_tbl);
885	cl_qmap_init(&p_ftree->sw_tbl);
886	cl_qmap_init(&p_ftree->sw_by_tuple_tbl);
887	cl_qmap_init(&p_ftree->cn_guid_tbl);
888
889	cl_qlist_init(&p_ftree->root_guid_list);
890
891	return p_ftree;
892}
893
894/***************************************************/
895
896static void __osm_ftree_fabric_clear(ftree_fabric_t * p_ftree)
897{
898	ftree_hca_t *p_hca;
899	ftree_hca_t *p_next_hca;
900	ftree_sw_t *p_sw;
901	ftree_sw_t *p_next_sw;
902	ftree_sw_tbl_element_t *p_element;
903	ftree_sw_tbl_element_t *p_next_element;
904	name_map_item_t *p_guid_element, *p_next_guid_element;
905
906	if (!p_ftree)
907		return;
908
909	/* remove all the elements of hca_tbl */
910
911	p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
912	while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
913		p_hca = p_next_hca;
914		p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
915		__osm_ftree_hca_destroy(p_hca);
916	}
917	cl_qmap_remove_all(&p_ftree->hca_tbl);
918
919	/* remove all the elements of sw_tbl */
920
921	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
922	while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
923		p_sw = p_next_sw;
924		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
925		__osm_ftree_sw_destroy(p_ftree, p_sw);
926	}
927	cl_qmap_remove_all(&p_ftree->sw_tbl);
928
929	/* remove all the elements of sw_by_tuple_tbl */
930
931	p_next_element =
932	    (ftree_sw_tbl_element_t *) cl_qmap_head(&p_ftree->sw_by_tuple_tbl);
933	while (p_next_element !=
934	       (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree->
935						      sw_by_tuple_tbl)) {
936		p_element = p_next_element;
937		p_next_element =
938		    (ftree_sw_tbl_element_t *) cl_qmap_next(&p_element->
939							    map_item);
940		__osm_ftree_sw_tbl_element_destroy(p_element);
941	}
942	cl_qmap_remove_all(&p_ftree->sw_by_tuple_tbl);
943
944	/* remove all the elements of cn_guid_tbl */
945	p_next_guid_element =
946	    (name_map_item_t *) cl_qmap_head(&p_ftree->cn_guid_tbl);
947	while (p_next_guid_element !=
948	       (name_map_item_t *) cl_qmap_end(&p_ftree->cn_guid_tbl)) {
949		p_guid_element = p_next_guid_element;
950		p_next_guid_element =
951		    (name_map_item_t *) cl_qmap_next(&p_guid_element->item);
952		free(p_guid_element);
953	}
954	cl_qmap_remove_all(&p_ftree->cn_guid_tbl);
955
956	/* remove all the elements of root_guid_list */
957	while (!cl_is_qlist_empty(&p_ftree->root_guid_list))
958		free(cl_qlist_remove_head(&p_ftree->root_guid_list));
959
960	/* free the leaf switches array */
961	if ((p_ftree->leaf_switches_num > 0) && (p_ftree->leaf_switches))
962		free(p_ftree->leaf_switches);
963
964	p_ftree->leaf_switches_num = 0;
965	p_ftree->cn_num = 0;
966	p_ftree->leaf_switch_rank = 0;
967	p_ftree->max_switch_rank = 0;
968	p_ftree->max_cn_per_leaf = 0;
969	p_ftree->lft_max_lid_ho = 0;
970	p_ftree->leaf_switches = NULL;
971	p_ftree->fabric_built = FALSE;
972
973}				/* __osm_ftree_fabric_destroy() */
974
975/***************************************************/
976
977static void __osm_ftree_fabric_destroy(ftree_fabric_t * p_ftree)
978{
979	if (!p_ftree)
980		return;
981	__osm_ftree_fabric_clear(p_ftree);
982	free(p_ftree);
983}
984
985/***************************************************/
986
987static uint8_t __osm_ftree_fabric_get_rank(ftree_fabric_t * p_ftree)
988{
989	return p_ftree->leaf_switch_rank + 1;
990}
991
992/***************************************************/
993
994static void __osm_ftree_fabric_add_hca(ftree_fabric_t * p_ftree,
995				       osm_node_t * p_osm_node)
996{
997	ftree_hca_t *p_hca = __osm_ftree_hca_create(p_osm_node);
998
999	CL_ASSERT(osm_node_get_type(p_osm_node) == IB_NODE_TYPE_CA);
1000
1001	cl_qmap_insert(&p_ftree->hca_tbl, p_osm_node->node_info.node_guid,
1002		       &p_hca->map_item);
1003}
1004
1005/***************************************************/
1006
1007static void __osm_ftree_fabric_add_sw(ftree_fabric_t * p_ftree,
1008				      osm_switch_t * p_osm_sw)
1009{
1010	ftree_sw_t *p_sw = __osm_ftree_sw_create(p_ftree, p_osm_sw);
1011
1012	CL_ASSERT(osm_node_get_type(p_osm_sw->p_node) == IB_NODE_TYPE_SWITCH);
1013
1014	cl_qmap_insert(&p_ftree->sw_tbl, p_osm_sw->p_node->node_info.node_guid,
1015		       &p_sw->map_item);
1016
1017	/* track the max lid (in host order) that exists in the fabric */
1018	if (cl_ntoh16(p_sw->base_lid) > p_ftree->lft_max_lid_ho)
1019		p_ftree->lft_max_lid_ho = cl_ntoh16(p_sw->base_lid);
1020}
1021
1022/***************************************************/
1023
1024static void __osm_ftree_fabric_add_sw_by_tuple(IN ftree_fabric_t * p_ftree,
1025					       IN ftree_sw_t * p_sw)
1026{
1027	CL_ASSERT(__osm_ftree_tuple_assigned(p_sw->tuple));
1028
1029	cl_qmap_insert(&p_ftree->sw_by_tuple_tbl,
1030		       __osm_ftree_tuple_to_key(p_sw->tuple),
1031		       &__osm_ftree_sw_tbl_element_create(p_sw)->map_item);
1032}
1033
1034/***************************************************/
1035
1036static ftree_sw_t *__osm_ftree_fabric_get_sw_by_tuple(IN ftree_fabric_t *
1037						      p_ftree,
1038						      IN ftree_tuple_t tuple)
1039{
1040	ftree_sw_tbl_element_t *p_element;
1041
1042	CL_ASSERT(__osm_ftree_tuple_assigned(tuple));
1043
1044	__osm_ftree_tuple_to_key(tuple);
1045
1046	p_element =
1047	    (ftree_sw_tbl_element_t *) cl_qmap_get(&p_ftree->sw_by_tuple_tbl,
1048						   __osm_ftree_tuple_to_key
1049						   (tuple));
1050	if (p_element ==
1051	    (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree->sw_by_tuple_tbl))
1052		return NULL;
1053
1054	return p_element->p_sw;
1055}
1056
1057/***************************************************/
1058
1059static ftree_sw_t *__osm_ftree_fabric_get_sw_by_guid(IN ftree_fabric_t *
1060						     p_ftree, IN uint64_t guid)
1061{
1062	ftree_sw_t *p_sw;
1063	p_sw = (ftree_sw_t *) cl_qmap_get(&p_ftree->sw_tbl, guid);
1064	if (p_sw == (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl))
1065		return NULL;
1066	return p_sw;
1067}
1068
1069/***************************************************/
1070
1071static ftree_hca_t *__osm_ftree_fabric_get_hca_by_guid(IN ftree_fabric_t *
1072						       p_ftree,
1073						       IN uint64_t guid)
1074{
1075	ftree_hca_t *p_hca;
1076	p_hca = (ftree_hca_t *) cl_qmap_get(&p_ftree->hca_tbl, guid);
1077	if (p_hca == (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl))
1078		return NULL;
1079	return p_hca;
1080}
1081
1082/***************************************************/
1083
1084static void __osm_ftree_fabric_dump(ftree_fabric_t * p_ftree)
1085{
1086	uint32_t i;
1087	ftree_hca_t *p_hca;
1088	ftree_sw_t *p_sw;
1089
1090	if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
1091		return;
1092
1093	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
1094		"                       |-------------------------------|\n"
1095		"                       |-  Full fabric topology dump  -|\n"
1096		"                       |-------------------------------|\n\n");
1097
1098	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "-- CAs:\n");
1099
1100	for (p_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
1101	     p_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl);
1102	     p_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item)) {
1103		__osm_ftree_hca_dump(p_ftree, p_hca);
1104	}
1105
1106	for (i = 0; i < p_ftree->max_switch_rank; i++) {
1107		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1108			"-- Rank %u switches\n", i);
1109		for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1110		     p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1111		     p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1112			if (p_sw->rank == i)
1113				__osm_ftree_sw_dump(p_ftree, p_sw);
1114		}
1115	}
1116
1117	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
1118		"                       |---------------------------------------|\n"
1119		"                       |- Full fabric topology dump completed -|\n"
1120		"                       |---------------------------------------|\n\n");
1121}				/* __osm_ftree_fabric_dump() */
1122
1123/***************************************************/
1124
1125static void __osm_ftree_fabric_dump_general_info(IN ftree_fabric_t * p_ftree)
1126{
1127	uint32_t i, j;
1128	ftree_sw_t *p_sw;
1129
1130	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1131		"General fabric topology info\n");
1132	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1133		"============================\n");
1134
1135	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1136		"  - FatTree rank (roots to leaf switches): %u\n",
1137		p_ftree->leaf_switch_rank + 1);
1138	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1139		"  - FatTree max switch rank: %u\n", p_ftree->max_switch_rank);
1140	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1141		"  - Fabric has %u CAs (%u of them CNs), %u switches\n",
1142		cl_qmap_count(&p_ftree->hca_tbl), p_ftree->cn_num,
1143		cl_qmap_count(&p_ftree->sw_tbl));
1144
1145	CL_ASSERT(cl_qmap_count(&p_ftree->hca_tbl) >= p_ftree->cn_num);
1146
1147	for (i = 0; i <= p_ftree->max_switch_rank; i++) {
1148		j = 0;
1149		for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1150		     p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1151		     p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1152			if (p_sw->rank == i)
1153				j++;
1154		}
1155		if (i == 0)
1156			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1157				"  - Fabric has %u switches at rank %u (roots)\n",
1158				j, i);
1159		else if (i == p_ftree->leaf_switch_rank)
1160			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1161				"  - Fabric has %u switches at rank %u (%u of them leafs)\n",
1162				j, i, p_ftree->leaf_switches_num);
1163		else
1164			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1165				"  - Fabric has %u switches at rank %u\n", j,
1166				i);
1167	}
1168
1169	if (osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_VERBOSE)) {
1170		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1171			"  - Root switches:\n");
1172		for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1173		     p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1174		     p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1175			if (p_sw->rank == 0)
1176				OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1177					"      GUID: 0x%016" PRIx64
1178					", LID: %u, Index %s\n",
1179					__osm_ftree_sw_get_guid_ho(p_sw),
1180					cl_ntoh16(p_sw->base_lid),
1181					__osm_ftree_tuple_to_str(p_sw->tuple));
1182		}
1183
1184		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1185			"  - Leaf switches (sorted by index):\n");
1186		for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1187			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1188				"      GUID: 0x%016" PRIx64
1189				", LID: %u, Index %s\n",
1190				__osm_ftree_sw_get_guid_ho(p_ftree->
1191							   leaf_switches[i]),
1192				cl_ntoh16(p_ftree->leaf_switches[i]->base_lid),
1193				__osm_ftree_tuple_to_str(p_ftree->
1194							 leaf_switches[i]->
1195							 tuple));
1196		}
1197	}
1198}				/* __osm_ftree_fabric_dump_general_info() */
1199
1200/***************************************************/
1201
1202static void __osm_ftree_fabric_dump_hca_ordering(IN ftree_fabric_t * p_ftree)
1203{
1204	ftree_hca_t *p_hca;
1205	ftree_sw_t *p_sw;
1206	ftree_port_group_t *p_group_on_sw;
1207	ftree_port_group_t *p_group_on_hca;
1208	uint32_t i;
1209	uint32_t j;
1210	unsigned printed_hcas_on_leaf;
1211
1212	char path[1024];
1213	FILE *p_hca_ordering_file;
1214	char *filename = "opensm-ftree-ca-order.dump";
1215
1216	snprintf(path, sizeof(path), "%s/%s",
1217		 p_ftree->p_osm->subn.opt.dump_files_dir, filename);
1218	p_hca_ordering_file = fopen(path, "w");
1219	if (!p_hca_ordering_file) {
1220		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB01: "
1221			"cannot open file \'%s\': %s\n", filename,
1222			strerror(errno));
1223		return;
1224	}
1225
1226	/* for each leaf switch (in indexing order) */
1227	for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1228		p_sw = p_ftree->leaf_switches[i];
1229		printed_hcas_on_leaf = 0;
1230
1231		/* for each real CA (CNs and not) connected to this switch */
1232		for (j = 0; j < p_sw->down_port_groups_num; j++) {
1233			p_group_on_sw = p_sw->down_port_groups[j];
1234
1235			if (p_group_on_sw->remote_node_type != IB_NODE_TYPE_CA)
1236				continue;
1237
1238			p_hca = p_group_on_sw->remote_hca_or_sw.p_hca;
1239			p_group_on_hca =
1240			    __osm_ftree_hca_get_port_group_by_remote_lid(p_hca,
1241									 p_group_on_sw->
1242									 base_lid);
1243
1244			/* treat non-compute nodes as dummies */
1245			if (!p_group_on_hca->is_cn)
1246				continue;
1247
1248			fprintf(p_hca_ordering_file, "0x%04x\t%s\n",
1249				cl_ntoh16(p_group_on_hca->base_lid),
1250				p_hca->p_osm_node->print_desc);
1251
1252			printed_hcas_on_leaf++;
1253		}
1254
1255		/* now print missing HCAs */
1256		for (j = 0;
1257		     j < (p_ftree->max_cn_per_leaf - printed_hcas_on_leaf); j++)
1258			fprintf(p_hca_ordering_file, "0xFFFF\tDUMMY\n");
1259
1260	}
1261	/* done going through all the leaf switches */
1262
1263	fclose(p_hca_ordering_file);
1264}				/* __osm_ftree_fabric_dump_hca_ordering() */
1265
1266/***************************************************/
1267
1268static void
1269__osm_ftree_fabric_assign_tuple(IN ftree_fabric_t * p_ftree,
1270				IN ftree_sw_t * p_sw,
1271				IN ftree_tuple_t new_tuple)
1272{
1273	memcpy(p_sw->tuple, new_tuple, FTREE_TUPLE_LEN);
1274	__osm_ftree_fabric_add_sw_by_tuple(p_ftree, p_sw);
1275}
1276
1277/***************************************************/
1278
1279static void __osm_ftree_fabric_assign_first_tuple(IN ftree_fabric_t * p_ftree,
1280						  IN ftree_sw_t * p_sw)
1281{
1282	uint8_t i;
1283	ftree_tuple_t new_tuple;
1284
1285	__osm_ftree_tuple_init(new_tuple);
1286	new_tuple[0] = (uint8_t) p_sw->rank;
1287	for (i = 1; i <= p_sw->rank; i++)
1288		new_tuple[i] = 0;
1289
1290	__osm_ftree_fabric_assign_tuple(p_ftree, p_sw, new_tuple);
1291}
1292
1293/***************************************************/
1294
1295static void
1296__osm_ftree_fabric_get_new_tuple(IN ftree_fabric_t * p_ftree,
1297				 OUT ftree_tuple_t new_tuple,
1298				 IN ftree_tuple_t from_tuple,
1299				 IN ftree_direction_t direction)
1300{
1301	ftree_sw_t *p_sw;
1302	ftree_tuple_t temp_tuple;
1303	uint8_t var_index;
1304	uint8_t i;
1305
1306	__osm_ftree_tuple_init(new_tuple);
1307	memcpy(temp_tuple, from_tuple, FTREE_TUPLE_LEN);
1308
1309	if (direction == FTREE_DIRECTION_DOWN) {
1310		temp_tuple[0]++;
1311		var_index = from_tuple[0] + 1;
1312	} else {
1313		temp_tuple[0]--;
1314		var_index = from_tuple[0];
1315	}
1316
1317	for (i = 0; i < 0xFF; i++) {
1318		temp_tuple[var_index] = i;
1319		p_sw = __osm_ftree_fabric_get_sw_by_tuple(p_ftree, temp_tuple);
1320		if (p_sw == NULL)	/* found free tuple */
1321			break;
1322	}
1323
1324	if (i == 0xFF) {
1325		/* new tuple not found - there are more than 255 ports in one direction */
1326		return;
1327	}
1328	memcpy(new_tuple, temp_tuple, FTREE_TUPLE_LEN);
1329
1330}				/* __osm_ftree_fabric_get_new_tuple() */
1331
1332/***************************************************/
1333
1334static inline boolean_t __osm_ftree_fabric_roots_provided(IN ftree_fabric_t *
1335							  p_ftree)
1336{
1337	return (p_ftree->p_osm->subn.opt.root_guid_file != NULL);
1338}
1339
1340/***************************************************/
1341
1342static inline boolean_t __osm_ftree_fabric_cns_provided(IN ftree_fabric_t *
1343							p_ftree)
1344{
1345	return (p_ftree->p_osm->subn.opt.cn_guid_file != NULL);
1346}
1347
1348/***************************************************/
1349
1350static int __osm_ftree_fabric_mark_leaf_switches(IN ftree_fabric_t * p_ftree)
1351{
1352	ftree_sw_t *p_sw;
1353	ftree_hca_t *p_hca;
1354	ftree_hca_t *p_next_hca;
1355	unsigned i;
1356	int res = 0;
1357
1358	OSM_LOG_ENTER(&p_ftree->p_osm->log);
1359
1360	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1361		"Marking leaf switches in fabric\n");
1362
1363	/* Scan all the CAs, if they have CNs - find CN port and mark switch
1364	   that is connected to this port as leaf switch.
1365	   Also, ensure that this marked leaf has rank of p_ftree->leaf_switch_rank. */
1366	p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
1367	while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
1368		p_hca = p_next_hca;
1369		p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
1370		if (!p_hca->cn_num)
1371			continue;
1372
1373		for (i = 0; i < p_hca->up_port_groups_num; i++) {
1374			if (!p_hca->up_port_groups[i]->is_cn)
1375				continue;
1376
1377			/* In CAs, port group alway has one port, and since this
1378			   port group is CN, we know that this port is compute node */
1379			CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type ==
1380				  IB_NODE_TYPE_SWITCH);
1381			p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw;
1382
1383			/* check if this switch was already processed */
1384			if (p_sw->is_leaf)
1385				continue;
1386			p_sw->is_leaf = TRUE;
1387
1388			/* ensure that this leaf switch is at the correct tree level */
1389			if (p_sw->rank != p_ftree->leaf_switch_rank) {
1390				OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1391					"ERR AB26: CN port 0x%" PRIx64
1392					" is connected to switch 0x%" PRIx64
1393					" with rank %u, "
1394					"while FatTree leaf rank is %u\n",
1395					cl_ntoh64(p_hca->up_port_groups[i]->
1396						  port_guid),
1397					__osm_ftree_sw_get_guid_ho(p_sw),
1398					p_sw->rank, p_ftree->leaf_switch_rank);
1399				res = -1;
1400				goto Exit;
1401
1402			}
1403		}
1404	}
1405
1406Exit:
1407	OSM_LOG_EXIT(&p_ftree->p_osm->log);
1408	return res;
1409}				/* __osm_ftree_fabric_mark_leaf_switches() */
1410
1411/***************************************************/
1412
/*
 * Assign an index tuple to the switches of the fabric.
 *
 * The first switch in the switch table that is flagged as leaf is used as
 * the starting point and gets the first tuple. From there a BFS walks the
 * switches through their downward and upward port groups: every remote
 * switch that has no tuple yet gets a new one derived from the tuple of
 * the switch it was reached from, and is queued for processing. After the
 * remote switches of one direction are handled, the port groups of that
 * direction are sorted by the index of the remote switch.
 *
 * NOTE(review): __osm_ftree_fabric_get_new_tuple() can leave new_tuple
 * unassigned when no free tuple exists; the result is not checked before
 * it is assigned to the remote switch.
 */
static void __osm_ftree_fabric_make_indexing(IN ftree_fabric_t * p_ftree)
{
	ftree_sw_t *p_remote_sw;
	ftree_sw_t *p_sw = NULL;
	ftree_sw_t *p_next_sw;
	ftree_tuple_t new_tuple;
	uint32_t i;
	cl_list_t bfs_list;
	ftree_sw_tbl_element_t *p_sw_tbl_element;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
		"Starting FatTree indexing\n");

	/* using the first leaf switch as a starting point for indexing algorithm. */
	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
	while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
		p_sw = p_next_sw;
		if (p_sw->is_leaf)
			break;
		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
	}

	/* the loop above must have stopped on a leaf, not run off the table */
	CL_ASSERT(p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl));

	/* Assign the first tuple to the switch that is used as BFS starting point.
	   The tuple will be as follows: [rank].0.0.0...
	   This function also adds the switch into the switch_by_tuple table. */
	__osm_ftree_fabric_assign_first_tuple(p_ftree, p_sw);

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
		"Indexing starting point:\n"
		"                                            - Switch rank  : %u\n"
		"                                            - Switch index : %s\n"
		"                                            - Node LID     : %u\n"
		"                                            - Node GUID    : 0x%016"
		PRIx64 "\n", p_sw->rank, __osm_ftree_tuple_to_str(p_sw->tuple),
		cl_ntoh16(p_sw->base_lid), __osm_ftree_sw_get_guid_ho(p_sw));

	/*
	 * Now run BFS and assign indexes to all switches
	 * Pseudo code of the algorithm is as follows:
	 *
	 *  * Add first switch to BFS queue
	 *  * While (BFS queue not empty)
	 *      - Pop the switch from the head of the queue
	 *      - Scan all the downward and upward ports
	 *      - For each port
	 *          + Get the remote switch
	 *          + Assign index to the remote switch
	 *          + Add remote switch to the BFS queue
	 */

	/* The queue holds the address of the map_item of a freshly created
	   table element; the pointer taken back out is cast to the element
	   type (presumably map_item is the element's first member - verify
	   against the struct definition). */
	cl_list_init(&bfs_list, cl_qmap_count(&p_ftree->sw_tbl));
	cl_list_insert_tail(&bfs_list,
			    &__osm_ftree_sw_tbl_element_create(p_sw)->map_item);

	while (!cl_is_list_empty(&bfs_list)) {
		p_sw_tbl_element =
		    (ftree_sw_tbl_element_t *) cl_list_remove_head(&bfs_list);
		p_sw = p_sw_tbl_element->p_sw;
		/* the element was only a carrier for the queue */
		__osm_ftree_sw_tbl_element_destroy(p_sw_tbl_element);

		/* Discover all the nodes from ports that are pointing down */

		if (p_sw->rank >= p_ftree->leaf_switch_rank) {
			/* whether downward ports are pointing to CAs or switches,
			   we don't assign indexes to switches that are located
			   lower than leaf switches */
		} else {
			/* This is not the leaf switch */
			for (i = 0; i < p_sw->down_port_groups_num; i++) {
				/* Work with port groups that are pointing to switches only.
				   No need to assign indexing to HCAs */
				if (p_sw->down_port_groups[i]->
				    remote_node_type != IB_NODE_TYPE_SWITCH)
					continue;

				p_remote_sw =
				    p_sw->down_port_groups[i]->remote_hca_or_sw.
				    p_sw;
				if (__osm_ftree_tuple_assigned
				    (p_remote_sw->tuple)) {
					/* this switch has been already indexed */
					continue;
				}
				/* allocate new tuple */
				__osm_ftree_fabric_get_new_tuple(p_ftree,
								 new_tuple,
								 p_sw->tuple,
								 FTREE_DIRECTION_DOWN);
				/* Assign the new tuple to the remote switch.
				   This function also adds the switch into the switch_by_tuple table. */
				__osm_ftree_fabric_assign_tuple(p_ftree,
								p_remote_sw,
								new_tuple);

				/* add the newly discovered switch to the BFS queue */
				cl_list_insert_tail(&bfs_list,
						    &__osm_ftree_sw_tbl_element_create
						    (p_remote_sw)->map_item);
			}
			/* Done assigning indexes to all the remote switches
			   that are pointed by the downgoing ports.
			   Now sort port groups according to remote index. */
			qsort(p_sw->down_port_groups,	/* array */
			      p_sw->down_port_groups_num,	/* number of elements */
			      sizeof(ftree_port_group_t *),	/* size of each element */
			      __osm_ftree_compare_port_groups_by_remote_switch_index);	/* comparator */
		}

		/* Done indexing switches from ports that go down.
		   Now do the same with ports that are pointing up. */

		if (p_sw->rank != 0) {
			/* This is not the root switch, which means that all the ports
			   that are pointing up are taking us to other switches. */
			for (i = 0; i < p_sw->up_port_groups_num; i++) {
				p_remote_sw =
				    p_sw->up_port_groups[i]->remote_hca_or_sw.
				    p_sw;
				if (__osm_ftree_tuple_assigned
				    (p_remote_sw->tuple))
					continue;
				/* allocate new tuple */
				__osm_ftree_fabric_get_new_tuple(p_ftree,
								 new_tuple,
								 p_sw->tuple,
								 FTREE_DIRECTION_UP);
				/* Assign the new tuple to the remote switch.
				   This function also adds the switch to the
				   switch_by_tuple table. */
				__osm_ftree_fabric_assign_tuple(p_ftree,
								p_remote_sw,
								new_tuple);
				/* add the newly discovered switch to the BFS queue */
				cl_list_insert_tail(&bfs_list,
						    &__osm_ftree_sw_tbl_element_create
						    (p_remote_sw)->map_item);
			}
			/* Done assigning indexes to all the remote switches
			   that are pointed by the upgoing ports.
			   Now sort port groups according to remote index. */
			qsort(p_sw->up_port_groups,	/* array */
			      p_sw->up_port_groups_num,	/* number of elements */
			      sizeof(ftree_port_group_t *),	/* size of each element */
			      __osm_ftree_compare_port_groups_by_remote_switch_index);	/* comparator */
		}
		/* Done assigning indexes to all the switches that are directly connected
		   to the current switch - go to the next switch in the BFS queue */
	}
	cl_list_destroy(&bfs_list);

	OSM_LOG_EXIT(&p_ftree->p_osm->log);
}				/* __osm_ftree_fabric_make_indexing() */
1569
1570/***************************************************/
1571
1572static int __osm_ftree_fabric_create_leaf_switch_array(IN ftree_fabric_t *
1573						       p_ftree)
1574{
1575	ftree_sw_t *p_sw;
1576	ftree_sw_t *p_next_sw;
1577	ftree_sw_t **all_switches_at_leaf_level;
1578	unsigned i;
1579	unsigned all_leaf_idx = 0;
1580	unsigned first_leaf_idx;
1581	unsigned last_leaf_idx;
1582	int res = 0;
1583
1584	OSM_LOG_ENTER(&p_ftree->p_osm->log);
1585
1586	/* create array of ALL the switches that have leaf rank */
1587	all_switches_at_leaf_level = (ftree_sw_t **)
1588	    malloc(cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
1589	if (!all_switches_at_leaf_level) {
1590		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
1591			"Fat-tree routing: Memory allocation failed\n");
1592		res = -1;
1593		goto Exit;
1594	}
1595	memset(all_switches_at_leaf_level, 0,
1596	       cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
1597
1598	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1599	while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1600		p_sw = p_next_sw;
1601		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1602		if (p_sw->rank == p_ftree->leaf_switch_rank) {
1603			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1604				"Adding switch 0x%" PRIx64
1605				" to full leaf switch array\n",
1606				__osm_ftree_sw_get_guid_ho(p_sw));
1607			all_switches_at_leaf_level[all_leaf_idx++] = p_sw;
1608
1609		}
1610	}
1611
1612	/* quick-sort array of leaf switches by index */
1613	qsort(all_switches_at_leaf_level,	/* array */
1614	      all_leaf_idx,	/* number of elements */
1615	      sizeof(ftree_sw_t *),	/* size of each element */
1616	      __osm_ftree_compare_switches_by_index);	/* comparator */
1617
1618	/* check the first and the last REAL leaf (the one
1619	   that has CNs) in the array of all the leafs */
1620
1621	first_leaf_idx = all_leaf_idx;
1622	last_leaf_idx = 0;
1623	for (i = 0; i < all_leaf_idx; i++) {
1624		if (all_switches_at_leaf_level[i]->is_leaf) {
1625			if (i < first_leaf_idx)
1626				first_leaf_idx = i;
1627			last_leaf_idx = i;
1628		}
1629	}
1630	CL_ASSERT(first_leaf_idx < last_leaf_idx);
1631
1632	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1633		"Full leaf array info: first_leaf_idx = %u, last_leaf_idx = %u\n",
1634		first_leaf_idx, last_leaf_idx);
1635
1636	/* Create array of REAL leaf switches, sorted by index.
1637	   This array may contain switches at the same rank w/o CNs,
1638	   in case this is the order of indexing. */
1639	p_ftree->leaf_switches_num = last_leaf_idx - first_leaf_idx + 1;
1640	p_ftree->leaf_switches = (ftree_sw_t **)
1641	    malloc(p_ftree->leaf_switches_num * sizeof(ftree_sw_t *));
1642	if (!p_ftree->leaf_switches) {
1643		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
1644			"Fat-tree routing: Memory allocation failed\n");
1645		res = -1;
1646		goto Exit;
1647	}
1648
1649	memcpy(p_ftree->leaf_switches,
1650	       &(all_switches_at_leaf_level[first_leaf_idx]),
1651	       p_ftree->leaf_switches_num * sizeof(ftree_sw_t *));
1652
1653	free(all_switches_at_leaf_level);
1654
1655	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1656		"Created array of %u leaf switches\n",
1657		p_ftree->leaf_switches_num);
1658
1659Exit:
1660	OSM_LOG_EXIT(&p_ftree->p_osm->log);
1661	return res;
1662}				/* __osm_ftree_fabric_create_leaf_switch_array() */
1663
1664/***************************************************/
1665
1666static void __osm_ftree_fabric_set_max_cn_per_leaf(IN ftree_fabric_t * p_ftree)
1667{
1668	unsigned i;
1669	unsigned j;
1670	unsigned cns_on_this_leaf;
1671	ftree_sw_t *p_sw;
1672	ftree_port_group_t *p_group;
1673
1674	for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1675		p_sw = p_ftree->leaf_switches[i];
1676		cns_on_this_leaf = 0;
1677		for (j = 0; j < p_sw->down_port_groups_num; j++) {
1678			p_group = p_sw->down_port_groups[j];
1679			if (p_group->remote_node_type != IB_NODE_TYPE_CA)
1680				continue;
1681			cns_on_this_leaf +=
1682			    p_group->remote_hca_or_sw.p_hca->cn_num;
1683		}
1684		if (cns_on_this_leaf > p_ftree->max_cn_per_leaf)
1685			p_ftree->max_cn_per_leaf = cns_on_this_leaf;
1686	}
1687}				/* __osm_ftree_fabric_set_max_cn_per_leaf() */
1688
1689/***************************************************/
1690
1691static boolean_t __osm_ftree_fabric_validate_topology(IN ftree_fabric_t *
1692						      p_ftree)
1693{
1694	ftree_port_group_t *p_group;
1695	ftree_port_group_t *p_ref_group;
1696	ftree_sw_t *p_sw;
1697	ftree_sw_t *p_next_sw;
1698	ftree_sw_t **reference_sw_arr;
1699	uint16_t tree_rank = __osm_ftree_fabric_get_rank(p_ftree);
1700	boolean_t res = TRUE;
1701	uint8_t i;
1702
1703	OSM_LOG_ENTER(&p_ftree->p_osm->log);
1704
1705	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1706		"Validating fabric topology\n");
1707
1708	reference_sw_arr =
1709	    (ftree_sw_t **) malloc(tree_rank * sizeof(ftree_sw_t *));
1710	if (reference_sw_arr == NULL) {
1711		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
1712			"Fat-tree routing: Memory allocation failed\n");
1713		return FALSE;
1714	}
1715	memset(reference_sw_arr, 0, tree_rank * sizeof(ftree_sw_t *));
1716
1717	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1718	while (res && p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1719		p_sw = p_next_sw;
1720		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1721
1722		if (!reference_sw_arr[p_sw->rank]) {
1723			/* This is the first switch in the current level that
1724			   we're checking - use it as a reference */
1725			reference_sw_arr[p_sw->rank] = p_sw;
1726		} else {
1727			/* compare this switch properties to the reference switch */
1728
1729			if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1730			    p_sw->up_port_groups_num) {
1731				OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1732					"ERR AB09: Different number of upward port groups on switches:\n"
1733					"       GUID 0x%016" PRIx64
1734					", LID %u, Index %s - %u groups\n"
1735					"       GUID 0x%016" PRIx64
1736					", LID %u, Index %s - %u groups\n",
1737					__osm_ftree_sw_get_guid_ho
1738					(reference_sw_arr[p_sw->rank]),
1739					cl_ntoh16(reference_sw_arr[p_sw->rank]->
1740						  base_lid),
1741					__osm_ftree_tuple_to_str
1742					(reference_sw_arr[p_sw->rank]->tuple),
1743					reference_sw_arr[p_sw->rank]->
1744					up_port_groups_num,
1745					__osm_ftree_sw_get_guid_ho(p_sw),
1746					cl_ntoh16(p_sw->base_lid),
1747					__osm_ftree_tuple_to_str(p_sw->tuple),
1748					p_sw->up_port_groups_num);
1749				res = FALSE;
1750				break;
1751			}
1752
1753			if (p_sw->rank != (tree_rank - 1) &&
1754			    reference_sw_arr[p_sw->rank]->
1755			    down_port_groups_num !=
1756			    p_sw->down_port_groups_num) {
1757				/* we're allowing some hca's to be missing */
1758				OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1759					"ERR AB0A: Different number of downward port groups on switches:\n"
1760					"       GUID 0x%016" PRIx64
1761					", LID %u, Index %s - %u port groups\n"
1762					"       GUID 0x%016" PRIx64
1763					", LID %u, Index %s - %u port groups\n",
1764					__osm_ftree_sw_get_guid_ho
1765					(reference_sw_arr[p_sw->rank]),
1766					cl_ntoh16(reference_sw_arr[p_sw->rank]->
1767						  base_lid),
1768					__osm_ftree_tuple_to_str
1769					(reference_sw_arr[p_sw->rank]->tuple),
1770					reference_sw_arr[p_sw->rank]->
1771					down_port_groups_num,
1772					__osm_ftree_sw_get_guid_ho(p_sw),
1773					cl_ntoh16(p_sw->base_lid),
1774					__osm_ftree_tuple_to_str(p_sw->tuple),
1775					p_sw->down_port_groups_num);
1776				res = FALSE;
1777				break;
1778			}
1779
1780			if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1781			    0) {
1782				p_ref_group =
1783				    reference_sw_arr[p_sw->rank]->
1784				    up_port_groups[0];
1785				for (i = 0; i < p_sw->up_port_groups_num; i++) {
1786					p_group = p_sw->up_port_groups[i];
1787					if (cl_ptr_vector_get_size
1788					    (&p_ref_group->ports) !=
1789					    cl_ptr_vector_get_size(&p_group->
1790								   ports)) {
1791						OSM_LOG(&p_ftree->p_osm->log,
1792							OSM_LOG_ERROR,
1793							"ERR AB0B: Different number of ports in an upward port group on switches:\n"
1794							"       GUID 0x%016"
1795							PRIx64
1796							", LID %u, Index %s - %u ports\n"
1797							"       GUID 0x%016"
1798							PRIx64
1799							", LID %u, Index %s - %u ports\n",
1800							__osm_ftree_sw_get_guid_ho
1801							(reference_sw_arr
1802							 [p_sw->rank]),
1803							cl_ntoh16
1804							(reference_sw_arr
1805							 [p_sw->rank]->
1806							 base_lid),
1807							__osm_ftree_tuple_to_str
1808							(reference_sw_arr
1809							 [p_sw->rank]->tuple),
1810							cl_ptr_vector_get_size
1811							(&p_ref_group->ports),
1812							__osm_ftree_sw_get_guid_ho
1813							(p_sw),
1814							cl_ntoh16(p_sw->
1815								  base_lid),
1816							__osm_ftree_tuple_to_str
1817							(p_sw->tuple),
1818							cl_ptr_vector_get_size
1819							(&p_group->ports));
1820						res = FALSE;
1821						break;
1822					}
1823				}
1824			}
1825			if (reference_sw_arr[p_sw->rank]->
1826			    down_port_groups_num != 0
1827			    && p_sw->rank != (tree_rank - 1)) {
1828				/* we're allowing some hca's to be missing */
1829				p_ref_group =
1830				    reference_sw_arr[p_sw->rank]->
1831				    down_port_groups[0];
1832				for (i = 0; i < p_sw->down_port_groups_num; i++) {
1833					p_group = p_sw->down_port_groups[0];
1834					if (cl_ptr_vector_get_size
1835					    (&p_ref_group->ports) !=
1836					    cl_ptr_vector_get_size(&p_group->
1837								   ports)) {
1838						OSM_LOG(&p_ftree->p_osm->log,
1839							OSM_LOG_ERROR,
1840							"ERR AB0C: Different number of ports in an downward port group on switches:\n"
1841							"       GUID 0x%016"
1842							PRIx64
1843							", LID %u, Index %s - %u ports\n"
1844							"       GUID 0x%016"
1845							PRIx64
1846							", LID %u, Index %s - %u ports\n",
1847							__osm_ftree_sw_get_guid_ho
1848							(reference_sw_arr
1849							 [p_sw->rank]),
1850							cl_ntoh16
1851							(reference_sw_arr
1852							 [p_sw->rank]->
1853							 base_lid),
1854							__osm_ftree_tuple_to_str
1855							(reference_sw_arr
1856							 [p_sw->rank]->tuple),
1857							cl_ptr_vector_get_size
1858							(&p_ref_group->ports),
1859							__osm_ftree_sw_get_guid_ho
1860							(p_sw),
1861							cl_ntoh16(p_sw->
1862								  base_lid),
1863							__osm_ftree_tuple_to_str
1864							(p_sw->tuple),
1865							cl_ptr_vector_get_size
1866							(&p_group->ports));
1867						res = FALSE;
1868						break;
1869					}
1870				}
1871			}
1872		}		/* end of else */
1873	}			/* end of while */
1874
1875	if (res == TRUE)
1876		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1877			"Fabric topology has been identified as FatTree\n");
1878	else
1879		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1880			"ERR AB0D: Fabric topology hasn't been identified as FatTree\n");
1881
1882	free(reference_sw_arr);
1883	OSM_LOG_EXIT(&p_ftree->p_osm->log);
1884	return res;
1885}				/* __osm_ftree_fabric_validate_topology() */
1886
1887/***************************************************
1888 ***************************************************/
1889
1890static void __osm_ftree_set_sw_fwd_table(IN cl_map_item_t * const p_map_item,
1891					 IN void *context)
1892{
1893	ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
1894	ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
1895
1896	p_sw->p_osm_sw->max_lid_ho = p_ftree->lft_max_lid_ho;
1897	osm_ucast_mgr_set_fwd_table(&p_ftree->p_osm->sm.ucast_mgr,
1898				    p_sw->p_osm_sw);
1899}
1900
1901/***************************************************
1902 ***************************************************/
1903
1904/*
1905 * Function: assign-up-going-port-by-descending-down
1906 * Given   : a switch and a LID
1907 * Pseudo code:
1908 *    foreach down-going-port-group (in indexing order)
1909 *        skip this group if the LFT(LID) port is part of this group
1910 *        find the least loaded port of the group (scan in indexing order)
1911 *        r-port is the remote port connected to it
1912 *        assign the remote switch node LFT(LID) to r-port
1913 *        increase r-port usage counter
1914 *        assign-up-going-port-by-descending-down to r-port node (recursion)
1915 */
1916
1917static void
1918__osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
1919					       IN ftree_sw_t * p_sw,
1920					       IN ftree_sw_t * p_prev_sw,
1921					       IN ib_net16_t target_lid,
1922					       IN uint8_t target_rank,
1923					       IN boolean_t is_real_lid,
1924					       IN boolean_t is_main_path,
1925					       IN uint8_t highest_rank_in_route)
1926{
1927	ftree_sw_t *p_remote_sw;
1928	uint16_t ports_num;
1929	ftree_port_group_t *p_group;
1930	ftree_port_t *p_port;
1931	ftree_port_t *p_min_port;
1932	uint16_t i;
1933	uint16_t j;
1934	uint16_t k;
1935
1936	/* we shouldn't enter here if both real_lid and main_path are false */
1937	CL_ASSERT(is_real_lid || is_main_path);
1938
1939	/* if there is no down-going ports */
1940	if (p_sw->down_port_groups_num == 0)
1941		return;
1942
1943	/* promote the index that indicates which group should we
1944	   start with when going through all the downgoing groups */
1945	p_sw->down_port_groups_idx =
1946		(p_sw->down_port_groups_idx + 1) % p_sw->down_port_groups_num;
1947
1948	/* foreach down-going port group (in indexing order) */
1949	i = p_sw->down_port_groups_idx;
1950	for (k = 0; k < p_sw->down_port_groups_num; k++) {
1951
1952		p_group = p_sw->down_port_groups[i];
1953		i = (i + 1) % p_sw->down_port_groups_num;
1954
1955		/* Skip this port group unless it points to a switch */
1956		if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH)
1957			continue;
1958
1959		if (p_prev_sw
1960		    && (p_group->remote_base_lid == p_prev_sw->base_lid)) {
1961			/* This port group has a port that was used when we entered this switch,
1962			   which means that the current group points to the switch where we were
1963			   at the previous step of the algorithm (before going up).
1964			   Skipping this group. */
1965			continue;
1966		}
1967
1968		/* find the least loaded port of the group (in indexing order) */
1969		p_min_port = NULL;
1970		ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
1971		/* ToDo: no need to select a least loaded port for non-main path.
1972		   Think about optimization. */
1973		for (j = 0; j < ports_num; j++) {
1974			cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
1975			if (!p_min_port) {
1976				/* first port that we're checking - set as port with the lowest load */
1977				p_min_port = p_port;
1978			} else if (p_port->counter_up < p_min_port->counter_up) {
1979				/* this port is less loaded - use it as min */
1980				p_min_port = p_port;
1981			}
1982		}
1983		/* At this point we have selected a port in this group with the
1984		   lowest load of upgoing routes.
1985		   Set on the remote switch how to get to the target_lid -
1986		   set LFT(target_lid) on the remote switch to the remote port */
1987		p_remote_sw = p_group->remote_hca_or_sw.p_sw;
1988
1989		if (osm_switch_get_least_hops(p_remote_sw->p_osm_sw,
1990					      cl_ntoh16(target_lid)) !=
1991		    OSM_NO_PATH) {
1992			/* Loop in the fabric - we already routed the remote switch
1993			   on our way UP, and now we see it again on our way DOWN */
1994			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1995				"Loop of lenght %d in the fabric:\n                             "
1996				"Switch %s (LID %u) closes loop through switch %s (LID %u)\n",
1997				(p_remote_sw->rank - highest_rank_in_route) * 2,
1998				__osm_ftree_tuple_to_str(p_remote_sw->tuple),
1999				cl_ntoh16(p_group->base_lid),
2000				__osm_ftree_tuple_to_str(p_sw->tuple),
2001				cl_ntoh16(p_group->remote_base_lid));
2002			continue;
2003		}
2004
2005		/* Four possible cases:
2006		 *
2007		 *  1. is_real_lid == TRUE && is_main_path == TRUE:
2008		 *      - going DOWN(TRUE,TRUE) through ALL the groups
2009		 *         + promoting port counter
2010		 *         + setting path in remote switch fwd tbl
2011		 *         + setting hops in remote switch on all the ports of each group
2012		 *
2013		 *  2. is_real_lid == TRUE && is_main_path == FALSE:
2014		 *      - going DOWN(TRUE,FALSE) through ALL the groups but only if
2015		 *        the remote (lower) switch hasn't been already configured
2016		 *        for this target LID
2017		 *         + NOT promoting port counter
2018		 *         + setting path in remote switch fwd tbl if it hasn't been set yet
2019		 *         + setting hops in remote switch on all the ports of each group
2020		 *           if it hasn't been set yet
2021		 *
2022		 *  3. is_real_lid == FALSE && is_main_path == TRUE:
2023		 *      - going DOWN(FALSE,TRUE) through ALL the groups
2024		 *         + promoting port counter
2025		 *         + NOT setting path in remote switch fwd tbl
2026		 *         + NOT setting hops in remote switch
2027		 *
2028		 *  4. is_real_lid == FALSE && is_main_path == FALSE:
2029		 *      - illegal state - we shouldn't get here
2030		 */
2031
2032		/* second case: skip the port group if the remote (lower)
2033		   switch has been already configured for this target LID */
2034		if (is_real_lid && !is_main_path &&
2035		    p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] != OSM_NO_PATH)
2036			continue;
2037
2038		/* setting fwd tbl port only if this is real LID */
2039		if (is_real_lid) {
2040			p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] =
2041				p_min_port->remote_port_num;
2042			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2043				"Switch %s: set path to CA LID %u through port %u\n",
2044				__osm_ftree_tuple_to_str(p_remote_sw->tuple),
2045				cl_ntoh16(target_lid),
2046				p_min_port->remote_port_num);
2047
2048			/* On the remote switch that is pointed by the p_group,
2049			   set hops for ALL the ports in the remote group. */
2050
2051			for (j = 0; j < ports_num; j++) {
2052				cl_ptr_vector_at(&p_group->ports, j,
2053						 (void *)&p_port);
2054
2055				__osm_ftree_sw_set_hops(p_remote_sw,
2056							cl_ntoh16(target_lid),
2057							p_port->remote_port_num,
2058							((target_rank -
2059							  highest_rank_in_route)
2060							 + (p_remote_sw->rank -
2061							    highest_rank_in_route)));
2062			}
2063
2064		}
2065
2066		/* The number of upgoing routes is tracked in the
2067		   p_port->counter_up counter of the port that belongs to
2068		   the upper side of the link (on switch with lower rank).
2069		   Counter is promoted only if we're routing LID on the main
2070		   path (whether it's a real LID or a dummy one). */
2071		if (is_main_path)
2072			p_min_port->counter_up++;
2073
2074		/* Recursion step:
2075		   Assign upgoing ports by stepping down, starting on REMOTE switch */
2076		__osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_remote_sw,	/* remote switch - used as a route-upgoing alg. start point */
2077							       NULL,	/* prev. position - NULL to mark that we went down and not up */
2078							       target_lid,	/* LID that we're routing to */
2079							       target_rank,	/* rank of the LID that we're routing to */
2080							       is_real_lid,	/* whether the target LID is real or dummy */
2081							       is_main_path,	/* whether this is path to HCA that should by tracked by counters */
2082							       highest_rank_in_route);	/* highest visited point in the tree before going down */
2083	}
2084	/* done scanning all the down-going port groups */
2085
2086}				/* __osm_ftree_fabric_route_upgoing_by_going_down() */
2087
2088/***************************************************/
2089
2090/*
2091 * Function: assign-down-going-port-by-ascending-up
2092 * Given   : a switch and a LID
2093 * Pseudo code:
2094 *    find the least loaded port of all the upgoing groups (scan in indexing order)
2095 *    assign the LFT(LID) of remote switch to that port
2096 *    track that port usage
2097 *    assign-up-going-port-by-descending-down on CURRENT switch
2098 *    assign-down-going-port-by-ascending-up on REMOTE switch (recursion)
2099 */
2100
2101static void
2102__osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
2103					       IN ftree_sw_t * p_sw,
2104					       IN ftree_sw_t * p_prev_sw,
2105					       IN ib_net16_t target_lid,
2106					       IN uint8_t target_rank,
2107					       IN boolean_t is_real_lid,
2108					       IN boolean_t is_main_path)
2109{
2110	ftree_sw_t *p_remote_sw;
2111	uint16_t ports_num;
2112	ftree_port_group_t *p_group;
2113	ftree_port_t *p_port;
2114	ftree_port_group_t *p_min_group;
2115	ftree_port_t *p_min_port;
2116	uint16_t i;
2117	uint16_t j;
2118
2119	/* we shouldn't enter here if both real_lid and main_path are false */
2120	CL_ASSERT(is_real_lid || is_main_path);
2121
2122	/* Assign upgoing ports by stepping down, starting on THIS switch */
2123	__osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_sw,	/* local switch - used as a route-upgoing alg. start point */
2124						       p_prev_sw,	/* switch that we went up from (NULL means that we went down) */
2125						       target_lid,	/* LID that we're routing to */
2126						       target_rank,	/* rank of the LID that we're routing to */
2127						       is_real_lid,	/* whether this target LID is real or dummy */
2128						       is_main_path,	/* whether this path to HCA should by tracked by counters */
2129						       p_sw->rank);	/* the highest visited point in the tree before going down */
2130
2131	/* recursion stop condition - if it's a root switch, */
2132	if (p_sw->rank == 0)
2133		return;
2134
2135	/* Find the least loaded upgoing port group */
2136	p_min_group = NULL;
2137	for (i = 0; i < p_sw->up_port_groups_num; i++) {
2138		p_group = p_sw->up_port_groups[i];
2139		if (!p_min_group) {
2140			/* first group that we're checking - use
2141			   it as a group with the lowest load */
2142			p_min_group = p_group;
2143		} else if (p_group->counter_down < p_min_group->counter_down) {
2144			/* this group is less loaded - use it as min */
2145			p_min_group = p_group;
2146		}
2147	}
2148
2149	/* Find the least loaded upgoing port in the selected group */
2150	p_min_port = NULL;
2151	ports_num = (uint16_t) cl_ptr_vector_get_size(&p_min_group->ports);
2152	for (j = 0; j < ports_num; j++) {
2153		cl_ptr_vector_at(&p_min_group->ports, j, (void *)&p_port);
2154		if (!p_min_port) {
2155			/* first port that we're checking - use
2156			   it as a port with the lowest load */
2157			p_min_port = p_port;
2158		} else if (p_port->counter_down < p_min_port->counter_down) {
2159			/* this port is less loaded - use it as min */
2160			p_min_port = p_port;
2161		}
2162	}
2163
2164	/* At this point we have selected a group and port with the
2165	   lowest load of downgoing routes.
2166	   Set on the remote switch how to get to the target_lid -
2167	   set LFT(target_lid) on the remote switch to the remote port */
2168	p_remote_sw = p_min_group->remote_hca_or_sw.p_sw;
2169
2170	/* Four possible cases:
2171	 *
2172	 *  1. is_real_lid == TRUE && is_main_path == TRUE:
2173	 *      - going UP(TRUE,TRUE) on selected min_group and min_port
2174	 *         + promoting port counter
2175	 *         + setting path in remote switch fwd tbl
2176	 *         + setting hops in remote switch on all the ports of selected group
2177	 *      - going UP(TRUE,FALSE) on rest of the groups, each time on port 0
2178	 *         + NOT promoting port counter
2179	 *         + setting path in remote switch fwd tbl if it hasn't been set yet
2180	 *         + setting hops in remote switch on all the ports of each group
2181	 *           if it hasn't been set yet
2182	 *
2183	 *  2. is_real_lid == TRUE && is_main_path == FALSE:
2184	 *      - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2185	 *        but only if the remote (upper) switch hasn't been already
2186	 *        configured for this target LID
2187	 *         + NOT promoting port counter
2188	 *         + setting path in remote switch fwd tbl if it hasn't been set yet
2189	 *         + setting hops in remote switch on all the ports of each group
2190	 *           if it hasn't been set yet
2191	 *
2192	 *  3. is_real_lid == FALSE && is_main_path == TRUE:
2193	 *      - going UP(FALSE,TRUE) ONLY on selected min_group and min_port
2194	 *         + promoting port counter
2195	 *         + NOT setting path in remote switch fwd tbl
2196	 *         + NOT setting hops in remote switch
2197	 *
2198	 *  4. is_real_lid == FALSE && is_main_path == FALSE:
2199	 *      - illegal state - we shouldn't get here
2200	 */
2201
2202	/* covering first half of case 1, and case 3 */
2203	if (is_main_path) {
2204		if (p_sw->is_leaf) {
2205			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2206				" - Routing MAIN path for %s CA LID %u: %s --> %s\n",
2207				(is_real_lid) ? "real" : "DUMMY",
2208				cl_ntoh16(target_lid),
2209				__osm_ftree_tuple_to_str(p_sw->tuple),
2210				__osm_ftree_tuple_to_str(p_remote_sw->tuple));
2211		}
2212		/* The number of downgoing routes is tracked in the
2213		   p_group->counter_down p_port->counter_down counters of the
2214		   group and port that belong to the lower side of the link
2215		   (on switch with higher rank) */
2216		p_min_group->counter_down++;
2217		p_min_port->counter_down++;
2218		if (is_real_lid) {
2219			p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] =
2220				p_min_port->remote_port_num;
2221			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2222				"Switch %s: set path to CA LID %u through port %u\n",
2223				__osm_ftree_tuple_to_str(p_remote_sw->tuple),
2224				cl_ntoh16(target_lid),
2225				p_min_port->remote_port_num);
2226
2227			/* On the remote switch that is pointed by the min_group,
2228			   set hops for ALL the ports in the remote group. */
2229
2230			ports_num =
2231			    (uint16_t) cl_ptr_vector_get_size(&p_min_group->
2232							      ports);
2233			for (j = 0; j < ports_num; j++) {
2234				cl_ptr_vector_at(&p_min_group->ports, j,
2235						 (void *)&p_port);
2236				__osm_ftree_sw_set_hops(p_remote_sw,
2237							cl_ntoh16(target_lid),
2238							p_port->remote_port_num,
2239							target_rank -
2240							p_remote_sw->rank);
2241			}
2242		}
2243
2244		/* Recursion step:
2245		   Assign downgoing ports by stepping up, starting on REMOTE switch. */
2246		__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
2247							       p_sw,	/* this switch - prev. position switch for the function */
2248							       target_lid,	/* LID that we're routing to */
2249							       target_rank,	/* rank of the LID that we're routing to */
2250							       is_real_lid,	/* whether this target LID is real or dummy */
2251							       is_main_path);	/* whether this is path to HCA that should by tracked by counters */
2252	}
2253
2254	/* we're done for the third case */
2255	if (!is_real_lid)
2256		return;
2257
2258	/* What's left to do at this point:
2259	 *
2260	 *  1. is_real_lid == TRUE && is_main_path == TRUE:
2261	 *      - going UP(TRUE,FALSE) on rest of the groups, each time on port 0,
2262	 *        but only if the remote (upper) switch hasn't been already
2263	 *        configured for this target LID
2264	 *         + NOT promoting port counter
2265	 *         + setting path in remote switch fwd tbl if it hasn't been set yet
2266	 *         + setting hops in remote switch on all the ports of each group
2267	 *           if it hasn't been set yet
2268	 *
2269	 *  2. is_real_lid == TRUE && is_main_path == FALSE:
2270	 *      - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2271	 *        but only if the remote (upper) switch hasn't been already
2272	 *        configured for this target LID
2273	 *         + NOT promoting port counter
2274	 *         + setting path in remote switch fwd tbl if it hasn't been set yet
2275	 *         + setting hops in remote switch on all the ports of each group
2276	 *           if it hasn't been set yet
2277	 *
2278	 *  These two rules can be rephrased this way:
2279	 *   - foreach UP port group
2280	 *      + if remote switch has been set with the target LID
2281	 *         - skip this port group
2282	 *      + else
2283	 *         - select port 0
2284	 *         - do NOT promote port counter
2285	 *         - set path in remote switch fwd tbl
2286	 *         - set hops in remote switch on all the ports of this group
2287	 *         - go UP(TRUE,FALSE) to the remote switch
2288	 */
2289
2290	for (i = 0; i < p_sw->up_port_groups_num; i++) {
2291		p_group = p_sw->up_port_groups[i];
2292		p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2293
2294		/* skip if target lid has been already set on remote switch fwd tbl */
2295		if (p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] != OSM_NO_PATH)
2296			continue;
2297
2298		if (p_sw->is_leaf) {
2299			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2300				" - Routing SECONDARY path for LID %u: %s --> %s\n",
2301				cl_ntoh16(target_lid),
2302				__osm_ftree_tuple_to_str(p_sw->tuple),
2303				__osm_ftree_tuple_to_str(p_remote_sw->tuple));
2304		}
2305
2306		/* Routing REAL lids on SECONDARY path means routing
2307		   switch-to-switch or switch-to-CA paths.
2308		   We can safely assume that switch will initiate very
2309		   few traffic, so there's no point waisting runtime on
2310		   trying to balance these routes - always pick port 0. */
2311
2312		cl_ptr_vector_at(&p_group->ports, 0, (void *)&p_port);
2313		p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] =
2314			p_port->remote_port_num;
2315
2316		/* On the remote switch that is pointed by the p_group,
2317		   set hops for ALL the ports in the remote group. */
2318
2319		ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2320		for (j = 0; j < ports_num; j++) {
2321			cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2322
2323			__osm_ftree_sw_set_hops(p_remote_sw,
2324						cl_ntoh16(target_lid),
2325						p_port->remote_port_num,
2326						target_rank -
2327						p_remote_sw->rank);
2328		}
2329
2330		/* Recursion step:
2331		   Assign downgoing ports by stepping up, starting on REMOTE switch. */
2332		__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
2333							       p_sw,	/* this switch - prev. position switch for the function */
2334							       target_lid,	/* LID that we're routing to */
2335							       target_rank,	/* rank of the LID that we're routing to */
2336							       TRUE,	/* whether the target LID is real or dummy */
2337							       FALSE);	/* whether this is path to HCA that should by tracked by counters */
2338	}
2339
2340}				/* ftree_fabric_route_downgoing_by_going_up() */
2341
2342/***************************************************/
2343
2344/*
2345 * Pseudo code:
2346 *    foreach leaf switch (in indexing order)
2347 *       for each compute node (in indexing order)
2348 *          obtain the LID of the compute node
2349 *          set local LFT(LID) of the port connecting to compute node
2350 *          call assign-down-going-port-by-ascending-up(TRUE,TRUE) on CURRENT switch
2351 *       for each MISSING compute node
2352 *          call assign-down-going-port-by-ascending-up(FALSE,TRUE) on CURRENT switch
2353 */
2354
2355static void __osm_ftree_fabric_route_to_cns(IN ftree_fabric_t * p_ftree)
2356{
2357	ftree_sw_t *p_sw;
2358	ftree_hca_t *p_hca;
2359	ftree_port_group_t *p_leaf_port_group;
2360	ftree_port_group_t *p_hca_port_group;
2361	ftree_port_t *p_port;
2362	uint32_t i;
2363	uint32_t j;
2364	ib_net16_t hca_lid;
2365	unsigned routed_targets_on_leaf;
2366
2367	OSM_LOG_ENTER(&p_ftree->p_osm->log);
2368
2369	/* for each leaf switch (in indexing order) */
2370	for (i = 0; i < p_ftree->leaf_switches_num; i++) {
2371		p_sw = p_ftree->leaf_switches[i];
2372		routed_targets_on_leaf = 0;
2373
2374		/* for each HCA connected to this switch */
2375		for (j = 0; j < p_sw->down_port_groups_num; j++) {
2376			p_leaf_port_group = p_sw->down_port_groups[j];
2377
2378			/* work with this port group only if the remote node is CA */
2379			if (p_leaf_port_group->remote_node_type !=
2380			    IB_NODE_TYPE_CA)
2381				continue;
2382
2383			p_hca = p_leaf_port_group->remote_hca_or_sw.p_hca;
2384
2385			/* work with this port group only if remote HCA has CNs */
2386			if (!p_hca->cn_num)
2387				continue;
2388
2389			p_hca_port_group =
2390			    __osm_ftree_hca_get_port_group_by_remote_lid(p_hca,
2391									 p_leaf_port_group->
2392									 base_lid);
2393			CL_ASSERT(p_hca_port_group);
2394
2395			/* work with this port group only if remote port is CN */
2396			if (!p_hca_port_group->is_cn)
2397				continue;
2398
2399			/* obtain the LID of HCA port */
2400			hca_lid = p_leaf_port_group->remote_base_lid;
2401
2402			/* set local LFT(LID) to the port that is connected to HCA */
2403			cl_ptr_vector_at(&p_leaf_port_group->ports, 0,
2404					 (void *)&p_port);
2405			p_sw->p_osm_sw->new_lft[cl_ntoh16(hca_lid)] = p_port->port_num;
2406
2407			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2408				"Switch %s: set path to CN LID %u through port %u\n",
2409				__osm_ftree_tuple_to_str(p_sw->tuple),
2410				cl_ntoh16(hca_lid), p_port->port_num);
2411
2412			/* set local min hop table(LID) to route to the CA */
2413			__osm_ftree_sw_set_hops(p_sw,
2414						cl_ntoh16(hca_lid),
2415						p_port->port_num, 1);
2416
2417			/* Assign downgoing ports by stepping up.
2418			   Since we're routing here only CNs, we're routing it as REAL
2419			   LID and updating fat-tree balancing counters. */
2420			__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw,	/* local switch - used as a route-downgoing alg. start point */
2421								       NULL,	/* prev. position switch */
2422								       hca_lid,	/* LID that we're routing to */
2423								       p_sw->rank + 1,	/* rank of the LID that we're routing to */
2424								       TRUE,	/* whether this HCA LID is real or dummy */
2425								       TRUE);	/* whether this path to HCA should by tracked by counters */
2426
2427			/* count how many real targets have been routed from this leaf switch */
2428			routed_targets_on_leaf++;
2429		}
2430
2431		/* We're done with the real targets (all CNs) of this leaf switch.
2432		   Now route the dummy HCAs that are missing or that are non-CNs.
2433		   When routing to dummy HCAs we don't fill lid matrices. */
2434
2435		if (p_ftree->max_cn_per_leaf > routed_targets_on_leaf) {
2436			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2437				"Routing %u dummy CAs\n",
2438				p_ftree->max_cn_per_leaf -
2439				p_sw->down_port_groups_num);
2440			for (j = 0;
2441			     ((int)j) <
2442			     (p_ftree->max_cn_per_leaf -
2443			      routed_targets_on_leaf); j++) {
2444				/* assign downgoing ports by stepping up */
2445				__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw,	/* local switch - used as a route-downgoing alg. start point */
2446									       NULL,	/* prev. position switch */
2447									       0,	/* LID that we're routing to - ignored for dummy HCA */
2448									       0,	/* rank of the LID that we're routing to - ignored for dummy HCA */
2449									       FALSE,	/* whether this HCA LID is real or dummy */
2450									       TRUE);	/* whether this path to HCA should by tracked by counters */
2451			}
2452		}
2453	}
2454	/* done going through all the leaf switches */
2455	OSM_LOG_EXIT(&p_ftree->p_osm->log);
2456}				/* __osm_ftree_fabric_route_to_cns() */
2457
2458/***************************************************/
2459
2460/*
2461 * Pseudo code:
2462 *    foreach HCA non-CN port in fabric
2463 *       obtain the LID of the HCA port
2464 *       get switch that is connected to this HCA port
 *       set switch LFT(LID) to the port connecting to this HCA port
 *       call assign-down-going-port-by-ascending-up(TRUE,TRUE) on CURRENT switch
2467 *
2468 * Routing to these HCAs is routing a REAL hca lid on SECONDARY path.
2469 * However, we do want to allow load-leveling of the traffic to the non-CNs,
2470 * because such nodes may include IO nodes with heavy usage
2471 *   - we should set fwd tables
2472 *   - we should update port counters
2473 * Routing to non-CNs is done after routing to CNs, so updated port
2474 * counters will not affect CN-to-CN routing.
2475 */
2476
2477static void __osm_ftree_fabric_route_to_non_cns(IN ftree_fabric_t * p_ftree)
2478{
2479	ftree_sw_t *p_sw;
2480	ftree_hca_t *p_hca;
2481	ftree_hca_t *p_next_hca;
2482	ftree_port_t *p_hca_port;
2483	ftree_port_group_t *p_hca_port_group;
2484	ib_net16_t hca_lid;
2485	unsigned port_num_on_switch;
2486	unsigned i;
2487
2488	OSM_LOG_ENTER(&p_ftree->p_osm->log);
2489
2490	p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
2491	while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
2492		p_hca = p_next_hca;
2493		p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
2494
2495		for (i = 0; i < p_hca->up_port_groups_num; i++) {
2496			p_hca_port_group = p_hca->up_port_groups[i];
2497
2498			/* skip this port if it's CN, in which case it has been already routed */
2499			if (p_hca_port_group->is_cn)
2500				continue;
2501
2502			/* skip this port if it is not connected to switch */
2503			if (p_hca_port_group->remote_node_type !=
2504			    IB_NODE_TYPE_SWITCH)
2505				continue;
2506
2507			p_sw = p_hca_port_group->remote_hca_or_sw.p_sw;
2508			hca_lid = p_hca_port_group->base_lid;
2509
2510			/* set switches  LFT(LID) to the port that is connected to HCA */
2511			cl_ptr_vector_at(&p_hca_port_group->ports, 0,
2512					 (void *)&p_hca_port);
2513			port_num_on_switch = p_hca_port->remote_port_num;
2514			p_sw->p_osm_sw->new_lft[cl_ntoh16(hca_lid)] = port_num_on_switch;
2515
2516			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2517				"Switch %s: set path to non-CN HCA LID %u through port %u\n",
2518				__osm_ftree_tuple_to_str(p_sw->tuple),
2519				cl_ntoh16(hca_lid), port_num_on_switch);
2520
2521			/* set local min hop table(LID) to route to the CA */
2522			__osm_ftree_sw_set_hops(p_sw, cl_ntoh16(hca_lid),
2523						port_num_on_switch,	/* port num */
2524						1);	/* hops */
2525
2526			/* Assign downgoing ports by stepping up.
2527			   We're routing REAL targets. They are not CNs and not included
2528			   in the leafs array, but we treat them as MAIN path to allow load
2529			   leveling, which means that the counters will be updated. */
2530			__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw,	/* local switch - used as a route-downgoing alg. start point */
2531								       NULL,	/* prev. position switch */
2532								       hca_lid,	/* LID that we're routing to */
2533								       p_sw->rank + 1,	/* rank of the LID that we're routing to */
2534								       TRUE,	/* whether this HCA LID is real or dummy */
2535								       TRUE);	/* whether this path to HCA should by tracked by counters */
2536		}
2537		/* done with all the port groups of this HCA - go to next HCA */
2538	}
2539
2540	OSM_LOG_EXIT(&p_ftree->p_osm->log);
2541}				/* __osm_ftree_fabric_route_to_non_cns() */
2542
2543/***************************************************/
2544
2545/*
2546 * Pseudo code:
2547 *    foreach switch in fabric
2548 *       obtain its LID
2549 *       set local LFT(LID) to port 0
2550 *       call assign-down-going-port-by-ascending-up(TRUE,FALSE) on CURRENT switch
2551 *
2552 * Routing to switch is similar to routing a REAL hca lid on SECONDARY path:
2553 *   - we should set fwd tables
2554 *   - we should NOT update port counters
2555 */
2556
2557static void __osm_ftree_fabric_route_to_switches(IN ftree_fabric_t * p_ftree)
2558{
2559	ftree_sw_t *p_sw;
2560	ftree_sw_t *p_next_sw;
2561
2562	OSM_LOG_ENTER(&p_ftree->p_osm->log);
2563
2564	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
2565	while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
2566		p_sw = p_next_sw;
2567		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
2568
2569		/* set local LFT(LID) to 0 (route to itself) */
2570		p_sw->p_osm_sw->new_lft[cl_ntoh16(p_sw->base_lid)] = 0;
2571
2572		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2573			"Switch %s (LID %u): routing switch-to-switch paths\n",
2574			__osm_ftree_tuple_to_str(p_sw->tuple),
2575			cl_ntoh16(p_sw->base_lid));
2576
2577		/* set min hop table of the switch to itself */
2578		__osm_ftree_sw_set_hops(p_sw, cl_ntoh16(p_sw->base_lid),
2579					0,	/* port_num */
2580					0);	/* hops     */
2581
2582		__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw,	/* local switch - used as a route-downgoing alg. start point */
2583							       NULL,	/* prev. position switch */
2584							       p_sw->base_lid,	/* LID that we're routing to */
2585							       p_sw->rank,	/* rank of the LID that we're routing to */
2586							       TRUE,	/* whether the target LID is a real or dummy */
2587							       FALSE);	/* whether this path should by tracked by counters */
2588	}
2589
2590	OSM_LOG_EXIT(&p_ftree->p_osm->log);
2591}				/* __osm_ftree_fabric_route_to_switches() */
2592
2593/***************************************************
2594 ***************************************************/
2595
2596static int __osm_ftree_fabric_populate_nodes(IN ftree_fabric_t * p_ftree)
2597{
2598	osm_node_t *p_osm_node;
2599	osm_node_t *p_next_osm_node;
2600
2601	OSM_LOG_ENTER(&p_ftree->p_osm->log);
2602
2603	p_next_osm_node =
2604	    (osm_node_t *) cl_qmap_head(&p_ftree->p_osm->subn.node_guid_tbl);
2605	while (p_next_osm_node !=
2606	       (osm_node_t *) cl_qmap_end(&p_ftree->p_osm->subn.
2607					  node_guid_tbl)) {
2608		p_osm_node = p_next_osm_node;
2609		p_next_osm_node =
2610		    (osm_node_t *) cl_qmap_next(&p_osm_node->map_item);
2611		switch (osm_node_get_type(p_osm_node)) {
2612		case IB_NODE_TYPE_CA:
2613			__osm_ftree_fabric_add_hca(p_ftree, p_osm_node);
2614			break;
2615		case IB_NODE_TYPE_ROUTER:
2616			break;
2617		case IB_NODE_TYPE_SWITCH:
2618			__osm_ftree_fabric_add_sw(p_ftree, p_osm_node->sw);
2619			break;
2620		default:
2621			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB0E: "
2622				"Node GUID 0x%016" PRIx64
2623				" - Unknown node type: %s\n",
2624				cl_ntoh64(osm_node_get_node_guid(p_osm_node)),
2625				ib_get_node_type_str(osm_node_get_type
2626						     (p_osm_node)));
2627			OSM_LOG_EXIT(&p_ftree->p_osm->log);
2628			return -1;
2629		}
2630	}
2631
2632	OSM_LOG_EXIT(&p_ftree->p_osm->log);
2633	return 0;
2634}				/* __osm_ftree_fabric_populate_nodes() */
2635
2636/***************************************************
2637 ***************************************************/
2638
2639static boolean_t __osm_ftree_sw_update_rank(IN ftree_sw_t * p_sw,
2640					    IN uint32_t new_rank)
2641{
2642	if (__osm_ftree_sw_ranked(p_sw) && p_sw->rank <= new_rank)
2643		return FALSE;
2644	p_sw->rank = new_rank;
2645	return TRUE;
2646
2647}
2648
2649/***************************************************/
2650
2651static void
2652__osm_ftree_rank_switches_from_leafs(IN ftree_fabric_t * p_ftree,
2653				     IN cl_list_t * p_ranking_bfs_list)
2654{
2655	ftree_sw_t *p_sw;
2656	ftree_sw_t *p_remote_sw;
2657	osm_node_t *p_node;
2658	osm_node_t *p_remote_node;
2659	osm_physp_t *p_osm_port;
2660	uint8_t i;
2661	unsigned max_rank = 0;
2662
2663	while (!cl_is_list_empty(p_ranking_bfs_list)) {
2664		p_sw = (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list);
2665		p_node = p_sw->p_osm_sw->p_node;
2666
2667		/* note: skipping port 0 on switches */
2668		for (i = 1; i < osm_node_get_num_physp(p_node); i++) {
2669			p_osm_port = osm_node_get_physp_ptr(p_node, i);
2670			if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
2671				continue;
2672
2673			p_remote_node =
2674			    osm_node_get_remote_node(p_node, i, NULL);
2675			if (!p_remote_node)
2676				continue;
2677			if (osm_node_get_type(p_remote_node) !=
2678			    IB_NODE_TYPE_SWITCH)
2679				continue;
2680
2681			p_remote_sw = __osm_ftree_fabric_get_sw_by_guid(p_ftree,
2682									osm_node_get_node_guid
2683									(p_remote_node));
2684			if (!p_remote_sw) {
2685				/* remote node is not a switch */
2686				continue;
2687			}
2688
2689			/* if needed, rank the remote switch and add it to the BFS list */
2690			if (__osm_ftree_sw_update_rank
2691			    (p_remote_sw, p_sw->rank + 1)) {
2692				max_rank = p_remote_sw->rank;
2693				cl_list_insert_tail(p_ranking_bfs_list,
2694						    p_remote_sw);
2695			}
2696		}
2697	}
2698
2699	/* set FatTree maximal switch rank */
2700	p_ftree->max_switch_rank = max_rank;
2701
2702}				/* __osm_ftree_rank_switches_from_leafs() */
2703
2704/***************************************************/
2705
2706static int
2707__osm_ftree_rank_leaf_switches(IN ftree_fabric_t * p_ftree,
2708			       IN ftree_hca_t * p_hca,
2709			       IN cl_list_t * p_ranking_bfs_list)
2710{
2711	ftree_sw_t *p_sw;
2712	osm_node_t *p_osm_node = p_hca->p_osm_node;
2713	osm_node_t *p_remote_osm_node;
2714	osm_physp_t *p_osm_port;
2715	static uint8_t i = 0;
2716	int res = 0;
2717
2718	OSM_LOG_ENTER(&p_ftree->p_osm->log);
2719
2720	for (i = 0; i < osm_node_get_num_physp(p_osm_node); i++) {
2721		p_osm_port = osm_node_get_physp_ptr(p_osm_node, i);
2722		if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
2723			continue;
2724
2725		p_remote_osm_node =
2726		    osm_node_get_remote_node(p_osm_node, i, NULL);
2727		if (!p_remote_osm_node)
2728			continue;
2729
2730		switch (osm_node_get_type(p_remote_osm_node)) {
2731		case IB_NODE_TYPE_CA:
2732			/* HCA connected directly to another HCA - not FatTree */
2733			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB0F: "
2734				"CA conected directly to another CA: "
2735				"0x%016" PRIx64 " <---> 0x%016" PRIx64 "\n",
2736				__osm_ftree_hca_get_guid_ho(p_hca),
2737				cl_ntoh64(osm_node_get_node_guid
2738					  (p_remote_osm_node)));
2739			res = -1;
2740			goto Exit;
2741
2742		case IB_NODE_TYPE_ROUTER:
2743			/* leaving this port - proceeding to the next one */
2744			continue;
2745
2746		case IB_NODE_TYPE_SWITCH:
2747			/* continue with this port */
2748			break;
2749
2750		default:
2751			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
2752				"ERR AB10: Node GUID 0x%016" PRIx64
2753				" - Unknown node type: %s\n",
2754				cl_ntoh64(osm_node_get_node_guid
2755					  (p_remote_osm_node)),
2756				ib_get_node_type_str(osm_node_get_type
2757						     (p_remote_osm_node)));
2758			res = -1;
2759			goto Exit;
2760		}
2761
2762		/* remote node is switch */
2763
2764		p_sw = __osm_ftree_fabric_get_sw_by_guid(p_ftree,
2765							 osm_node_get_node_guid
2766							 (p_osm_port->
2767							  p_remote_physp->
2768							  p_node));
2769		CL_ASSERT(p_sw);
2770
2771		/* if needed, rank the remote switch and add it to the BFS list */
2772
2773		if (!__osm_ftree_sw_update_rank(p_sw, 0))
2774			continue;
2775		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2776			"Marking rank of switch that is directly connected to CA:\n"
2777			"                                            - CA guid    : 0x%016"
2778			PRIx64 "\n"
2779			"                                            - Switch guid: 0x%016"
2780			PRIx64 "\n"
2781			"                                            - Switch LID : %u\n",
2782			__osm_ftree_hca_get_guid_ho(p_hca),
2783			__osm_ftree_sw_get_guid_ho(p_sw),
2784			cl_ntoh16(p_sw->base_lid));
2785		cl_list_insert_tail(p_ranking_bfs_list, p_sw);
2786	}
2787
2788Exit:
2789	OSM_LOG_EXIT(&p_ftree->p_osm->log);
2790	return res;
2791}				/* __osm_ftree_rank_leaf_switches() */
2792
2793/***************************************************/
2794
2795static void __osm_ftree_sw_reverse_rank(IN cl_map_item_t * const p_map_item,
2796					IN void *context)
2797{
2798	ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
2799	ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
2800	p_sw->rank = p_ftree->max_switch_rank - p_sw->rank;
2801}
2802
2803/***************************************************
2804 ***************************************************/
2805
2806static int
2807__osm_ftree_fabric_construct_hca_ports(IN ftree_fabric_t * p_ftree,
2808				       IN ftree_hca_t * p_hca)
2809{
2810	ftree_sw_t *p_remote_sw;
2811	osm_node_t *p_node = p_hca->p_osm_node;
2812	osm_node_t *p_remote_node;
2813	uint8_t remote_node_type;
2814	ib_net64_t remote_node_guid;
2815	osm_physp_t *p_remote_osm_port;
2816	uint8_t i;
2817	uint8_t remote_port_num;
2818	boolean_t is_cn = FALSE;
2819	int res = 0;
2820
2821	for (i = 0; i < osm_node_get_num_physp(p_node); i++) {
2822		osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
2823		if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
2824			continue;
2825
2826		p_remote_osm_port = osm_physp_get_remote(p_osm_port);
2827		p_remote_node =
2828		    osm_node_get_remote_node(p_node, i, &remote_port_num);
2829
2830		if (!p_remote_osm_port)
2831			continue;
2832
2833		remote_node_type = osm_node_get_type(p_remote_node);
2834		remote_node_guid = osm_node_get_node_guid(p_remote_node);
2835
2836		switch (remote_node_type) {
2837		case IB_NODE_TYPE_ROUTER:
2838			/* leaving this port - proceeding to the next one */
2839			continue;
2840
2841		case IB_NODE_TYPE_CA:
2842			/* HCA connected directly to another HCA - not FatTree */
2843			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB11: "
2844				"CA conected directly to another CA: "
2845				"0x%016" PRIx64 " <---> 0x%016" PRIx64 "\n",
2846				cl_ntoh64(osm_node_get_node_guid(p_node)),
2847				cl_ntoh64(remote_node_guid));
2848			res = -1;
2849			goto Exit;
2850
2851		case IB_NODE_TYPE_SWITCH:
2852			/* continue with this port */
2853			break;
2854
2855		default:
2856			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
2857				"ERR AB12: Node GUID 0x%016" PRIx64
2858				" - Unknown node type: %s\n",
2859				cl_ntoh64(remote_node_guid),
2860				ib_get_node_type_str(remote_node_type));
2861			res = -1;
2862			goto Exit;
2863		}
2864
2865		/* remote node is switch */
2866
2867		p_remote_sw =
2868		    __osm_ftree_fabric_get_sw_by_guid(p_ftree,
2869						      remote_node_guid);
2870		CL_ASSERT(p_remote_sw);
2871
2872		/* If CN file is not supplied, then all the CAs considered as Compute Nodes.
2873		   Otherwise all the CAs are not CNs, and only guids that are present in the
2874		   CN file will be marked as compute nodes. */
2875		if (!__osm_ftree_fabric_cns_provided(p_ftree)) {
2876			is_cn = TRUE;
2877		} else {
2878			name_map_item_t *p_elem =
2879			    (name_map_item_t *) cl_qmap_get(&p_ftree->
2880							    cn_guid_tbl,
2881							    cl_ntoh64(osm_physp_get_port_guid
2882							    (p_osm_port)));
2883			if (p_elem !=
2884			    (name_map_item_t *) cl_qmap_end(&p_ftree->
2885							    cn_guid_tbl))
2886				is_cn = TRUE;
2887		}
2888
2889		if (is_cn) {
2890			p_ftree->cn_num++;
2891			p_hca->cn_num++;
2892			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2893				"Marking CN port GUID 0x%016" PRIx64 "\n",
2894				cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
2895		} else {
2896			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2897				"Marking non-CN port GUID 0x%016" PRIx64 "\n",
2898				cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
2899		}
2900
2901		__osm_ftree_hca_add_port(p_hca,	/* local ftree_hca object */
2902					 i,	/* local port number */
2903					 remote_port_num,	/* remote port number */
2904					 osm_node_get_base_lid(p_node, i),	/* local lid */
2905					 osm_node_get_base_lid(p_remote_node, 0),	/* remote lid */
2906					 osm_physp_get_port_guid(p_osm_port),	/* local port guid */
2907					 osm_physp_get_port_guid(p_remote_osm_port),	/* remote port guid */
2908					 remote_node_guid,	/* remote node guid */
2909					 remote_node_type,	/* remote node type */
2910					 (void *)p_remote_sw,	/* remote ftree_hca/sw object */
2911					 is_cn);	/* whether this port is compute node */
2912	}
2913
2914Exit:
2915	return res;
2916}				/* __osm_ftree_fabric_construct_hca_ports() */
2917
2918/***************************************************
2919 ***************************************************/
2920static boolean_t __osm_invalid_link_rank_diff(const uint32_t val)
2921{
2922	return (val != 1U && val != -1U);
2923}
2924
2925static int __osm_ftree_fabric_construct_sw_ports(IN ftree_fabric_t * p_ftree,
2926						 IN ftree_sw_t * p_sw)
2927{
2928	ftree_hca_t *p_remote_hca;
2929	ftree_sw_t *p_remote_sw;
2930	osm_node_t *p_node = p_sw->p_osm_sw->p_node;
2931	osm_node_t *p_remote_node;
2932	ib_net16_t remote_base_lid;
2933	uint8_t remote_node_type;
2934	ib_net64_t remote_node_guid;
2935	osm_physp_t *p_remote_osm_port;
2936	ftree_direction_t direction;
2937	void *p_remote_hca_or_sw;
2938	uint8_t i;
2939	uint8_t remote_port_num;
2940	int res = 0;
2941
2942	CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH);
2943
2944	for (i = 1; i < osm_node_get_num_physp(p_node); i++) {
2945		osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
2946		if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
2947			continue;
2948
2949		p_remote_osm_port = osm_physp_get_remote(p_osm_port);
2950		if (!p_remote_osm_port)
2951			continue;
2952
2953		p_remote_node =
2954		    osm_node_get_remote_node(p_node, i, &remote_port_num);
2955
2956		/* ignore any loopback connection on switch */
2957		if (p_node == p_remote_node) {
2958			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2959				"Ignoring loopback on switch GUID 0x%016" PRIx64
2960				", LID %u, rank %u\n",
2961				__osm_ftree_sw_get_guid_ho(p_sw),
2962				cl_ntoh16(p_sw->base_lid),
2963				p_sw->rank);
2964			continue;
2965		}
2966
2967		remote_node_type = osm_node_get_type(p_remote_node);
2968		remote_node_guid = osm_node_get_node_guid(p_remote_node);
2969
2970		switch (remote_node_type) {
2971		case IB_NODE_TYPE_ROUTER:
2972			/* leaving this port - proceeding to the next one */
2973			continue;
2974
2975		case IB_NODE_TYPE_CA:
2976			/* switch connected to hca */
2977
2978			p_remote_hca =
2979			    __osm_ftree_fabric_get_hca_by_guid(p_ftree,
2980							       remote_node_guid);
2981			CL_ASSERT(p_remote_hca);
2982
2983			p_remote_hca_or_sw = (void *)p_remote_hca;
2984			direction = FTREE_DIRECTION_DOWN;
2985
2986			remote_base_lid =
2987			    osm_physp_get_base_lid(p_remote_osm_port);
2988			break;
2989
2990		case IB_NODE_TYPE_SWITCH:
2991			/* switch connected to another switch */
2992
2993			p_remote_sw =
2994			    __osm_ftree_fabric_get_sw_by_guid(p_ftree,
2995							      remote_node_guid);
2996			CL_ASSERT(p_remote_sw);
2997
2998			p_remote_hca_or_sw = (void *)p_remote_sw;
2999
3000			if (__osm_invalid_link_rank_diff(p_sw->rank - p_remote_sw->rank)) {
3001				OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3002					"ERR AB16: "
3003					"Illegal link between switches with ranks %u and %u:\n"
3004					"       GUID 0x%016" PRIx64
3005					", LID %u, rank %u\n"
3006					"       GUID 0x%016" PRIx64
3007					", LID %u, rank %u\n", p_sw->rank,
3008					p_remote_sw->rank,
3009					__osm_ftree_sw_get_guid_ho(p_sw),
3010					cl_ntoh16(p_sw->base_lid), p_sw->rank,
3011					__osm_ftree_sw_get_guid_ho(p_remote_sw),
3012					cl_ntoh16(p_remote_sw->base_lid),
3013					p_remote_sw->rank);
3014				res = -1;
3015				goto Exit;
3016			}
3017
3018			if (p_sw->rank > p_remote_sw->rank)
3019				direction = FTREE_DIRECTION_UP;
3020			else
3021				direction = FTREE_DIRECTION_DOWN;
3022
3023			/* switch LID is only in port 0 port_info structure */
3024			remote_base_lid =
3025			    osm_node_get_base_lid(p_remote_node, 0);
3026
3027			break;
3028
3029		default:
3030			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3031				"ERR AB13: Node GUID 0x%016" PRIx64
3032				" - Unknown node type: %s\n",
3033				cl_ntoh64(remote_node_guid),
3034				ib_get_node_type_str(remote_node_type));
3035			res = -1;
3036			goto Exit;
3037		}
3038		__osm_ftree_sw_add_port(p_sw,	/* local ftree_sw object */
3039					i,	/* local port number */
3040					remote_port_num,	/* remote port number */
3041					p_sw->base_lid,	/* local lid */
3042					remote_base_lid,	/* remote lid */
3043					osm_physp_get_port_guid(p_osm_port),	/* local port guid */
3044					osm_physp_get_port_guid(p_remote_osm_port),	/* remote port guid */
3045					remote_node_guid,	/* remote node guid */
3046					remote_node_type,	/* remote node type */
3047					p_remote_hca_or_sw,	/* remote ftree_hca/sw object */
3048					direction);	/* port direction (up or down) */
3049
3050		/* Track the max lid (in host order) that exists in the fabric */
3051		if (cl_ntoh16(remote_base_lid) > p_ftree->lft_max_lid_ho)
3052			p_ftree->lft_max_lid_ho = cl_ntoh16(remote_base_lid);
3053	}
3054
3055Exit:
3056	return res;
3057}				/* __osm_ftree_fabric_construct_sw_ports() */
3058
3059/***************************************************
3060 ***************************************************/
3061
3062static int __osm_ftree_fabric_rank_from_roots(IN ftree_fabric_t * p_ftree)
3063{
3064	osm_node_t *p_osm_node;
3065	osm_node_t *p_remote_osm_node;
3066	osm_physp_t *p_osm_physp;
3067	ftree_sw_t *p_sw;
3068	ftree_sw_t *p_remote_sw;
3069	cl_list_t ranking_bfs_list;
3070	struct guid_list_item *item;
3071	int res = 0;
3072	unsigned num_roots;
3073	unsigned max_rank = 0;
3074	unsigned i;
3075
3076	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3077	cl_list_init(&ranking_bfs_list, 10);
3078
3079	/* Rank all the roots and add them to list */
3080	for (item = (void *)cl_qlist_head(&p_ftree->root_guid_list);
3081	     item != (void *)cl_qlist_end(&p_ftree->root_guid_list);
3082	     item = (void *)cl_qlist_next(&item->list)) {
3083		p_sw =
3084		    __osm_ftree_fabric_get_sw_by_guid(p_ftree,
3085						      cl_hton64(item->guid));
3086		if (!p_sw) {
3087			/* the specified root guid wasn't found in the fabric */
3088			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB24: "
3089				"Root switch GUID 0x%" PRIx64 " not found\n",
3090				item->guid);
3091			continue;
3092		}
3093
3094		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3095			"Ranking root switch with GUID 0x%" PRIx64 "\n",
3096			item->guid);
3097		p_sw->rank = 0;
3098		cl_list_insert_tail(&ranking_bfs_list, p_sw);
3099	}
3100
3101	num_roots = cl_list_count(&ranking_bfs_list);
3102	if (!num_roots) {
3103		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB25: "
3104			"No valid roots supplied\n");
3105		res = -1;
3106		goto Exit;
3107	}
3108
3109	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3110		"Ranked %u valid root switches\n", num_roots);
3111
3112	/* Now the list has all the roots.
3113	   BFS the subnet and update rank on all the switches. */
3114
3115	while (!cl_is_list_empty(&ranking_bfs_list)) {
3116		p_sw = (ftree_sw_t *) cl_list_remove_head(&ranking_bfs_list);
3117		p_osm_node = p_sw->p_osm_sw->p_node;
3118
3119		/* note: skipping port 0 on switches */
3120		for (i = 1; i < osm_node_get_num_physp(p_osm_node); i++) {
3121			p_osm_physp = osm_node_get_physp_ptr(p_osm_node, i);
3122			if (!p_osm_physp  || !osm_link_is_healthy(p_osm_physp))
3123				continue;
3124
3125			p_remote_osm_node =
3126			    osm_node_get_remote_node(p_osm_node, i, NULL);
3127			if (!p_remote_osm_node)
3128				continue;
3129
3130			if (osm_node_get_type(p_remote_osm_node) !=
3131			    IB_NODE_TYPE_SWITCH)
3132				continue;
3133
3134			p_remote_sw = __osm_ftree_fabric_get_sw_by_guid(p_ftree,
3135									osm_node_get_node_guid
3136									(p_remote_osm_node));
3137			CL_ASSERT(p_remote_sw);
3138
3139			/* if needed, rank the remote switch and add it to the BFS list */
3140			if (__osm_ftree_sw_update_rank
3141			    (p_remote_sw, p_sw->rank + 1)) {
3142				OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3143					"Ranking switch 0x%" PRIx64
3144					" with rank %u\n",
3145					__osm_ftree_sw_get_guid_ho(p_remote_sw),
3146					p_remote_sw->rank);
3147				max_rank = p_remote_sw->rank;
3148				cl_list_insert_tail(&ranking_bfs_list,
3149						    p_remote_sw);
3150			}
3151		}
3152		/* done with ports of this switch - go to the next switch in the list */
3153	}
3154
3155	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3156		"Subnet ranking completed. Max Node Rank = %u\n", max_rank);
3157
3158	/* set FatTree maximal switch rank */
3159	p_ftree->max_switch_rank = max_rank;
3160
3161Exit:
3162	cl_list_destroy(&ranking_bfs_list);
3163	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3164	return res;
3165}				/* __osm_ftree_fabric_rank_from_roots() */
3166
3167/***************************************************
3168 ***************************************************/
3169
3170static int __osm_ftree_fabric_rank_from_hcas(IN ftree_fabric_t * p_ftree)
3171{
3172	ftree_hca_t *p_hca;
3173	ftree_hca_t *p_next_hca;
3174	cl_list_t ranking_bfs_list;
3175	int res = 0;
3176
3177	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3178
3179	cl_list_init(&ranking_bfs_list, 10);
3180
3181	/* Mark REVERSED rank of all the switches in the subnet.
3182	   Start from switches that are connected to hca's, and
3183	   scan all the switches in the subnet. */
3184	p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3185	while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3186		p_hca = p_next_hca;
3187		p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3188		if (__osm_ftree_rank_leaf_switches
3189		    (p_ftree, p_hca, &ranking_bfs_list) != 0) {
3190			res = -1;
3191			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB14: "
3192				"Subnet ranking failed - subnet is not FatTree");
3193			goto Exit;
3194		}
3195	}
3196
3197	/* Now rank rest of the switches in the fabric, while the
3198	   list already contains all the ranked leaf switches */
3199	__osm_ftree_rank_switches_from_leafs(p_ftree, &ranking_bfs_list);
3200
3201	/* fix ranking of the switches by reversing the ranking direction */
3202	cl_qmap_apply_func(&p_ftree->sw_tbl, __osm_ftree_sw_reverse_rank,
3203			   (void *)p_ftree);
3204
3205Exit:
3206	cl_list_destroy(&ranking_bfs_list);
3207	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3208	return res;
3209}				/* __osm_ftree_fabric_rank_from_hcas() */
3210
3211/***************************************************
3212 ***************************************************/
3213
3214static int __osm_ftree_fabric_rank(IN ftree_fabric_t * p_ftree)
3215{
3216	int res = 0;
3217
3218	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3219
3220	if (__osm_ftree_fabric_roots_provided(p_ftree))
3221		res = __osm_ftree_fabric_rank_from_roots(p_ftree);
3222	else
3223		res = __osm_ftree_fabric_rank_from_hcas(p_ftree);
3224
3225	if (res)
3226		goto Exit;
3227
3228	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
3229		"FatTree max switch rank is %u\n", p_ftree->max_switch_rank);
3230
3231Exit:
3232	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3233	return res;
3234}				/* __osm_ftree_fabric_rank() */
3235
3236/***************************************************
3237 ***************************************************/
3238
3239static void __osm_ftree_fabric_set_leaf_rank(IN ftree_fabric_t * p_ftree)
3240{
3241	unsigned i;
3242	ftree_sw_t *p_sw;
3243	ftree_hca_t *p_hca = NULL;
3244	ftree_hca_t *p_next_hca;
3245
3246	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3247
3248	if (!__osm_ftree_fabric_roots_provided(p_ftree)) {
3249		/* If root file is not provided, the fabric has to be pure fat-tree
3250		   in terms of ranking. Thus, leaf switches rank is the max rank. */
3251		p_ftree->leaf_switch_rank = p_ftree->max_switch_rank;
3252	} else {
3253		/* Find the first CN and set the leaf_switch_rank to the rank
3254		   of the switch that is connected to this CN. Later we will
3255		   ensure that all the leaf switches have the same rank. */
3256		p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3257		while (p_next_hca !=
3258		       (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3259			p_hca = p_next_hca;
3260			if (p_hca->cn_num)
3261				break;
3262			p_next_hca =
3263			    (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3264		}
3265		/* we know that there are CNs in the fabric, so just to be sure... */
3266		CL_ASSERT(p_next_hca !=
3267			  (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl));
3268
3269		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3270			"Selected CN port GUID 0x%" PRIx64 "\n",
3271			__osm_ftree_hca_get_guid_ho(p_hca));
3272
3273		for (i = 0; (i < p_hca->up_port_groups_num)
3274		     && (!p_hca->up_port_groups[i]->is_cn); i++) ;
3275		CL_ASSERT(i < p_hca->up_port_groups_num);
3276		CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type ==
3277			  IB_NODE_TYPE_SWITCH);
3278
3279		p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw;
3280		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3281			"Selected leaf switch GUID 0x%" PRIx64 ", rank %u\n",
3282			__osm_ftree_sw_get_guid_ho(p_sw), p_sw->rank);
3283		p_ftree->leaf_switch_rank = p_sw->rank;
3284	}
3285
3286	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
3287		"FatTree leaf switch rank is %u\n", p_ftree->leaf_switch_rank);
3288	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3289}				/* __osm_ftree_fabric_set_leaf_rank() */
3290
3291/***************************************************
3292 ***************************************************/
3293
3294static int __osm_ftree_fabric_populate_ports(IN ftree_fabric_t * p_ftree)
3295{
3296	ftree_hca_t *p_hca;
3297	ftree_hca_t *p_next_hca;
3298	ftree_sw_t *p_sw;
3299	ftree_sw_t *p_next_sw;
3300	int res = 0;
3301
3302	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3303
3304	p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3305	while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3306		p_hca = p_next_hca;
3307		p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3308		if (__osm_ftree_fabric_construct_hca_ports(p_ftree, p_hca) != 0) {
3309			res = -1;
3310			goto Exit;
3311		}
3312	}
3313
3314	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3315	while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
3316		p_sw = p_next_sw;
3317		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
3318		if (__osm_ftree_fabric_construct_sw_ports(p_ftree, p_sw) != 0) {
3319			res = -1;
3320			goto Exit;
3321		}
3322	}
3323Exit:
3324	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3325	return res;
3326}				/* __osm_ftree_fabric_populate_ports() */
3327
3328/***************************************************
3329 ***************************************************/
3330static int add_guid_item_to_list(void *cxt, uint64_t guid, char *p)
3331{
3332	cl_qlist_t *list = cxt;
3333	struct guid_list_item *item;
3334
3335	item = malloc(sizeof(*item));
3336	if (!item)
3337		return -1;
3338
3339	item->guid = guid;
3340	cl_qlist_insert_tail(list, &item->list);
3341
3342	return 0;
3343}
3344
3345static int add_guid_item_to_map(void *cxt, uint64_t guid, char *p)
3346{
3347	cl_qmap_t *map = cxt;
3348	name_map_item_t *item;
3349
3350	item = malloc(sizeof(*item));
3351	if (!item)
3352		return -1;
3353
3354	item->guid = guid;
3355	cl_qmap_insert(map, guid, &item->item);
3356
3357	return 0;
3358}
3359
3360static int __osm_ftree_fabric_read_guid_files(IN ftree_fabric_t * p_ftree)
3361{
3362	int status = 0;
3363
3364	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3365
3366	if (__osm_ftree_fabric_roots_provided(p_ftree)) {
3367		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3368			"Fetching root nodes from file %s\n",
3369			p_ftree->p_osm->subn.opt.root_guid_file);
3370
3371		if (parse_node_map(p_ftree->p_osm->subn.opt.root_guid_file,
3372				   add_guid_item_to_list,
3373				   &p_ftree->root_guid_list)) {
3374			status = -1;
3375			goto Exit;
3376		}
3377
3378		if (!cl_qlist_count(&p_ftree->root_guid_list)) {
3379			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB22: "
3380				"Root guids file has no valid guids\n");
3381			status = -1;
3382			goto Exit;
3383		}
3384	}
3385
3386	if (__osm_ftree_fabric_cns_provided(p_ftree)) {
3387		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3388			"Fetching compute nodes from file %s\n",
3389			p_ftree->p_osm->subn.opt.cn_guid_file);
3390
3391		if (parse_node_map(p_ftree->p_osm->subn.opt.cn_guid_file,
3392				   add_guid_item_to_map,
3393				   &p_ftree->cn_guid_tbl)) {
3394			status = -1;
3395			goto Exit;
3396		}
3397
3398		if (!cl_qmap_count(&p_ftree->cn_guid_tbl)) {
3399			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB23: "
3400				"Compute node guids file has no valid guids\n");
3401			status = -1;
3402			goto Exit;
3403		}
3404	}
3405
3406Exit:
3407	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3408	return status;
3409} /*__osm_ftree_fabric_read_guid_files() */
3410
3411/***************************************************
3412 ***************************************************/
3413
3414static int __osm_ftree_construct_fabric(IN void *context)
3415{
3416	ftree_fabric_t *p_ftree = context;
3417	int status = 0;
3418
3419	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3420
3421	__osm_ftree_fabric_clear(p_ftree);
3422
3423	if (p_ftree->p_osm->subn.opt.lmc > 0) {
3424		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3425			"LMC > 0 is not supported by fat-tree routing.\n"
3426			"Falling back to default routing\n");
3427		status = -1;
3428		goto Exit;
3429	}
3430
3431	if (cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl) < 2) {
3432		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3433			"Fabric has %u switches - topology is not fat-tree.\n"
3434			"Falling back to default routing\n",
3435			cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
3436		status = -1;
3437		goto Exit;
3438	}
3439
3440	if ((cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl) -
3441	     cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)) < 2) {
3442		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3443			"Fabric has %u nodes (%u switches) - topology is not fat-tree.\n"
3444			"Falling back to default routing\n",
3445			cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl),
3446			cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
3447		status = -1;
3448		goto Exit;
3449	}
3450
3451	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
3452		"                       |----------------------------------------|\n"
3453		"                       |- Starting FatTree fabric construction -|\n"
3454		"                       |----------------------------------------|\n\n");
3455
3456	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3457		"Populating FatTree Switch and CA tables\n");
3458	if (__osm_ftree_fabric_populate_nodes(p_ftree) != 0) {
3459		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3460			"Fabric topology is not fat-tree - "
3461			"falling back to default routing\n");
3462		status = -1;
3463		goto Exit;
3464	}
3465
3466	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3467		"Reading guid files provided by user\n");
3468	if (__osm_ftree_fabric_read_guid_files(p_ftree) != 0) {
3469		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3470			"Failed reading guid files - "
3471			"falling back to default routing\n");
3472		status = -1;
3473		goto Exit;
3474	}
3475
3476	if (cl_qmap_count(&p_ftree->hca_tbl) < 2) {
3477		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3478			"Fabric has %u CAa - topology is not fat-tree.\n"
3479			"Falling back to default routing\n",
3480			cl_qmap_count(&p_ftree->hca_tbl));
3481		status = -1;
3482		goto Exit;
3483	}
3484
3485	/* Rank all the switches in the fabric.
3486	   After that we will know only fabric max switch rank.
3487	   We will be able to check leaf switches rank and the
3488	   whole tree rank after filling ports and marking CNs. */
3489	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "Ranking FatTree\n");
3490	if (__osm_ftree_fabric_rank(p_ftree) != 0) {
3491		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3492			"Failed ranking the tree\n");
3493		status = -1;
3494		goto Exit;
3495	}
3496
3497	/* For each hca and switch, construct array of ports.
3498	   This is done after the whole FatTree data structure is ready,
3499	   because we want the ports to have pointers to ftree_{sw,hca}_t
3500	   objects, and we need the switches to be already ranked because
3501	   that's how the port direction is determined. */
3502	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3503		"Populating CA & switch ports\n");
3504	if (__osm_ftree_fabric_populate_ports(p_ftree) != 0) {
3505		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3506			"Fabric topology is not a fat-tree\n");
3507		status = -1;
3508		goto Exit;
3509	} else if (p_ftree->cn_num == 0) {
3510		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3511			"Fabric has no valid compute nodes\n");
3512		status = -1;
3513		goto Exit;
3514	}
3515
3516	/* Now that the CA ports have been created and CNs were marked,
3517	   we can complete the fabric ranking - set leaf switches rank. */
3518	__osm_ftree_fabric_set_leaf_rank(p_ftree);
3519
3520	if (__osm_ftree_fabric_get_rank(p_ftree) > FAT_TREE_MAX_RANK ||
3521	    __osm_ftree_fabric_get_rank(p_ftree) < FAT_TREE_MIN_RANK) {
3522		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3523			"Fabric rank is %u (should be between %u and %u)\n",
3524			__osm_ftree_fabric_get_rank(p_ftree), FAT_TREE_MIN_RANK,
3525			FAT_TREE_MAX_RANK);
3526		status = -1;
3527		goto Exit;
3528	}
3529
3530	/* Mark all the switches in the fabric with rank equal to
3531	   p_ftree->leaf_switch_rank and that are also connected to CNs.
3532	   As a by-product, this function also runs basic topology
3533	   validation - it checks that all the CNs are at the same rank. */
3534	if (__osm_ftree_fabric_mark_leaf_switches(p_ftree)) {
3535		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3536			"Fabric topology is not a fat-tree\n");
3537		status = -1;
3538		goto Exit;
3539	}
3540
3541	/* Assign index to all the switches in the fabric.
3542	   This function also sorts leaf switch array by the switch index,
3543	   sorts all the port arrays of the indexed switches by remote
3544	   switch index, and creates switch-by-tuple table (sw_by_tuple_tbl) */
3545	__osm_ftree_fabric_make_indexing(p_ftree);
3546
3547	/* Create leaf switch array sorted by index.
3548	   This array contains switches with rank equal to p_ftree->leaf_switch_rank
3549	   and that are also connected to CNs (REAL leafs), and it may contain
3550	   switches at the same leaf rank w/o CNs, if this is the order of indexing.
3551	   In any case, the first and the last switches in the array are REAL leafs. */
3552	if (__osm_ftree_fabric_create_leaf_switch_array(p_ftree)) {
3553		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3554			"Fabric topology is not a fat-tree\n");
3555		status = -1;
3556		goto Exit;
3557	}
3558
3559	/* calculate and set ftree.max_cn_per_leaf field */
3560	__osm_ftree_fabric_set_max_cn_per_leaf(p_ftree);
3561
3562	/* print general info about fabric topology */
3563	__osm_ftree_fabric_dump_general_info(p_ftree);
3564
3565	/* dump full tree topology */
3566	if (osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
3567		__osm_ftree_fabric_dump(p_ftree);
3568
3569	/* the fabric is required to be PURE fat-tree only if the root
3570	   guid file hasn't been provided by user */
3571	if (!__osm_ftree_fabric_roots_provided(p_ftree) &&
3572	    !__osm_ftree_fabric_validate_topology(p_ftree)) {
3573		osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS,
3574			"Fabric topology is not a fat-tree\n");
3575		status = -1;
3576		goto Exit;
3577	}
3578
3579	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3580		"Max LID in switch LFTs: %u\n",
3581		p_ftree->lft_max_lid_ho);
3582
3583Exit:
3584	if (status != 0) {
3585		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3586			"Clearing FatTree Fabric data structures\n");
3587		__osm_ftree_fabric_clear(p_ftree);
3588	} else
3589		p_ftree->fabric_built = TRUE;
3590
3591	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
3592		"                       |--------------------------------------------------|\n"
3593		"                       |- Done constructing FatTree fabric (status = %d) -|\n"
3594		"                       |--------------------------------------------------|\n\n",
3595		status);
3596
3597	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3598	return status;
3599}				/* __osm_ftree_construct_fabric() */
3600
3601/***************************************************
3602 ***************************************************/
3603
3604static int __osm_ftree_do_routing(IN void *context)
3605{
3606	ftree_fabric_t *p_ftree = context;
3607	int status = 0;
3608
3609	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3610
3611	if (!p_ftree->fabric_built) {
3612		status = -1;
3613		goto Exit;
3614	}
3615
3616	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3617		"Starting FatTree routing\n");
3618
3619	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3620		"Filling switch forwarding tables for Compute Nodes\n");
3621	__osm_ftree_fabric_route_to_cns(p_ftree);
3622
3623	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3624		"Filling switch forwarding tables for non-CN targets\n");
3625	__osm_ftree_fabric_route_to_non_cns(p_ftree);
3626
3627	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3628		"Filling switch forwarding tables for switch-to-switch paths\n");
3629	__osm_ftree_fabric_route_to_switches(p_ftree);
3630
3631	/* for each switch, set its fwd table */
3632	cl_qmap_apply_func(&p_ftree->sw_tbl, __osm_ftree_set_sw_fwd_table,
3633			   (void *)p_ftree);
3634
3635	/* write out hca ordering file */
3636	__osm_ftree_fabric_dump_hca_ordering(p_ftree);
3637
3638	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3639		"FatTree routing is done\n");
3640
3641Exit:
3642	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3643	return status;
3644}
3645
3646/***************************************************
3647 ***************************************************/
3648
3649static void __osm_ftree_delete(IN void *context)
3650{
3651	if (!context)
3652		return;
3653	__osm_ftree_fabric_destroy((ftree_fabric_t *) context);
3654}
3655
3656/***************************************************
3657 ***************************************************/
3658
3659int osm_ucast_ftree_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
3660{
3661	ftree_fabric_t *p_ftree = __osm_ftree_fabric_create();
3662	if (!p_ftree)
3663		return -1;
3664
3665	p_ftree->p_osm = p_osm;
3666
3667	r->context = (void *)p_ftree;
3668	r->build_lid_matrices = __osm_ftree_construct_fabric;
3669	r->ucast_build_fwd_tables = __osm_ftree_do_routing;
3670	r->delete = __osm_ftree_delete;
3671
3672	return 0;
3673}
3674