1/*
2 * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5 * Copyright (c) 2009-2015 ZIH, TU Dresden, Federal Republic of Germany. All rights reserved.
6 * Copyright (C) 2012-2017 Tokyo Institute of Technology. All rights reserved.
7 *
8 * This software is available to you under a choice of one of two
9 * licenses.  You may choose to be licensed under the terms of the GNU
10 * General Public License (GPL) Version 2, available from the file
11 * COPYING in the main directory of this source tree, or the
12 * OpenIB.org BSD license below:
13 *
14 *     Redistribution and use in source and binary forms, with or
15 *     without modification, are permitted provided that the following
16 *     conditions are met:
17 *
18 *      - Redistributions of source code must retain the above
19 *        copyright notice, this list of conditions and the following
20 *        disclaimer.
21 *
22 *      - Redistributions in binary form must reproduce the above
23 *        copyright notice, this list of conditions and the following
24 *        disclaimer in the documentation and/or other materials
25 *        provided with the distribution.
26 *
27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34 * SOFTWARE.
35 *
36 */
37
38/*
39 * Abstract:
40 *    Implementation of OpenSM (deadlock-free) single-source-shortest-path routing
41 *    (with dijkstra algorithm)
42 */
43
44#if HAVE_CONFIG_H
45#  include <config.h>
46#endif				/* HAVE_CONFIG_H */
47
48#include <stdio.h>
49#include <stdlib.h>
50#include <string.h>
51#include <opensm/osm_file_ids.h>
52#define FILE_ID OSM_FILE_UCAST_DFSSSP_C
53#include <opensm/osm_ucast_mgr.h>
54#include <opensm/osm_opensm.h>
55#include <opensm/osm_node.h>
56#include <opensm/osm_multicast.h>
57#include <opensm/osm_mcast_mgr.h>
58
59/* "infinity" for dijkstra */
60#define INF      0x7FFFFFFF
61
62enum {
63	UNDISCOVERED = 0,
64	DISCOVERED
65};
66
67enum {
68	UNKNOWN = 0,
69	GRAY,
70	BLACK,
71};
72
73typedef struct link {
74	uint64_t guid;		/* guid of the neighbor behind the link */
75	uint32_t from;		/* base_index in the adjazenz list (start of the link) */
76	uint8_t from_port;	/* port on the base_side (needed for weight update to identify the correct link for multigraphs) */
77	uint32_t to;		/* index of the neighbor in the adjazenz list (end of the link) */
78	uint8_t to_port;	/* port on the side of the neighbor (needed for the LFT) */
79	uint64_t weight;	/* link weight */
80	struct link *next;
81} link_t;
82
83typedef struct vertex {
84	/* informations of the fabric */
85	uint64_t guid;
86	uint16_t lid;		/* for lft filling */
87	uint32_t num_hca;	/* numbers of Hca/LIDs on the switch, for weight calculation */
88	link_t *links;
89	uint8_t hops;
90	/* for dijkstra routing */
91	link_t *used_link;	/* link between the vertex discovered before and this vertex */
92	uint64_t distance;	/* distance from source to this vertex */
93	uint8_t state;
94	/* for the binary heap */
95	uint32_t heap_id;
96	/* for LFT writing and debug */
97	osm_switch_t *sw;	/* selfpointer */
98	boolean_t dropped;	/* indicate dropped switches (w/ ucast cache) */
99} vertex_t;
100
101typedef struct binary_heap {
102	uint32_t size;		/* size of the heap */
103	vertex_t **nodes;	/* array with pointers to elements of the adj_list */
104} binary_heap_t;
105
106typedef struct vltable {
107	uint64_t num_lids;	/* size of the lids array */
108	uint16_t *lids;		/* sorted array of all lids in the subnet */
109	uint8_t *vls;		/* matrix form assignment lid X lid -> virtual lane */
110} vltable_t;
111
112typedef struct cdg_link {
113	struct cdg_node *node;
114	uint32_t num_pairs;	/* number of src->dest pairs incremented in path adding step */
115	uint32_t max_len;	/* length of the srcdest array */
116	uint32_t removed;	/* number of pairs removed in path deletion step */
117	uint32_t *srcdest_pairs;
118	struct cdg_link *next;
119} cdg_link_t;
120
121/* struct for a node of a binary tree with additional parent pointer */
122typedef struct cdg_node {
123	uint64_t channelID;	/* unique key consist of src lid + port + dest lid + port */
124	cdg_link_t *linklist;	/* edges to adjazent nodes */
125	uint8_t status;		/* node status in cycle search to avoid recursive function */
126	uint8_t visited;	/* needed to traverse the binary tree */
127	struct cdg_node *pre;	/* to save the path in cycle detection algorithm */
128	struct cdg_node *left, *right, *parent;
129} cdg_node_t;
130
131typedef struct dfsssp_context {
132	osm_routing_engine_type_t routing_type;
133	osm_ucast_mgr_t *p_mgr;
134	vertex_t *adj_list;
135	uint32_t adj_list_size;
136	vltable_t *srcdest2vl_table;
137	uint8_t *vl_split_count;
138} dfsssp_context_t;
139
140/**************** set initial values for structs **********************
141 **********************************************************************/
142static inline void set_default_link(link_t * link)
143{
144	link->guid = 0;
145	link->from = 0;
146	link->from_port = 0;
147	link->to = 0;
148	link->to_port = 0;
149	link->weight = 0;
150	link->next = NULL;
151}
152
153static inline void set_default_vertex(vertex_t * vertex)
154{
155	vertex->guid = 0;
156	vertex->lid = 0;
157	vertex->num_hca = 0;
158	vertex->links = NULL;
159	vertex->hops = 0;
160	vertex->used_link = NULL;
161	vertex->distance = 0;
162	vertex->state = UNDISCOVERED;
163	vertex->heap_id = 0;
164	vertex->sw = NULL;
165	vertex->dropped = FALSE;
166}
167
168static inline void set_default_cdg_node(cdg_node_t * node)
169{
170	node->channelID = 0;
171	node->linklist = NULL;
172	node->status = UNKNOWN;
173	node->visited = 0;
174	node->pre = NULL;
175	node->left = NULL;
176	node->right = NULL;
177	node->parent = NULL;
178}
179
180/**********************************************************************
181 **********************************************************************/
182
183/************ helper functions for heap in dijkstra *******************
184 **********************************************************************/
185/* returns true if element 1 is smaller than element 2 */
186static inline uint32_t heap_smaller(binary_heap_t * heap, uint32_t i,
187				    uint32_t j)
188{
189	return (heap->nodes[i]->distance < heap->nodes[j]->distance) ? 1 : 0;
190}
191
192/* swap two elements */
193static void heap_exchange(binary_heap_t * heap, uint32_t i, uint32_t j)
194{
195	uint32_t tmp_heap_id = 0;
196	vertex_t *tmp_node = NULL;
197
198	/* 1. swap the heap_id */
199	tmp_heap_id = heap->nodes[i]->heap_id;
200	heap->nodes[i]->heap_id = heap->nodes[j]->heap_id;
201	heap->nodes[j]->heap_id = tmp_heap_id;
202	/* 2. swap pointers */
203	tmp_node = heap->nodes[i];
204	heap->nodes[i] = heap->nodes[j];
205	heap->nodes[j] = tmp_node;
206}
207
208/* changes position of element with parent until children are bigger */
209static uint32_t heap_up(binary_heap_t * heap, uint32_t i)
210{
211	uint32_t curr = i, father = 0;
212
213	if (curr > 0) {
214		father = (curr - 1) >> 1;
215		while (heap_smaller(heap, curr, father)) {
216			heap_exchange(heap, curr, father);
217			/* try to go up when we arent already root */
218			curr = father;
219			if (curr > 0)
220				father = (curr - 1) >> 1;
221		}
222	}
223
224	return curr;
225}
226
227/* changes position of element with children until parent is smaller */
228static uint32_t heap_down(binary_heap_t * heap, uint32_t i)
229{
230	uint32_t curr = i;
231	uint32_t son1 = 0, son2 = 0, smaller_son = 0;
232	uint32_t exchanged = 0;
233
234	do {
235		son1 = ((curr + 1) << 1) - 1;
236		son2 = (curr + 1) << 1;
237		exchanged = 0;
238
239		/* exchange with smaller son */
240		if (son1 < heap->size && son2 < heap->size) {
241			if (heap_smaller(heap, son1, son2))
242				smaller_son = son1;
243			else
244				smaller_son = son2;
245		} else if (son1 < heap->size) {
246			/* only one son */
247			smaller_son = son1;
248		} else {
249			/* finished */
250			break;
251		}
252
253		/* only exchange when smaller */
254		if (heap_smaller(heap, smaller_son, curr)) {
255			heap_exchange(heap, curr, smaller_son);
256			exchanged = 1;
257			curr = smaller_son;
258		}
259	} while (exchanged);
260
261	return curr;
262}
263
264/* reheapify element */
265static inline void heap_heapify(binary_heap_t * heap, uint32_t i)
266{
267	heap_down(heap, heap_up(heap, i));
268}
269
270/* creates heap for graph */
271static int heap_create(vertex_t * adj_list, uint32_t adj_list_size,
272		       binary_heap_t ** binheap)
273{
274	binary_heap_t *heap = NULL;
275	uint32_t i = 0;
276
277	/* allocate the memory for the heap object */
278	heap = (binary_heap_t *) malloc(sizeof(binary_heap_t));
279	if (!heap)
280		return 1;
281
282	/* the heap size is equivalent to the size of the adj_list */
283	heap->size = adj_list_size;
284
285	/* allocate the pointer array, fill with the pointers to the elements of the adj_list and set the initial heap_id */
286	heap->nodes = (vertex_t **) malloc(heap->size * sizeof(vertex_t *));
287	if (!heap->nodes) {
288		free(heap);
289		return 1;
290	}
291	for (i = 0; i < heap->size; i++) {
292		heap->nodes[i] = &adj_list[i];
293		heap->nodes[i]->heap_id = i;
294	}
295
296	/* sort elements */
297	for (i = heap->size; i > 0; i--)
298		heap_down(heap, i - 1);
299
300	*binheap = heap;
301	return 0;
302}
303
304/* returns current minimum and removes it from heap */
305static vertex_t *heap_getmin(binary_heap_t * heap)
306{
307	vertex_t *min = NULL;
308
309	if (heap->size > 0)
310		min = heap->nodes[0];
311
312	if (min == NULL)
313		return min;
314
315	if (heap->size > 0) {
316		if (heap->size > 1) {
317			heap_exchange(heap, 0, heap->size - 1);
318			heap->size--;
319			heap_down(heap, 0);
320		} else {
321			heap->size--;
322		}
323	}
324
325	return min;
326}
327
328/* cleanup heap */
329static void heap_free(binary_heap_t * heap)
330{
331	if (heap) {
332		if (heap->nodes) {
333			free(heap->nodes);
334			heap->nodes = NULL;
335		}
336		free(heap);
337	}
338}
339
340/**********************************************************************
341 **********************************************************************/
342
343/************ helper functions to save src/dest X vl combination ******
344 **********************************************************************/
345/* compare function of two lids for stdlib qsort */
346static int cmp_lids(const void *l1, const void *l2)
347{
348	ib_net16_t lid1 = *((ib_net16_t *) l1), lid2 = *((ib_net16_t *) l2);
349
350	if (lid1 < lid2)
351		return -1;
352	else if (lid1 > lid2)
353		return 1;
354	else
355		return 0;
356}
357
358/* use stdlib to sort the lid array */
359static inline void vltable_sort_lids(vltable_t * vltable)
360{
361	qsort(vltable->lids, vltable->num_lids, sizeof(ib_net16_t), cmp_lids);
362}
363
364/* use stdlib to get index of key in lid array;
365   return -1 if lid isn't found in lids array
366*/
367static inline int64_t vltable_get_lidindex(ib_net16_t * key, vltable_t * vltable)
368{
369	ib_net16_t *found_lid = NULL;
370
371	found_lid =
372	    (ib_net16_t *) bsearch(key, vltable->lids, vltable->num_lids,
373				   sizeof(ib_net16_t), cmp_lids);
374	if (found_lid)
375		return found_lid - vltable->lids;
376	else
377		return -1;
378}
379
380/* get virtual lane from src lid X dest lid combination;
381   return -1 for invalid lids
382*/
383static int32_t vltable_get_vl(vltable_t * vltable, ib_net16_t slid, ib_net16_t dlid)
384{
385	int64_t ind1 = vltable_get_lidindex(&slid, vltable);
386	int64_t ind2 = vltable_get_lidindex(&dlid, vltable);
387
388	if (ind1 > -1 && ind2 > -1)
389		return (int32_t) (vltable->
390				  vls[ind1 + ind2 * vltable->num_lids]);
391	else
392		return -1;
393}
394
395/* set a virtual lane in the matrix */
396static inline void vltable_insert(vltable_t * vltable, ib_net16_t slid,
397				  ib_net16_t dlid, uint8_t vl)
398{
399	int64_t ind1 = vltable_get_lidindex(&slid, vltable);
400	int64_t ind2 = vltable_get_lidindex(&dlid, vltable);
401
402	if (ind1 > -1 && ind2 > -1)
403		vltable->vls[ind1 + ind2 * vltable->num_lids] = vl;
404}
405
406/* change a number of lanes from lane xy to lane yz */
407static void vltable_change_vl(vltable_t * vltable, uint8_t from, uint8_t to,
408			      uint64_t count)
409{
410	uint64_t set = 0, stop = 0;
411	uint64_t ind1 = 0, ind2 = 0;
412
413	for (ind1 = 0; ind1 < vltable->num_lids; ind1++) {
414		for (ind2 = 0; ind2 < vltable->num_lids; ind2++) {
415			if (set == count) {
416				stop = 1;
417				break;
418			}
419			if (ind1 != ind2) {
420				if (vltable->
421				    vls[ind1 + ind2 * vltable->num_lids] ==
422				    from) {
423					vltable->vls[ind1 +
424						     ind2 * vltable->num_lids] =
425					    to;
426					set++;
427				}
428			}
429		}
430		if (stop)
431			break;
432	}
433}
434
435static void vltable_print(osm_ucast_mgr_t * p_mgr, vltable_t * vltable)
436{
437	uint64_t ind1 = 0, ind2 = 0;
438
439	for (ind1 = 0; ind1 < vltable->num_lids; ind1++) {
440		for (ind2 = 0; ind2 < vltable->num_lids; ind2++) {
441			if (ind1 != ind2) {
442				OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
443					"   route from src_lid=%" PRIu16
444					" to dest_lid=%" PRIu16 " on vl=%" PRIu8
445					"\n", cl_ntoh16(vltable->lids[ind1]),
446					cl_ntoh16(vltable->lids[ind2]),
447					vltable->vls[ind1 +
448						     ind2 * vltable->num_lids]);
449			}
450		}
451	}
452}
453
454static void vltable_dealloc(vltable_t ** vltable)
455{
456	if (*vltable) {
457		if ((*vltable)->lids)
458			free((*vltable)->lids);
459		if ((*vltable)->vls)
460			free((*vltable)->vls);
461		free(*vltable);
462		*vltable = NULL;
463	}
464}
465
466static int vltable_alloc(vltable_t ** vltable, uint64_t size)
467{
468	/* allocate VL table and indexing array */
469	*vltable = (vltable_t *) malloc(sizeof(vltable_t));
470	if (!(*vltable))
471		goto ERROR;
472	(*vltable)->num_lids = size;
473	(*vltable)->lids = (ib_net16_t *) malloc(size * sizeof(ib_net16_t));
474	if (!((*vltable)->lids))
475		goto ERROR;
476	(*vltable)->vls = (uint8_t *) malloc(size * size * sizeof(uint8_t));
477	if (!((*vltable)->vls))
478		goto ERROR;
479	memset((*vltable)->vls, OSM_DEFAULT_SL, size * size);
480
481	return 0;
482
483ERROR:
484	vltable_dealloc(vltable);
485
486	return 1;
487}
488
489/**********************************************************************
490 **********************************************************************/
491
492/************ helper functions to save/manage the channel dep. graph **
493 **********************************************************************/
494/* update the srcdest array;
495   realloc array (double the size) if size is not large enough
496*/
497static void set_next_srcdest_pair(cdg_link_t * link, uint32_t srcdest)
498{
499	uint32_t new_size = 0, start_size = 2;
500	uint32_t *tmp = NULL, *tmp2 = NULL;
501
502	if (link->num_pairs == 0) {
503		link->srcdest_pairs =
504		    (uint32_t *) malloc(start_size * sizeof(uint32_t));
505		link->srcdest_pairs[link->num_pairs] = srcdest;
506		link->max_len = start_size;
507		link->removed = 0;
508	} else if (link->num_pairs == link->max_len) {
509		new_size = link->max_len << 1;
510		tmp = (uint32_t *) malloc(new_size * sizeof(uint32_t));
511		tmp =
512		    memcpy(tmp, link->srcdest_pairs,
513			   link->max_len * sizeof(uint32_t));
514		tmp2 = link->srcdest_pairs;
515		link->srcdest_pairs = tmp;
516		link->srcdest_pairs[link->num_pairs] = srcdest;
517		free(tmp2);
518		link->max_len = new_size;
519	} else {
520		link->srcdest_pairs[link->num_pairs] = srcdest;
521	}
522	link->num_pairs++;
523}
524
525static inline uint32_t get_next_srcdest_pair(cdg_link_t * link, uint32_t index)
526{
527	return link->srcdest_pairs[index];
528}
529
530/* traverse binary tree to find a node */
531static cdg_node_t *cdg_search(cdg_node_t * root, uint64_t channelID)
532{
533	while (root) {
534		if (channelID < root->channelID)
535			root = root->left;
536		else if (channelID > root->channelID)
537			root = root->right;
538		else if (channelID == root->channelID)
539			return root;
540	}
541	return NULL;
542}
543
544/* insert new node into the binary tree */
545static void cdg_insert(cdg_node_t ** root, cdg_node_t * new_node)
546{
547	cdg_node_t *current = *root;
548
549	if (!current) {
550		current = new_node;
551		*root = current;
552		return;
553	}
554
555	while (current) {
556		if (new_node->channelID < current->channelID) {
557			if (current->left) {
558				current = current->left;
559			} else {
560				current->left = new_node;
561				new_node->parent = current;
562				break;
563			}
564		} else if (new_node->channelID > current->channelID) {
565			if (current->right) {
566				current = current->right;
567			} else {
568				current->right = new_node;
569				new_node->parent = current;
570				break;
571			}
572		} else if (new_node->channelID == current->channelID) {
573			/* not really possible, maybe programming error */
574			break;
575		}
576	}
577}
578
579static void cdg_node_dealloc(cdg_node_t * node)
580{
581	cdg_link_t *link = node->linklist, *tmp = NULL;
582
583	/* dealloc linklist */
584	while (link) {
585		tmp = link;
586		link = link->next;
587
588		if (tmp->num_pairs)
589			free(tmp->srcdest_pairs);
590		free(tmp);
591	}
592	/* dealloc node */
593	free(node);
594}
595
596static void cdg_dealloc(cdg_node_t ** root)
597{
598	cdg_node_t *current = *root;
599
600	while (current) {
601		if (current->left) {
602			current = current->left;
603		} else if (current->right) {
604			current = current->right;
605		} else {
606			if (current->parent == NULL) {
607				cdg_node_dealloc(current);
608				*root = NULL;
609				break;
610			}
611			if (current->parent->left == current) {
612				current = current->parent;
613				cdg_node_dealloc(current->left);
614				current->left = NULL;
615			} else if (current->parent->right == current) {
616				current = current->parent;
617				cdg_node_dealloc(current->right);
618				current->right = NULL;
619			}
620		}
621	}
622}
623
624/* search for a edge in the cdg which should be removed to break a cycle */
625static cdg_link_t *get_weakest_link_in_cycle(cdg_node_t * cycle)
626{
627	cdg_node_t *current = cycle, *node_with_weakest_link = NULL;
628	cdg_link_t *link = NULL, *weakest_link = NULL;
629
630	link = current->linklist;
631	while (link) {
632		if (link->node->status == GRAY) {
633			weakest_link = link;
634			node_with_weakest_link = current;
635			current = link->node;
636			break;
637		}
638		link = link->next;
639	}
640
641	while (1) {
642		current->status = UNKNOWN;
643		link = current->linklist;
644		while (link) {
645			if (link->node->status == GRAY) {
646				if ((link->num_pairs - link->removed) <
647				    (weakest_link->num_pairs -
648				     weakest_link->removed)) {
649					weakest_link = link;
650					node_with_weakest_link = current;
651				}
652				current = link->node;
653				break;
654			}
655			link = link->next;
656		}
657		/* if complete cycle is traversed */
658		if (current == cycle) {
659			current->status = UNKNOWN;
660			break;
661		}
662	}
663
664	if (node_with_weakest_link->linklist == weakest_link) {
665		node_with_weakest_link->linklist = weakest_link->next;
666	} else {
667		link = node_with_weakest_link->linklist;
668		while (link) {
669			if (link->next == weakest_link) {
670				link->next = weakest_link->next;
671				break;
672			}
673			link = link->next;
674		}
675	}
676
677	return weakest_link;
678}
679
680/* search for nodes in the cdg not yet reached in the cycle search process;
681   (some nodes are unreachable, e.g. a node is a source or the cdg has not connected parts)
682*/
683static cdg_node_t *get_next_cdg_node(cdg_node_t * root)
684{
685	cdg_node_t *current = root, *res = NULL;
686
687	while (current) {
688		current->visited = 1;
689		if (current->status == UNKNOWN) {
690			res = current;
691			break;
692		}
693		if (current->left && !current->left->visited) {
694			current = current->left;
695		} else if (current->right && !current->right->visited) {
696			current = current->right;
697		} else {
698			if (current->left)
699				current->left->visited = 0;
700			if (current->right)
701				current->right->visited = 0;
702			if (current->parent == NULL)
703				break;
704			else
705				current = current->parent;
706		}
707	}
708
709	/* Clean up */
710	while (current) {
711		current->visited = 0;
712		if (current->left)
713			current->left->visited = 0;
714		if (current->right)
715			current->right->visited = 0;
716		current = current->parent;
717	}
718
719	return res;
720}
721
722/* make a DFS on the cdg to check for a cycle */
723static cdg_node_t *search_cycle_in_channel_dep_graph(cdg_node_t * cdg,
724						     cdg_node_t * start_node)
725{
726	cdg_node_t *cycle = NULL;
727	cdg_node_t *current = start_node, *next_node = NULL, *tmp = NULL;
728	cdg_link_t *link = NULL;
729
730	while (current) {
731		current->status = GRAY;
732		link = current->linklist;
733		next_node = NULL;
734		while (link) {
735			if (link->node->status == UNKNOWN) {
736				next_node = link->node;
737				break;
738			}
739			if (link->node->status == GRAY) {
740				cycle = link->node;
741				goto Exit;
742			}
743			link = link->next;
744		}
745		if (next_node) {
746			next_node->pre = current;
747			current = next_node;
748		} else {
749			/* found a sink in the graph, go to last node */
750			current->status = BLACK;
751
752			/* srcdest_pairs of this node aren't relevant, free the allocated memory */
753			link = current->linklist;
754			while (link) {
755				if (link->num_pairs)
756					free(link->srcdest_pairs);
757				link->srcdest_pairs = NULL;
758				link->num_pairs = 0;
759				link->removed = 0;
760				link = link->next;
761			}
762
763			if (current->pre) {
764				tmp = current;
765				current = current->pre;
766				tmp->pre = NULL;
767			} else {
768				/* search for other subgraphs in cdg */
769				current = get_next_cdg_node(cdg);
770				if (!current)
771					break;	/* all relevant nodes traversed, no more cycles found */
772			}
773		}
774	}
775
776Exit:
777	return cycle;
778}
779
780/* calculate the path from source to destination port;
781   new channels are added directly to the cdg
782*/
783static int update_channel_dep_graph(cdg_node_t ** cdg_root,
784				    osm_port_t * src_port, uint16_t slid,
785				    osm_port_t * dest_port, uint16_t dlid)
786{
787	osm_node_t *local_node = NULL, *remote_node = NULL;
788	uint16_t local_lid = 0, remote_lid = 0;
789	uint32_t srcdest = 0;
790	uint8_t local_port = 0, remote_port = 0;
791	uint64_t channelID = 0;
792
793	cdg_node_t *channel_head = NULL, *channel = NULL, *last_channel = NULL;
794	cdg_link_t *linklist = NULL;
795
796	/* set the identifier for the src/dest pair to save this on each edge of the cdg */
797	srcdest = (((uint32_t) slid) << 16) + ((uint32_t) dlid);
798
799	channel_head = (cdg_node_t *) malloc(sizeof(cdg_node_t));
800	if (!channel_head)
801		goto ERROR;
802	set_default_cdg_node(channel_head);
803	last_channel = channel_head;
804
805	/* if src is a Hca, then the channel from Hca to switch would be a source in the graph
806	   sources can't be part of a cycle -> skip this channel
807	 */
808	remote_node =
809	    osm_node_get_remote_node(src_port->p_node,
810				     src_port->p_physp->port_num, &remote_port);
811
812	while (remote_node && remote_node->sw) {
813		local_node = remote_node;
814		local_port = local_node->sw->new_lft[dlid];
815		/* sanity check: local_port must be set or routing is broken */
816		if (local_port == OSM_NO_PATH)
817			goto ERROR;
818		local_lid = cl_ntoh16(osm_node_get_base_lid(local_node, 0));
819		/* each port belonging to a switch has lmc==0 -> get_base_lid is fine
820		   (local/remote port in this function are always part of a switch)
821		 */
822
823		remote_node =
824		    osm_node_get_remote_node(local_node, local_port,
825					     &remote_port);
826		/* if remote_node is a Hca, then the last channel from switch to Hca would be a sink in the cdg -> skip */
827		if (!remote_node || !remote_node->sw)
828			break;
829		remote_lid = cl_ntoh16(osm_node_get_base_lid(remote_node, 0));
830
831		channelID =
832		    (((uint64_t) local_lid) << 48) +
833		    (((uint64_t) local_port) << 32) +
834		    (((uint64_t) remote_lid) << 16) + ((uint64_t) remote_port);
835		channel = cdg_search(*cdg_root, channelID);
836		if (channel) {
837			/* check whether last channel has connection to this channel, i.e. subpath already exists in cdg */
838			linklist = last_channel->linklist;
839			while (linklist && linklist->node != channel
840			       && linklist->next)
841				linklist = linklist->next;
842			/* if there is no connection, add one */
843			if (linklist) {
844				if (linklist->node == channel) {
845					set_next_srcdest_pair(linklist,
846							      srcdest);
847				} else {
848					linklist->next =
849					    (cdg_link_t *)
850					    malloc(sizeof(cdg_link_t));
851					if (!linklist->next)
852						goto ERROR;
853					linklist = linklist->next;
854					linklist->node = channel;
855					linklist->num_pairs = 0;
856					linklist->srcdest_pairs = NULL;
857					set_next_srcdest_pair(linklist,
858							      srcdest);
859					linklist->next = NULL;
860				}
861			} else {
862				/* either this is the first channel of the path, or the last channel was a new channel, or last channel was a sink */
863				last_channel->linklist =
864				    (cdg_link_t *) malloc(sizeof(cdg_link_t));
865				if (!last_channel->linklist)
866					goto ERROR;
867				last_channel->linklist->node = channel;
868				last_channel->linklist->num_pairs = 0;
869				last_channel->linklist->srcdest_pairs = NULL;
870				set_next_srcdest_pair(last_channel->linklist,
871						      srcdest);
872				last_channel->linklist->next = NULL;
873			}
874		} else {
875			/* create new channel */
876			channel = (cdg_node_t *) malloc(sizeof(cdg_node_t));
877			if (!channel)
878				goto ERROR;
879			set_default_cdg_node(channel);
880			channel->channelID = channelID;
881			cdg_insert(cdg_root, channel);
882
883			/* go to end of link list of last channel */
884			linklist = last_channel->linklist;
885			while (linklist && linklist->next)
886				linklist = linklist->next;
887			if (linklist) {
888				/* update last link of an existing channel */
889				linklist->next =
890				    (cdg_link_t *) malloc(sizeof(cdg_link_t));
891				if (!linklist->next)
892					goto ERROR;
893				linklist = linklist->next;
894				linklist->node = channel;
895				linklist->num_pairs = 0;
896				linklist->srcdest_pairs = NULL;
897				set_next_srcdest_pair(linklist, srcdest);
898				linklist->next = NULL;
899			} else {
900				/* either this is the first channel of the path, or the last channel was a new channel, or last channel was a sink */
901				last_channel->linklist =
902				    (cdg_link_t *) malloc(sizeof(cdg_link_t));
903				if (!last_channel->linklist)
904					goto ERROR;
905				last_channel->linklist->node = channel;
906				last_channel->linklist->num_pairs = 0;
907				last_channel->linklist->srcdest_pairs = NULL;
908				set_next_srcdest_pair(last_channel->linklist,
909						      srcdest);
910				last_channel->linklist->next = NULL;
911			}
912		}
913		last_channel = channel;
914	}
915
916	if (channel_head->linklist) {
917		if (channel_head->linklist->srcdest_pairs)
918			free(channel_head->linklist->srcdest_pairs);
919		free(channel_head->linklist);
920	}
921	free(channel_head);
922
923	return 0;
924
925ERROR:
926	/* cleanup data and exit */
927	if (channel_head) {
928		if (channel_head->linklist)
929			free(channel_head->linklist);
930		free(channel_head);
931	}
932
933	return 1;
934}
935
936/* calculate the path from source to destination port;
937   the links in the cdg representing this path are decremented to simulate the removal
938*/
939static int remove_path_from_cdg(cdg_node_t ** cdg_root, osm_port_t * src_port,
940				uint16_t slid, osm_port_t * dest_port,
941				uint16_t dlid)
942{
943	osm_node_t *local_node = NULL, *remote_node = NULL;
944	uint16_t local_lid = 0, remote_lid = 0;
945	uint8_t local_port = 0, remote_port = 0;
946	uint64_t channelID = 0;
947
948	cdg_node_t *channel_head = NULL, *channel = NULL, *last_channel = NULL;
949	cdg_link_t *linklist = NULL;
950
951	channel_head = (cdg_node_t *) malloc(sizeof(cdg_node_t));
952	if (!channel_head)
953		goto ERROR;
954	set_default_cdg_node(channel_head);
955	last_channel = channel_head;
956
957	/* if src is a Hca, then the channel from Hca to switch would be a source in the graph
958	   sources can't be part of a cycle -> skip this channel
959	 */
960	remote_node =
961	    osm_node_get_remote_node(src_port->p_node,
962				     src_port->p_physp->port_num, &remote_port);
963
964	while (remote_node && remote_node->sw) {
965		local_node = remote_node;
966		local_port = local_node->sw->new_lft[dlid];
967		/* sanity check: local_port must be set or routing is broken */
968		if (local_port == OSM_NO_PATH)
969			goto ERROR;
970		local_lid = cl_ntoh16(osm_node_get_base_lid(local_node, 0));
971
972		remote_node =
973		    osm_node_get_remote_node(local_node, local_port,
974					     &remote_port);
975		/* if remote_node is a Hca, then the last channel from switch to Hca would be a sink in the cdg -> skip */
976		if (!remote_node || !remote_node->sw)
977			break;
978		remote_lid = cl_ntoh16(osm_node_get_base_lid(remote_node, 0));
979
980		channelID =
981		    (((uint64_t) local_lid) << 48) +
982		    (((uint64_t) local_port) << 32) +
983		    (((uint64_t) remote_lid) << 16) + ((uint64_t) remote_port);
984		channel = cdg_search(*cdg_root, channelID);
985		if (channel) {
986			/* check whether last channel has connection to this channel, i.e. subpath already exists in cdg */
987			linklist = last_channel->linklist;
988			while (linklist && linklist->node != channel
989			       && linklist->next)
990				linklist = linklist->next;
991			/* remove the srcdest from the link */
992			if (linklist) {
993				if (linklist->node == channel) {
994					linklist->removed++;
995				} else {
996					/* may happen if the link is missing (thru cycle detect algorithm) */
997				}
998			} else {
999				/* may happen if the link is missing (thru cycle detect algorithm or last_channel==channel_head (dummy channel)) */
1000			}
1001		} else {
1002			/* must be an error, channels for the path are added before, so a missing channel would be a corrupt data structure */
1003			goto ERROR;
1004		}
1005		last_channel = channel;
1006	}
1007
1008	if (channel_head->linklist)
1009		free(channel_head->linklist);
1010	free(channel_head);
1011
1012	return 0;
1013
1014ERROR:
1015	/* cleanup data and exit */
1016	if (channel_head) {
1017		if (channel_head->linklist)
1018			free(channel_head->linklist);
1019		free(channel_head);
1020	}
1021
1022	return 1;
1023}
1024
1025/**********************************************************************
1026 **********************************************************************/
1027
1028/************ helper functions to generate an ordered list of ports ***
1029 ************ (functions copied from osm_ucast_mgr.c and modified) ****
1030 **********************************************************************/
1031static void add_sw_endports_to_order_list(osm_switch_t * sw,
1032					  osm_ucast_mgr_t * m,
1033					  cl_qmap_t * guid_tbl,
1034					  boolean_t add_guids)
1035{
1036	osm_port_t *port;
1037	ib_net64_t port_guid;
1038	uint64_t sw_guid;
1039	osm_physp_t *p;
1040	int i;
1041	boolean_t found;
1042
1043	for (i = 1; i < sw->num_ports; i++) {
1044		p = osm_node_get_physp_ptr(sw->p_node, i);
1045		if (p && p->p_remote_physp && !p->p_remote_physp->p_node->sw) {
1046			port_guid = p->p_remote_physp->port_guid;
1047			/* check if link is healthy, otherwise ignore CA */
1048			if (!osm_link_is_healthy(p)) {
1049				sw_guid =
1050				    cl_ntoh64(osm_node_get_node_guid
1051					      (sw->p_node));
1052				OSM_LOG(m->p_log, OSM_LOG_INFO,
1053					"WRN AD40: ignoring CA due to unhealthy"
1054					" link from switch 0x%016" PRIx64
1055					" port %" PRIu8 " to CA 0x%016" PRIx64
1056					"\n", sw_guid, i, cl_ntoh64(port_guid));
1057			}
1058			port = osm_get_port_by_guid(m->p_subn, port_guid);
1059			if (!port)
1060				continue;
1061			if (!cl_is_qmap_empty(guid_tbl)) {
1062				found = (cl_qmap_get(guid_tbl, port_guid)
1063					 != cl_qmap_end(guid_tbl));
1064				if ((add_guids && !found)
1065				    || (!add_guids && found))
1066					continue;
1067			}
1068			if (!cl_is_item_in_qlist(&m->port_order_list,
1069						 &port->list_item))
1070				cl_qlist_insert_tail(&m->port_order_list,
1071						     &port->list_item);
1072			else
1073				OSM_LOG(m->p_log, OSM_LOG_INFO,
1074					"WRN AD37: guid 0x%016" PRIx64
1075					" already in list\n", port_guid);
1076		}
1077	}
1078}
1079
1080static void add_guid_to_order_list(uint64_t guid, osm_ucast_mgr_t * m)
1081{
1082	osm_port_t *port = osm_get_port_by_guid(m->p_subn, cl_hton64(guid));
1083
1084	if (!port) {
1085		 OSM_LOG(m->p_log, OSM_LOG_DEBUG,
1086			 "port guid not found: 0x%016" PRIx64 "\n", guid);
1087	}
1088
1089	if (!cl_is_item_in_qlist(&m->port_order_list, &port->list_item))
1090		cl_qlist_insert_tail(&m->port_order_list, &port->list_item);
1091	else
1092		OSM_LOG(m->p_log, OSM_LOG_INFO,
1093			"WRN AD38: guid 0x%016" PRIx64 " already in list\n",
1094			guid);
1095}
1096
1097/* compare function of #Hca attached to a switch for stdlib qsort */
1098static int cmp_num_hca(const void * l1, const void * l2)
1099{
1100	vertex_t *sw1 = *((vertex_t **) l1);
1101	vertex_t *sw2 = *((vertex_t **) l2);
1102	uint32_t num_hca1 = 0, num_hca2 = 0;
1103
1104	if (sw1)
1105		num_hca1 = sw1->num_hca;
1106	if (sw2)
1107		num_hca2 = sw2->num_hca;
1108
1109	if (num_hca1 > num_hca2)
1110		return -1;
1111	else if (num_hca1 < num_hca2)
1112		return 1;
1113	else
1114		return 0;
1115}
1116
1117/* use stdlib to sort the switch array depending on num_hca */
1118static inline void sw_list_sort_by_num_hca(vertex_t ** sw_list,
1119					   uint32_t sw_list_size)
1120{
1121	qsort(sw_list, sw_list_size, sizeof(vertex_t *), cmp_num_hca);
1122}
1123
1124/**********************************************************************
1125 **********************************************************************/
1126
1127/************ helper functions to manage a map of CN and I/O guids ****
1128 **********************************************************************/
1129static int add_guid_to_map(void * cxt, uint64_t guid, char * p)
1130{
1131	cl_qmap_t *map = cxt;
1132	name_map_item_t *item;
1133	name_map_item_t *inserted_item;
1134
1135	item = malloc(sizeof(*item));
1136	if (!item)
1137		return -1;
1138
1139	item->guid = cl_hton64(guid);	/* internal: network byte order */
1140	item->name = NULL;		/* name isn't needed */
1141	inserted_item = (name_map_item_t *) cl_qmap_insert(map, item->guid, &item->item);
1142	if (inserted_item != item)
1143                free(item);
1144
1145	return 0;
1146}
1147
1148static void destroy_guid_map(cl_qmap_t * guid_tbl)
1149{
1150	name_map_item_t *p_guid = NULL, *p_next_guid = NULL;
1151
1152	p_next_guid = (name_map_item_t *) cl_qmap_head(guid_tbl);
1153	while (p_next_guid != (name_map_item_t *) cl_qmap_end(guid_tbl)) {
1154		p_guid = p_next_guid;
1155		p_next_guid = (name_map_item_t *) cl_qmap_next(&p_guid->item);
1156		free(p_guid);
1157	}
1158	cl_qmap_remove_all(guid_tbl);
1159}
1160
1161/**********************************************************************
1162 **********************************************************************/
1163
1164static void dfsssp_print_graph(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1165			       uint32_t size)
1166{
1167	uint32_t i = 0, c = 0;
1168	link_t *link = NULL;
1169
1170	/* index 0 is for the source in dijkstra -> ignore */
1171	for (i = 1; i < size; i++) {
1172		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, "adj_list[%" PRIu32 "]:\n",
1173			i);
1174		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1175			"   guid = 0x%" PRIx64 " lid = %" PRIu16 " (%s)\n",
1176			adj_list[i].guid, adj_list[i].lid,
1177			adj_list[i].sw->p_node->print_desc);
1178		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1179			"   num_hca = %" PRIu32 "\n", adj_list[i].num_hca);
1180
1181		c = 1;
1182		for (link = adj_list[i].links; link != NULL;
1183		     link = link->next, c++) {
1184			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1185				"   link[%" PRIu32 "]:\n", c);
1186			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1187				"      to guid = 0x%" PRIx64 " (%s) port %"
1188				PRIu8 "\n", link->guid,
1189				adj_list[link->to].sw->p_node->print_desc,
1190				link->to_port);
1191			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1192				"      weight on this link = %" PRIu64 "\n",
1193				link->weight);
1194		}
1195	}
1196}
1197
1198/* predefine, to use this in next function */
1199static void dfsssp_context_destroy(void *context);
1200static int dijkstra(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1201		    uint32_t adj_list_size, osm_port_t * port, uint16_t lid);
1202
1203/* traverse subnet to gather information about the connected switches */
1204static int dfsssp_build_graph(void *context)
1205{
1206	dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
1207	osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) (dfsssp_ctx->p_mgr);
1208
1209	cl_qmap_t *port_tbl = &p_mgr->p_subn->port_guid_tbl;	/* 1 management port per switch + 1 or 2 ports for each Hca */
1210	osm_port_t *p_port = NULL;
1211	cl_qmap_t *sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
1212	cl_map_item_t *item = NULL;
1213	osm_switch_t *sw = NULL;
1214	osm_node_t *remote_node = NULL;
1215	uint8_t port = 0, remote_port = 0;
1216	uint32_t i = 0, j = 0, err = 0, undiscov = 0, max_num_undiscov = 0;
1217	uint64_t total_num_hca = 0;
1218	vertex_t *adj_list = NULL;
1219	osm_physp_t *p_physp = NULL;
1220	link_t *link = NULL, *head = NULL;
1221	uint32_t num_sw = 0, adj_list_size = 0;
1222	uint8_t lmc = 0;
1223	uint16_t sm_lid = 0;
1224
1225	OSM_LOG_ENTER(p_mgr->p_log);
1226	OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
1227		"Building graph for df-/sssp routing\n");
1228
1229	/* if this pointer isn't NULL, this is a reroute step;
1230	   old context will be destroyed (adj_list and srcdest2vl_table)
1231	 */
1232	if (dfsssp_ctx->adj_list)
1233		dfsssp_context_destroy(context);
1234
1235	num_sw = cl_qmap_count(sw_tbl);
1236	adj_list_size = num_sw + 1;
1237	/* allocate an adjazenz list (array), 0. element is reserved for the source (Hca) in the routing algo, others are switches */
1238	adj_list = (vertex_t *) malloc(adj_list_size * sizeof(vertex_t));
1239	if (!adj_list) {
1240		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1241			"ERR AD02: cannot allocate memory for adj_list\n");
1242		goto ERROR;
1243	}
1244	for (i = 0; i < adj_list_size; i++)
1245		set_default_vertex(&adj_list[i]);
1246
1247	dfsssp_ctx->adj_list = adj_list;
1248	dfsssp_ctx->adj_list_size = adj_list_size;
1249
1250	/* count the total number of Hca / LIDs (for lmc>0) in the fabric;
1251	   even include base/enhanced switch port 0; base SP0 will have lmc=0
1252	 */
1253	for (item = cl_qmap_head(port_tbl); item != cl_qmap_end(port_tbl);
1254	     item = cl_qmap_next(item)) {
1255		p_port = (osm_port_t *) item;
1256		if (osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_CA ||
1257		    osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_SWITCH) {
1258			lmc = osm_port_get_lmc(p_port);
1259			total_num_hca += (1 << lmc);
1260		}
1261	}
1262
1263	i = 1;			/* fill adj_list -> start with index 1 */
1264	for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
1265	     item = cl_qmap_next(item), i++) {
1266		sw = (osm_switch_t *) item;
1267		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1268			"Processing switch with GUID 0x%" PRIx64 "\n",
1269			cl_ntoh64(osm_node_get_node_guid(sw->p_node)));
1270
1271		adj_list[i].guid =
1272		    cl_ntoh64(osm_node_get_node_guid(sw->p_node));
1273		adj_list[i].lid =
1274		    cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
1275		adj_list[i].sw = sw;
1276
1277		link = (link_t *) malloc(sizeof(link_t));
1278		if (!link) {
1279			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1280				"ERR AD03: cannot allocate memory for a link\n");
1281			goto ERROR;
1282		}
1283		head = link;
1284		head->next = NULL;
1285
1286		/* add SP0 to number of CA connected to a switch */
1287		lmc = osm_node_get_lmc(sw->p_node, 0);
1288		adj_list[i].num_hca += (1 << lmc);
1289
1290		/* iterate over all ports in the switch, start with port 1 (port 0 is a management port) */
1291		for (port = 1; port < sw->num_ports; port++) {
1292			/* get the node behind the port */
1293			remote_node =
1294			    osm_node_get_remote_node(sw->p_node, port,
1295						     &remote_port);
1296			/* if there is no remote node on this port or it's the same switch -> try next port */
1297			if (!remote_node || remote_node->sw == sw)
1298				continue;
1299			/* make sure the link is healthy */
1300			p_physp = osm_node_get_physp_ptr(sw->p_node, port);
1301			if (!p_physp || !osm_link_is_healthy(p_physp))
1302				continue;
1303			/* if there is a Hca connected -> count and cycle */
1304			if (!remote_node->sw) {
1305				lmc = osm_node_get_lmc(remote_node, (uint32_t)remote_port);
1306				adj_list[i].num_hca += (1 << lmc);
1307				continue;
1308			}
1309			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1310				"Node 0x%" PRIx64 ", remote node 0x%" PRIx64
1311				", port %" PRIu8 ", remote port %" PRIu8 "\n",
1312				cl_ntoh64(osm_node_get_node_guid(sw->p_node)),
1313				cl_ntoh64(osm_node_get_node_guid(remote_node)),
1314				port, remote_port);
1315
1316			link->next = (link_t *) malloc(sizeof(link_t));
1317			if (!link->next) {
1318				OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1319					"ERR AD08: cannot allocate memory for a link\n");
1320				while (head) {
1321					link = head;
1322					head = head->next;
1323					free(link);
1324				}
1325				goto ERROR;
1326			}
1327			link = link->next;
1328			set_default_link(link);
1329			link->guid =
1330			    cl_ntoh64(osm_node_get_node_guid(remote_node));
1331			link->from = i;
1332			link->from_port = port;
1333			link->to_port = remote_port;
1334			link->weight = total_num_hca * total_num_hca;	/* initialize with P^2 to force shortest paths */
1335		}
1336
1337		adj_list[i].links = head->next;
1338		free(head);
1339	}
1340	/* connect the links with it's second adjacent node in the list */
1341	for (i = 1; i < adj_list_size; i++) {
1342		link = adj_list[i].links;
1343		while (link) {
1344			for (j = 1; j < adj_list_size; j++) {
1345				if (link->guid == adj_list[j].guid) {
1346					link->to = j;
1347					break;
1348				}
1349			}
1350			link = link->next;
1351		}
1352	}
1353	/* do one dry run to determine connectivity issues */
1354	sm_lid = p_mgr->p_subn->master_sm_base_lid;
1355	p_port = osm_get_port_by_lid(p_mgr->p_subn, sm_lid);
1356	err = dijkstra(p_mgr, adj_list, adj_list_size, p_port, sm_lid);
1357	if (err) {
1358		goto ERROR;
1359	} else {
1360		/* if sm is running on a switch, then dijkstra doesn't
1361		   initialize the used_link for this switch
1362		 */
1363		if (osm_node_get_type(p_port->p_node) != IB_NODE_TYPE_CA)
1364			max_num_undiscov = 1;
1365		for (i = 1; i < adj_list_size; i++)
1366			undiscov += (adj_list[i].used_link) ? 0 : 1;
1367		if (max_num_undiscov < undiscov) {
1368			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1369				"ERR AD0C: unsupported network state (detached"
1370				" and inaccessible switches found; gracefully"
1371				" shutdown this routing engine)\n");
1372			goto ERROR;
1373		}
1374	}
1375	/* print the discovered graph */
1376	if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG))
1377		dfsssp_print_graph(p_mgr, adj_list, adj_list_size);
1378
1379	OSM_LOG_EXIT(p_mgr->p_log);
1380	return 0;
1381
1382ERROR:
1383	dfsssp_context_destroy(context);
1384	return -1;
1385}
1386
1387static void print_routes(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1388			 uint32_t adj_list_size, osm_port_t * port)
1389{
1390	uint32_t i = 0, j = 0;
1391
1392	for (i = 1; i < adj_list_size; i++) {
1393		if (adj_list[i].state == DISCOVERED) {
1394			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1395				"Route from 0x%" PRIx64 " (%s) to 0x%" PRIx64
1396				" (%s):\n", adj_list[i].guid,
1397				adj_list[i].sw->p_node->print_desc,
1398				cl_ntoh64(osm_node_get_node_guid(port->p_node)),
1399				port->p_node->print_desc);
1400			j = i;
1401			while (adj_list[j].used_link) {
1402				if (j > 0) {
1403					OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1404						"   0x%" PRIx64
1405						" (%s) routes thru port %" PRIu8
1406						"\n", adj_list[j].guid,
1407						adj_list[j].sw->p_node->
1408						print_desc,
1409						adj_list[j].used_link->to_port);
1410				} else {
1411					OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1412						"   0x%" PRIx64
1413						" (%s) routes thru port %" PRIu8
1414						"\n", adj_list[j].guid,
1415						port->p_node->print_desc,
1416						adj_list[j].used_link->to_port);
1417				}
1418				j = adj_list[j].used_link->from;
1419			}
1420		}
1421	}
1422}
1423
1424/* dijkstra step from one source to all switches in the df-/sssp graph */
1425static int dijkstra(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1426		    uint32_t adj_list_size, osm_port_t * port, uint16_t lid)
1427{
1428	uint32_t i = 0, j = 0, index = 0;
1429	osm_node_t *remote_node = NULL;
1430	uint8_t remote_port = 0;
1431	vertex_t *current = NULL;
1432	link_t *link = NULL;
1433	uint64_t guid = 0;
1434	binary_heap_t *heap = NULL;
1435	int err = 0;
1436
1437	OSM_LOG_ENTER(p_mgr->p_log);
1438
1439	/* reset all switches for new round with a new source for dijkstra */
1440	for (i = 1; i < adj_list_size; i++) {
1441		adj_list[i].hops = 0;
1442		adj_list[i].used_link = NULL;
1443		adj_list[i].distance = INF;
1444		adj_list[i].state = UNDISCOVERED;
1445	}
1446
1447	/* if behind port is a Hca -> set adj_list[0] */
1448	if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_CA) {
1449		/* save old link to prevent many mallocs after set_default_... */
1450		link = adj_list[0].links;
1451		/* initialize adj_list[0] (the source for the routing, a Hca) */
1452		set_default_vertex(&adj_list[0]);
1453		adj_list[0].guid =
1454		    cl_ntoh64(osm_node_get_node_guid(port->p_node));
1455		adj_list[0].lid = lid;
1456		index = 0;
1457		/* write saved link back to new adj_list[0] */
1458		adj_list[0].links = link;
1459
1460		/* initialize link to neighbor for adj_list[0];
1461		   make sure the link is healthy
1462		 */
1463		if (port->p_physp && osm_link_is_healthy(port->p_physp)) {
1464			remote_node =
1465			    osm_node_get_remote_node(port->p_node,
1466						     port->p_physp->port_num,
1467						     &remote_port);
1468			/* if there is no remote node on this port or it's the same Hca -> ignore */
1469			if (remote_node
1470			    && (osm_node_get_type(remote_node) ==
1471				IB_NODE_TYPE_SWITCH)) {
1472				if (!(adj_list[0].links)) {
1473					adj_list[0].links =
1474					    (link_t *) malloc(sizeof(link_t));
1475					if (!(adj_list[0].links)) {
1476						OSM_LOG(p_mgr->p_log,
1477							OSM_LOG_ERROR,
1478							"ERR AD07: cannot allocate memory for a link\n");
1479						return 1;
1480					}
1481				}
1482				set_default_link(adj_list[0].links);
1483				adj_list[0].links->guid =
1484				    cl_ntoh64(osm_node_get_node_guid
1485					      (remote_node));
1486				adj_list[0].links->from_port =
1487				    port->p_physp->port_num;
1488				adj_list[0].links->to_port = remote_port;
1489				adj_list[0].links->weight = 1;
1490				for (j = 1; j < adj_list_size; j++) {
1491					if (adj_list[0].links->guid ==
1492					    adj_list[j].guid) {
1493						adj_list[0].links->to = j;
1494						break;
1495					}
1496				}
1497			}
1498		} else {
1499			/* if link is unhealthy then there's a severe issue */
1500			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1501				"ERR AD0B: unsupported network state (CA with"
1502				" unhealthy link state discovered; should have"
1503				" been filtered out before already; gracefully"
1504				" shutdown this routing engine)\n");
1505			return 1;
1506		}
1507		/* if behind port is a switch -> search switch in adj_list */
1508	} else {
1509		/* reset adj_list[0], if links=NULL reset was done before, then skip */
1510		if (adj_list[0].links) {
1511			free(adj_list[0].links);
1512			set_default_vertex(&adj_list[0]);
1513		}
1514		/* search for the switch which is the source in this round */
1515		guid = cl_ntoh64(osm_node_get_node_guid(port->p_node));
1516		for (i = 1; i < adj_list_size; i++) {
1517			if (guid == adj_list[i].guid) {
1518				index = i;
1519				break;
1520			}
1521		}
1522	}
1523
1524	/* source in dijkstra */
1525	adj_list[index].distance = 0;
1526	adj_list[index].state = DISCOVERED;
1527	adj_list[index].hops = 0;	/* the source has hop count = 0 */
1528
1529	/* create a heap to find (efficient) the node with the smallest distance */
1530	if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_CA)
1531		err = heap_create(adj_list, adj_list_size, &heap);
1532	else
1533		err = heap_create(&adj_list[1], adj_list_size - 1, &heap);
1534	if (err) {
1535		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1536			"ERR AD09: cannot allocate memory for heap or heap->node in heap_create(...)\n");
1537		return err;
1538	}
1539
1540	current = heap_getmin(heap);
1541	while (current) {
1542		current->state = DISCOVERED;
1543		if (current->used_link)	/* increment the number of hops to the source for each new node */
1544			current->hops =
1545			    adj_list[current->used_link->from].hops + 1;
1546
1547		/* add/update nodes which aren't discovered but accessible */
1548		for (link = current->links; link != NULL; link = link->next) {
1549			if ((adj_list[link->to].state != DISCOVERED)
1550			    && (current->distance + link->weight <
1551				adj_list[link->to].distance)) {
1552				adj_list[link->to].used_link = link;
1553				adj_list[link->to].distance =
1554				    current->distance + link->weight;
1555				heap_heapify(heap, adj_list[link->to].heap_id);
1556			}
1557		}
1558
1559		current = heap_getmin(heap);
1560	}
1561
1562	/* destroy the heap */
1563	heap_free(heap);
1564	heap = NULL;
1565
1566	OSM_LOG_EXIT(p_mgr->p_log);
1567	return 0;
1568}
1569
1570/* update the linear forwarding tables of all switches with the informations
1571   from the last dijsktra step
1572*/
1573static int update_lft(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1574		      uint32_t adj_list_size, osm_port_t * p_port, uint16_t lid)
1575{
1576	uint32_t i = 0;
1577	uint8_t port = 0;
1578	uint8_t hops = 0;
1579	osm_switch_t *p_sw = NULL;
1580	boolean_t is_ignored_by_port_prof = FALSE;
1581	osm_physp_t *p = NULL;
1582	cl_status_t ret;
1583
1584	OSM_LOG_ENTER(p_mgr->p_log);
1585
1586	for (i = 1; i < adj_list_size; i++) {
1587		/* if no route goes thru this switch -> cycle */
1588		if (!(adj_list[i].used_link))
1589			continue;
1590
1591		p_sw = adj_list[i].sw;
1592		hops = adj_list[i].hops;
1593		port = adj_list[i].used_link->to_port;
1594		/* the used_link is the link that was used in dijkstra to reach this node,
1595		   so the to_port is the local port on this node
1596		 */
1597
1598		if (port == OSM_NO_PATH) {	/* if clause shouldn't be possible in this routing, but who cares */
1599			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1600				"ERR AD06: No path to get to LID %" PRIu16
1601				" from switch 0x%" PRIx64 "\n", lid,
1602				cl_ntoh64(osm_node_get_node_guid
1603					  (p_sw->p_node)));
1604
1605			/* do not try to overwrite the ppro of non existing port ... */
1606			is_ignored_by_port_prof = TRUE;
1607			return 1;
1608		} else {
1609			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1610				"Routing LID %" PRIu16 " to port %" PRIu8
1611				" for switch 0x%" PRIx64 "\n", lid, port,
1612				cl_ntoh64(osm_node_get_node_guid
1613					  (p_sw->p_node)));
1614
1615			p = osm_node_get_physp_ptr(p_sw->p_node, port);
1616			if (!p) {
1617				OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1618					"ERR AD0A: Physical port %d of Node GUID 0x%"
1619					PRIx64 "not found\n", port,
1620					cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)));
1621				return 1;
1622			}
1623
1624			/* we would like to optionally ignore this port in equalization
1625			   as in the case of the Mellanox Anafa Internal PCI TCA port
1626			 */
1627			is_ignored_by_port_prof = p->is_prof_ignored;
1628
1629			/* We also would ignore this route if the target lid is of
1630			   a switch and the port_profile_switch_node is not TRUE
1631			 */
1632			if (!p_mgr->p_subn->opt.port_profile_switch_nodes)
1633				is_ignored_by_port_prof |=
1634				    (osm_node_get_type(p_port->p_node) ==
1635				     IB_NODE_TYPE_SWITCH);
1636		}
1637
1638		/* to support lmc > 0 the functions alloc_ports_priv, free_ports_priv, find_and_add_remote_sys
1639		   from minhop aren't needed cause osm_switch_recommend_path is implicitly calculated
1640		   for each LID pair thru dijkstra;
1641		   for each port the dijkstra algorithm calculates (max_lid_ho - min_lid_ho)-times maybe
1642		   disjoint routes to spread the bandwidth -> diffent routes for one port and lmc>0
1643		 */
1644
1645		/* set port in LFT */
1646		p_sw->new_lft[lid] = port;
1647		if (!is_ignored_by_port_prof) {
1648			/* update the number of path routing thru this port */
1649			osm_switch_count_path(p_sw, port);
1650		}
1651		/* set the hop count from this switch to the lid */
1652		ret = osm_switch_set_hops(p_sw, lid, port, hops);
1653		if (ret != CL_SUCCESS)
1654			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1655				"ERR AD05: cannot set hops for LID %" PRIu16
1656				" at switch 0x%" PRIx64 "\n", lid,
1657				cl_ntoh64(osm_node_get_node_guid
1658					  (p_sw->p_node)));
1659	}
1660
1661	OSM_LOG_EXIT(p_mgr->p_log);
1662	return 0;
1663}
1664
1665/* the function updates the multicast group membership information
1666   similar to create_mgrp_switch_map (osm_mcast_mgr.c)
1667   => with it we can identify if a switch needs to be processed
1668   or not in update_mcft
1669*/
1670static void update_mgrp_membership(cl_qlist_t * port_list)
1671{
1672	osm_mcast_work_obj_t *wobj = NULL;
1673	osm_port_t *port = NULL;
1674	osm_switch_t *sw = NULL;
1675	cl_list_item_t *i = NULL;
1676
1677	for (i = cl_qlist_head(port_list); i != cl_qlist_end(port_list);
1678	     i = cl_qlist_next(i)) {
1679		wobj = cl_item_obj(i, wobj, list_item);
1680		port = wobj->p_port;
1681		if (port->p_node->sw) {
1682			sw = port->p_node->sw;
1683			sw->is_mc_member = 1;
1684		} else {
1685			sw = port->p_physp->p_remote_physp->p_node->sw;
1686			sw->num_of_mcm++;
1687		}
1688	}
1689}
1690
1691/* reset is_mc_member and num_of_mcm for future computations */
1692static void reset_mgrp_membership(vertex_t * adj_list, uint32_t adj_list_size)
1693{
1694	uint32_t i = 0;
1695
1696	for (i = 1; i < adj_list_size; i++) {
1697		if (adj_list[i].dropped)
1698			continue;
1699
1700		adj_list[i].sw->is_mc_member = 0;
1701		adj_list[i].sw->num_of_mcm = 0;
1702	}
1703}
1704
1705/* update the multicast forwarding tables of all switches with the informations
1706   from the previous dijsktra step for the current mlid
1707*/
1708static int update_mcft(osm_sm_t * p_sm, vertex_t * adj_list,
1709		       uint32_t adj_list_size, uint16_t mlid_ho,
1710		       cl_qmap_t * port_map, osm_switch_t * root_sw)
1711{
1712	uint32_t i = 0;
1713	uint8_t port = 0, remote_port = 0;
1714	uint8_t upstream_port = 0, downstream_port = 0;
1715	ib_net64_t guid = 0;
1716	osm_switch_t *p_sw = NULL;
1717	osm_node_t *remote_node = NULL;
1718	osm_physp_t *p_physp = NULL;
1719	osm_mcast_tbl_t *p_tbl = NULL;
1720	vertex_t *curr_adj = NULL;
1721
1722	OSM_LOG_ENTER(p_sm->p_log);
1723
1724	for (i = 1; i < adj_list_size; i++) {
1725		if (adj_list[i].dropped)
1726			continue;
1727
1728		p_sw = adj_list[i].sw;
1729		OSM_LOG(p_sm->p_log, OSM_LOG_VERBOSE,
1730			"Processing switch 0x%016" PRIx64
1731			" (%s) for MLID 0x%X\n", cl_ntoh64(adj_list[i].guid),
1732			p_sw->p_node->print_desc, mlid_ho);
1733
1734		/* if a) the switch does not support mcast  or
1735		      b) no ports of this switch are part or the mcast group
1736		   then cycle
1737		 */
1738		if (osm_switch_supports_mcast(p_sw) == FALSE ||
1739		    (p_sw->num_of_mcm == 0 && !(p_sw->is_mc_member)))
1740			continue;
1741
1742		p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
1743
1744		/* add all ports of this sw to the mcast table,
1745		   if they are part of the mcast grp
1746		 */
1747		if (p_sw->is_mc_member)
1748			osm_mcast_tbl_set(p_tbl, mlid_ho, 0);
1749		for (port = 1; port < p_sw->num_ports; port++) {
1750			/* get the node behind the port */
1751			remote_node =
1752				osm_node_get_remote_node(p_sw->p_node, port,
1753							 &remote_port);
1754			/* check if connected and its not the same switch */
1755			if (!remote_node || remote_node->sw == p_sw)
1756				continue;
1757			/* make sure the link is healthy */
1758			p_physp = osm_node_get_physp_ptr(p_sw->p_node, port);
1759			if (!p_physp || !osm_link_is_healthy(p_physp))
1760				continue;
1761			/* we don't add upstream ports in this step */
1762			if (osm_node_get_type(remote_node) != IB_NODE_TYPE_CA)
1763				continue;
1764
1765			guid = osm_physp_get_port_guid(osm_node_get_physp_ptr(
1766						       remote_node,
1767						       remote_port));
1768			if (cl_qmap_get(port_map, guid)
1769			    != cl_qmap_end(port_map))
1770				osm_mcast_tbl_set(p_tbl, mlid_ho, port);
1771		}
1772
1773		/* now we have to add the upstream port of 'this' switch and
1774		   the downstream port of the next switch to the mcast table
1775		   until we reach the root_sw
1776		 */
1777		curr_adj = &adj_list[i];
1778		while (curr_adj->sw != root_sw) {
1779			/* the used_link is the link that was used in dijkstra to reach this node,
1780			   so the to_port is the local (upstream) port on curr_adj->sw
1781			 */
1782			upstream_port = curr_adj->used_link->to_port;
1783			osm_mcast_tbl_set(p_tbl, mlid_ho, upstream_port);
1784
1785			/* now we go one step in direction root_sw and add the
1786			   downstream port for the spanning tree
1787			 */
1788			downstream_port = curr_adj->used_link->from_port;
1789			p_tbl = osm_switch_get_mcast_tbl_ptr(
1790				adj_list[curr_adj->used_link->from].sw);
1791			osm_mcast_tbl_set(p_tbl, mlid_ho, downstream_port);
1792
1793			curr_adj = &adj_list[curr_adj->used_link->from];
1794		}
1795	}
1796
1797	OSM_LOG_EXIT(p_sm->p_log);
1798	return 0;
1799}
1800
1801/* increment the edge weights of the df-/sssp graph which represent the number
1802   of paths on this link
1803*/
1804static void update_weights(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1805			   uint32_t adj_list_size)
1806{
1807	uint32_t i = 0, j = 0;
1808	uint32_t additional_weight = 0;
1809
1810	OSM_LOG_ENTER(p_mgr->p_log);
1811
1812	for (i = 1; i < adj_list_size; i++) {
1813		/* if no route goes thru this switch -> cycle */
1814		if (!(adj_list[i].used_link))
1815			continue;
1816		additional_weight = adj_list[i].num_hca;
1817
1818		j = i;
1819		while (adj_list[j].used_link) {
1820			/* update the link from pre to this node */
1821			adj_list[j].used_link->weight += additional_weight;
1822
1823			j = adj_list[j].used_link->from;
1824		}
1825	}
1826
1827	OSM_LOG_EXIT(p_mgr->p_log);
1828}
1829
1830/* get the largest number of virtual lanes which is supported by all switches
1831   in the subnet
1832*/
1833static uint8_t get_avail_vl_in_subn(osm_ucast_mgr_t * p_mgr)
1834{
1835	uint32_t i = 0;
1836	uint8_t vls_avail = 0xFF, port_vls_avail = 0;
1837	cl_qmap_t *sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
1838	cl_map_item_t *item = NULL;
1839	osm_switch_t *sw = NULL;
1840
1841	/* traverse all switches to get the number of available virtual lanes in the subnet */
1842	for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
1843	     item = cl_qmap_next(item)) {
1844		sw = (osm_switch_t *) item;
1845
1846		/* ignore management port 0 */
1847		for (i = 1; i < osm_node_get_num_physp(sw->p_node); i++) {
1848			osm_physp_t *p_physp =
1849			    osm_node_get_physp_ptr(sw->p_node, i);
1850
1851			if (p_physp && p_physp->p_remote_physp) {
1852				port_vls_avail =
1853				    ib_port_info_get_op_vls(&p_physp->
1854							    port_info);
1855				if (port_vls_avail
1856				    && port_vls_avail < vls_avail)
1857					vls_avail = port_vls_avail;
1858			}
1859		}
1860	}
1861
1862	/* ib_port_info_get_op_vls gives values 1 ... 5 (s. IBAS 14.2.5.6) */
1863	vls_avail = 1 << (vls_avail - 1);
1864
1865	/* set boundaries (s. IBAS 3.5.7) */
1866	if (vls_avail > 15)
1867		vls_avail = 15;
1868	if (vls_avail < 1)
1869		vls_avail = 1;
1870
1871	return vls_avail;
1872}
1873
1874/* search for cycles in the channel dependency graph to identify possible
1875   deadlocks in the network;
1876   assign new virtual lanes to some paths to break the deadlocks
1877*/
1878static int dfsssp_remove_deadlocks(dfsssp_context_t * dfsssp_ctx)
1879{
1880	osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
1881
1882	cl_qlist_t *port_tbl = &p_mgr->port_order_list;	/* 1 management port per switch + 1 or 2 ports for each Hca */
1883	cl_list_item_t *item1 = NULL, *item2 = NULL;
1884	osm_port_t *src_port = NULL, *dest_port = NULL;
1885
1886	uint32_t i = 0, j = 0, err = 0;
1887	uint8_t vl = 0, test_vl = 0, vl_avail = 0, vl_needed = 1;
1888	double most_avg_paths = 0.0;
1889	cdg_node_t **cdg = NULL, *start_here = NULL, *cycle = NULL;
1890	cdg_link_t *weakest_link = NULL;
1891	uint32_t srcdest = 0;
1892
1893	vltable_t *srcdest2vl_table = NULL;
1894	uint8_t lmc = 0;
1895	uint16_t slid = 0, dlid = 0, min_lid_ho = 0, max_lid_ho =
1896	    0, min_lid_ho2 = 0, max_lid_ho2 = 0;;
1897	uint64_t *paths_per_vl = NULL;
1898	uint64_t from = 0, to = 0, count = 0;
1899	uint8_t *split_count = NULL;
1900	uint8_t ntype = 0;
1901
1902	OSM_LOG_ENTER(p_mgr->p_log);
1903	OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
1904		"Assign each src/dest pair a Virtual Lanes, to remove deadlocks in the routing\n");
1905
1906	vl_avail = get_avail_vl_in_subn(p_mgr);
1907	OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
1908		"Virtual Lanes available: %" PRIu8 "\n", vl_avail);
1909
1910	paths_per_vl = (uint64_t *) malloc(vl_avail * sizeof(uint64_t));
1911	if (!paths_per_vl) {
1912		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1913			"ERR AD22: cannot allocate memory for paths_per_vl\n");
1914		return 1;
1915	}
1916	memset(paths_per_vl, 0, vl_avail * sizeof(uint64_t));
1917
1918	cdg = (cdg_node_t **) malloc(vl_avail * sizeof(cdg_node_t *));
1919	if (!cdg) {
1920		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1921			"ERR AD23: cannot allocate memory for cdg\n");
1922		free(paths_per_vl);
1923		return 1;
1924	}
1925	for (i = 0; i < vl_avail; i++)
1926		cdg[i] = NULL;
1927
1928	count = 0;
1929	/* count all ports (also multiple LIDs) of type CA or SP0 for size of VL table */
1930	for (item1 = cl_qlist_head(port_tbl); item1 != cl_qlist_end(port_tbl);
1931	     item1 = cl_qlist_next(item1)) {
1932		dest_port = (osm_port_t *)cl_item_obj(item1, dest_port,
1933						      list_item);
1934		ntype = osm_node_get_type(dest_port->p_node);
1935		if (ntype == IB_NODE_TYPE_CA || ntype == IB_NODE_TYPE_SWITCH) {
1936			/* only SP0 with SLtoVLMapping support will be processed */
1937			if (ntype == IB_NODE_TYPE_SWITCH
1938			    && !(dest_port->p_physp->port_info.capability_mask
1939			    & IB_PORT_CAP_HAS_SL_MAP))
1940				continue;
1941
1942			lmc = osm_port_get_lmc(dest_port);
1943			count += (1 << lmc);
1944		}
1945	}
1946	/* allocate VL table and indexing array */
1947	err = vltable_alloc(&srcdest2vl_table, count);
1948	if (err) {
1949		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1950			"ERR AD26: cannot allocate memory for srcdest2vl_table\n");
1951		goto ERROR;
1952	}
1953
1954	i = 0;
1955	/* fill lids into indexing array */
1956	for (item1 = cl_qlist_head(port_tbl); item1 != cl_qlist_end(port_tbl);
1957	     item1 = cl_qlist_next(item1)) {
1958		dest_port = (osm_port_t *)cl_item_obj(item1, dest_port,
1959						      list_item);
1960		ntype = osm_node_get_type(dest_port->p_node);
1961		if (ntype == IB_NODE_TYPE_CA || ntype == IB_NODE_TYPE_SWITCH) {
1962			/* only SP0 with SLtoVLMapping support will be processed */
1963			if (ntype == IB_NODE_TYPE_SWITCH
1964			    && !(dest_port->p_physp->port_info.capability_mask
1965			    & IB_PORT_CAP_HAS_SL_MAP))
1966				continue;
1967
1968			osm_port_get_lid_range_ho(dest_port, &min_lid_ho,
1969						  &max_lid_ho);
1970			for (dlid = min_lid_ho; dlid <= max_lid_ho; dlid++, i++)
1971				srcdest2vl_table->lids[i] = cl_hton16(dlid);
1972		}
1973	}
1974	/* sort lids */
1975	vltable_sort_lids(srcdest2vl_table);
1976
1977	test_vl = 0;
1978	/* fill cdg[0] with routes from each src/dest port combination for all Hca/SP0 in the subnet */
1979	for (item1 = cl_qlist_head(port_tbl); item1 != cl_qlist_end(port_tbl);
1980	     item1 = cl_qlist_next(item1)) {
1981		dest_port = (osm_port_t *)cl_item_obj(item1, dest_port,
1982						      list_item);
1983		ntype = osm_node_get_type(dest_port->p_node);
1984		if ((ntype != IB_NODE_TYPE_CA && ntype != IB_NODE_TYPE_SWITCH)
1985		    || !(dest_port->p_physp->port_info.capability_mask
1986		    & IB_PORT_CAP_HAS_SL_MAP))
1987			continue;
1988
1989		for (item2 = cl_qlist_head(port_tbl);
1990		     item2 != cl_qlist_end(port_tbl);
1991		     item2 = cl_qlist_next(item2)) {
1992			src_port = (osm_port_t *)cl_item_obj(item2, src_port,
1993							     list_item);
1994			ntype = osm_node_get_type(src_port->p_node);
1995			if ((ntype != IB_NODE_TYPE_CA
1996			    && ntype != IB_NODE_TYPE_SWITCH)
1997			    || !(src_port->p_physp->port_info.capability_mask
1998			    & IB_PORT_CAP_HAS_SL_MAP))
1999				continue;
2000
2001			if (src_port != dest_port) {
2002				/* iterate over LIDs of src and dest port */
2003				osm_port_get_lid_range_ho(src_port, &min_lid_ho,
2004							  &max_lid_ho);
2005				for (slid = min_lid_ho; slid <= max_lid_ho;
2006				     slid++) {
2007					osm_port_get_lid_range_ho
2008					    (dest_port, &min_lid_ho2,
2009					     &max_lid_ho2);
2010					for (dlid = min_lid_ho2;
2011					     dlid <= max_lid_ho2;
2012					     dlid++) {
2013
2014						/* try to add the path to cdg[0] */
2015						err =
2016						    update_channel_dep_graph
2017						    (&(cdg[test_vl]),
2018						     src_port, slid,
2019						     dest_port, dlid);
2020						if (err) {
2021							OSM_LOG(p_mgr->
2022								p_log,
2023								OSM_LOG_ERROR,
2024								"ERR AD14: cannot allocate memory for cdg node or link in update_channel_dep_graph(...)\n");
2025							goto ERROR;
2026						}
2027						/* add the <s,d> combination / corresponding virtual lane to the VL table */
2028						vltable_insert
2029						    (srcdest2vl_table,
2030						     cl_hton16(slid),
2031						     cl_hton16(dlid),
2032						     test_vl);
2033						paths_per_vl[test_vl]++;
2034
2035					}
2036
2037				}
2038			}
2039
2040		}
2041	}
2042	dfsssp_ctx->srcdest2vl_table = srcdest2vl_table;
2043
2044	/* test all cdg for cycles and break the cycles by moving paths on the weakest link to the next cdg */
2045	for (test_vl = 0; test_vl < vl_avail - 1; test_vl++) {
2046		start_here = cdg[test_vl];
2047		while (start_here) {
2048			cycle =
2049			    search_cycle_in_channel_dep_graph(cdg[test_vl],
2050							      start_here);
2051
2052			if (cycle) {
2053				vl_needed = test_vl + 2;
2054
2055				/* calc weakest link n cycle */
2056				weakest_link = get_weakest_link_in_cycle(cycle);
2057				if (!weakest_link) {
2058					OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2059						"ERR AD27: something went wrong in get_weakest_link_in_cycle(...)\n");
2060					err = 1;
2061					goto ERROR;
2062				}
2063
2064				paths_per_vl[test_vl] -=
2065				    weakest_link->num_pairs;
2066				paths_per_vl[test_vl + 1] +=
2067				    weakest_link->num_pairs;
2068
2069				/* move all <s,d> paths on this link to the next cdg */
2070				for (i = 0; i < weakest_link->num_pairs; i++) {
2071					srcdest =
2072					    get_next_srcdest_pair(weakest_link,
2073								  i);
2074					slid = (uint16_t) (srcdest >> 16);
2075					dlid =
2076					    (uint16_t) ((srcdest << 16) >> 16);
2077
2078					/* only move if not moved in a previous step */
2079					if (test_vl !=
2080					    (uint8_t)
2081					    vltable_get_vl(srcdest2vl_table,
2082							   cl_hton16(slid),
2083							   cl_hton16(dlid))) {
2084						/* this path has been moved
2085						   before -> don't count
2086						 */
2087						paths_per_vl[test_vl]++;
2088						paths_per_vl[test_vl + 1]--;
2089						continue;
2090					}
2091
2092					src_port =
2093					    osm_get_port_by_lid(p_mgr->p_subn,
2094								cl_hton16
2095								(slid));
2096					dest_port =
2097					    osm_get_port_by_lid(p_mgr->p_subn,
2098								cl_hton16
2099								(dlid));
2100
2101					/* remove path from current cdg / vl */
2102					err =
2103					    remove_path_from_cdg(&
2104								 (cdg[test_vl]),
2105								 src_port, slid,
2106								 dest_port,
2107								 dlid);
2108					if (err) {
2109						OSM_LOG(p_mgr->p_log,
2110							OSM_LOG_ERROR,
2111							"ERR AD44: something went wrong in remove_path_from_cdg(...)\n");
2112						goto ERROR;
2113					}
2114
2115					/* add path to next cdg / vl */
2116					err =
2117					    update_channel_dep_graph(&
2118								     (cdg
2119								      [test_vl +
2120								       1]),
2121								     src_port,
2122								     slid,
2123								     dest_port,
2124								     dlid);
2125					if (err) {
2126						OSM_LOG(p_mgr->p_log,
2127							OSM_LOG_ERROR,
2128							"ERR AD14: cannot allocate memory for cdg node or link in update_channel_dep_graph(...)\n");
2129						goto ERROR;
2130					}
2131					vltable_insert(srcdest2vl_table,
2132						       cl_hton16(slid),
2133						       cl_hton16(dlid),
2134						       test_vl + 1);
2135				}
2136
2137				if (weakest_link->num_pairs)
2138					free(weakest_link->srcdest_pairs);
2139				if (weakest_link)
2140					free(weakest_link);
2141			}
2142
2143			start_here = cycle;
2144		}
2145	}
2146
2147	/* test the last avail cdg for a cycle;
2148	   if there is one, than vl_needed > vl_avail
2149	 */
2150	start_here = cdg[vl_avail - 1];
2151	if (start_here) {
2152		cycle =
2153		    search_cycle_in_channel_dep_graph(cdg[vl_avail - 1],
2154						      start_here);
2155		if (cycle) {
2156			vl_needed = vl_avail + 1;
2157		}
2158	}
2159
2160	OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2161		"Virtual Lanes needed: %" PRIu8 "\n", vl_needed);
2162	if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_INFO)) {
2163		OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2164			"Paths per VL (before balancing):\n");
2165		for (i = 0; i < vl_avail; i++)
2166			OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2167				"   %" PRIu32 ". lane: %" PRIu64 "\n", i,
2168				paths_per_vl[i]);
2169	}
2170
2171	OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
2172		"Balancing the paths on the available Virtual Lanes\n");
2173
2174	/* optimal balancing virtual lanes, under condition: no additional cycle checks;
2175	   sl/vl != 0 might be assigned to loopback packets (i.e. slid/dlid on the
2176	   same port for lmc>0), but thats no problem, see IBAS 10.2.2.3
2177	 */
2178	split_count = (uint8_t *) calloc(vl_avail, sizeof(uint8_t));
2179	if (!split_count) {
2180		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2181			"ERR AD24: cannot allocate memory for split_count, skip balancing\n");
2182		err = 1;
2183		goto ERROR;
2184	}
2185	/* initial state: paths for VLs won't be separated */
2186	for (i = 0; i < ((vl_needed < vl_avail) ? vl_needed : vl_avail); i++)
2187		split_count[i] = 1;
2188	dfsssp_ctx->vl_split_count = split_count;
2189	/* balancing is necessary if we have empty VLs */
2190	if (vl_needed < vl_avail) {
2191		/* split paths of VLs until we find an equal distribution */
2192		for (i = vl_needed; i < vl_avail; i++) {
2193			/* find VL with most paths in it */
2194			vl = 0;
2195			most_avg_paths = 0.0;
2196			for (test_vl = 0; test_vl < vl_needed; test_vl++) {
2197				if (most_avg_paths <
2198				    ((double)paths_per_vl[test_vl] /
2199				    split_count[test_vl])) {
2200					vl = test_vl;
2201					most_avg_paths =
2202						(double)paths_per_vl[test_vl] /
2203						split_count[test_vl];
2204				}
2205			}
2206			split_count[vl]++;
2207		}
2208		/* change the VL assignment depending on split_count for
2209		   all VLs except VL 0
2210		 */
2211		for (from = vl_needed - 1; from > 0; from--) {
2212			/* how much space needed for others? */
2213			to = 0;
2214			for (i = 0; i < from; i++)
2215				to += split_count[i];
2216			count = paths_per_vl[from];
2217			vltable_change_vl(srcdest2vl_table, from, to, count);
2218			/* change also the information within the split_count
2219			   array; this is important for fast calculation later
2220			 */
2221			split_count[to] = split_count[from];
2222			split_count[from] = 0;
2223			paths_per_vl[to] = paths_per_vl[from];
2224			paths_per_vl[from] = 0;
2225		}
2226	} else if (vl_needed > vl_avail) {
2227		/* routing not possible, a further development would be the LASH-TOR approach (update: LASH-TOR isn't possible, there is a mistake in the theory) */
2228		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2229			"ERR AD25: Not enough VLs available (avail=%d, needed=%d); Stopping dfsssp routing!\n",
2230			vl_avail, vl_needed);
2231		err = 1;
2232		goto ERROR;
2233	}
2234	/* else { no balancing } */
2235
2236	if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG)) {
2237		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2238			"Virtual Lanes per src/dest combination after balancing:\n");
2239		vltable_print(p_mgr, srcdest2vl_table);
2240	}
2241	if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_INFO)) {
2242		OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2243			"Approx. #paths per VL (after balancing):\n");
2244		j = 0;
2245		count = 1; /* to prevent div. by 0 */
2246		for (i = 0; i < vl_avail; i++) {
2247			if (split_count[i] > 0) {
2248				j = i;
2249				count = split_count[i];
2250			}
2251			OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2252				"   %" PRIu32 ". lane: %" PRIu64 "\n", i,
2253				paths_per_vl[j] / count);
2254		}
2255	}
2256
2257	free(paths_per_vl);
2258
2259	/* deallocate channel dependency graphs */
2260	for (i = 0; i < vl_avail; i++)
2261		cdg_dealloc(&cdg[i]);
2262	free(cdg);
2263
2264	OSM_LOG_EXIT(p_mgr->p_log);
2265	return 0;
2266
2267ERROR:
2268	free(paths_per_vl);
2269
2270	for (i = 0; i < vl_avail; i++)
2271		cdg_dealloc(&cdg[i]);
2272	free(cdg);
2273
2274	vltable_dealloc(&srcdest2vl_table);
2275	dfsssp_ctx->srcdest2vl_table = NULL;
2276
2277	return err;
2278}
2279
2280/* meta function which calls subfunctions for dijkstra, update lft and weights,
2281   (and remove deadlocks) to calculate the routing for the subnet
2282*/
2283static int dfsssp_do_dijkstra_routing(void *context)
2284{
2285	dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2286	osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
2287	vertex_t *adj_list = (vertex_t *) dfsssp_ctx->adj_list;
2288	uint32_t adj_list_size = dfsssp_ctx->adj_list_size;
2289
2290	vertex_t **sw_list = NULL;
2291	uint32_t sw_list_size = 0;
2292	uint64_t guid = 0;
2293	cl_qlist_t *qlist = NULL;
2294	cl_list_item_t *qlist_item = NULL;
2295
2296	cl_qmap_t *sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
2297	cl_qmap_t cn_tbl, io_tbl, *p_mixed_tbl = NULL;
2298	cl_map_item_t *item = NULL;
2299	osm_switch_t *sw = NULL;
2300	osm_port_t *port = NULL;
2301	uint32_t i = 0, err = 0;
2302	uint16_t lid = 0, min_lid_ho = 0, max_lid_ho = 0;
2303	uint8_t lmc = 0;
2304	boolean_t cn_nodes_provided = FALSE, io_nodes_provided = FALSE;
2305
2306	OSM_LOG_ENTER(p_mgr->p_log);
2307	OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
2308		"Calculating shortest path from all Hca/switches to all\n");
2309
2310	cl_qmap_init(&cn_tbl);
2311	cl_qmap_init(&io_tbl);
2312	p_mixed_tbl = &cn_tbl;
2313
2314	cl_qlist_init(&p_mgr->port_order_list);
2315
2316	/* reset the new_lft for each switch */
2317	for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
2318	     item = cl_qmap_next(item)) {
2319		sw = (osm_switch_t *) item;
2320		/* initialize LIDs in buffer to invalid port number */
2321		memset(sw->new_lft, OSM_NO_PATH, sw->max_lid_ho + 1);
2322		/* initialize LFT and hop count for bsp0/esp0 of the switch */
2323		min_lid_ho = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
2324		lmc = osm_node_get_lmc(sw->p_node, 0);
2325		for (i = min_lid_ho; i < min_lid_ho + (1 << lmc); i++) {
2326			/* for each switch the port to the 'self'lid is the management port 0 */
2327			sw->new_lft[i] = 0;
2328			/* the hop count to the 'self'lid is 0 for each switch */
2329			osm_switch_set_hops(sw, i, 0, 0);
2330		}
2331	}
2332
2333	/* we need an intermediate array of pointers to switches in adj_list;
2334	   this array will be sorted in respect to num_hca (descending)
2335	 */
2336	sw_list_size = adj_list_size - 1;
2337	sw_list = (vertex_t **)malloc(sw_list_size * sizeof(vertex_t *));
2338	if (!sw_list) {
2339		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2340			"ERR AD29: cannot allocate memory for sw_list in dfsssp_do_dijkstra_routing\n");
2341		goto ERROR;
2342	}
2343	memset(sw_list, 0, sw_list_size * sizeof(vertex_t *));
2344
2345	/* fill the array with references to the 'real' sw in adj_list */
2346	for (i = 0; i < sw_list_size; i++)
2347		sw_list[i] = &(adj_list[i + 1]);
2348
2349	/* sort the sw_list in descending order */
2350	sw_list_sort_by_num_hca(sw_list, sw_list_size);
2351
2352	/* parse compute node guid file, if provided by the user */
2353	if (p_mgr->p_subn->opt.cn_guid_file) {
2354		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2355			"Parsing compute nodes from file %s\n",
2356			p_mgr->p_subn->opt.cn_guid_file);
2357
2358		if (parse_node_map(p_mgr->p_subn->opt.cn_guid_file,
2359				   add_guid_to_map, &cn_tbl)) {
2360			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2361				"ERR AD33: Problem parsing compute node guid file\n");
2362			goto ERROR;
2363		}
2364
2365		if (cl_is_qmap_empty(&cn_tbl))
2366			OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2367				"WRN AD34: compute node guids file contains no valid guids\n");
2368		else
2369			cn_nodes_provided = TRUE;
2370	}
2371
2372	/* parse I/O guid file, if provided by the user */
2373	if (p_mgr->p_subn->opt.io_guid_file) {
2374		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2375			"Parsing I/O nodes from file %s\n",
2376			p_mgr->p_subn->opt.io_guid_file);
2377
2378		if (parse_node_map(p_mgr->p_subn->opt.io_guid_file,
2379				   add_guid_to_map, &io_tbl)) {
2380			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2381				"ERR AD35: Problem parsing I/O guid file\n");
2382			goto ERROR;
2383		}
2384
2385		if (cl_is_qmap_empty(&io_tbl))
2386			OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2387				"WRN AD36: I/O node guids file contains no valid guids\n");
2388		else
2389			io_nodes_provided = TRUE;
2390	}
2391
2392	/* if we mix Hca/Tca/SP0 during the dijkstra routing, we might end up
2393	   in rare cases with a bad balancing for Hca<->Hca connections, i.e.
2394	   some inter-switch links get oversubscribed with paths;
2395	   therefore: add Hca ports first to ensure good Hca<->Hca balancing
2396	 */
2397	if (cn_nodes_provided) {
2398		for (i = 0; i < adj_list_size - 1; i++) {
2399			if (sw_list[i] && sw_list[i]->sw) {
2400				sw = (osm_switch_t *)(sw_list[i]->sw);
2401				add_sw_endports_to_order_list(sw, p_mgr,
2402							      &cn_tbl, TRUE);
2403			} else {
2404				OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2405					"ERR AD30: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2406				goto ERROR;
2407			}
2408		}
2409	}
2410	/* then: add Tca ports to ensure good Hca->Tca balancing and separate
2411	   paths towards I/O nodes on the same switch (if possible)
2412	 */
2413	if (io_nodes_provided) {
2414		for (i = 0; i < adj_list_size - 1; i++) {
2415			if (sw_list[i] && sw_list[i]->sw) {
2416				sw = (osm_switch_t *)(sw_list[i]->sw);
2417				add_sw_endports_to_order_list(sw, p_mgr,
2418							      &io_tbl, TRUE);
2419			} else {
2420				OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2421					"ERR AD32: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2422				goto ERROR;
2423			}
2424		}
2425	}
2426	/* then: add anything else, such as administration nodes, ... */
2427	if (cn_nodes_provided && io_nodes_provided) {
2428		cl_qmap_merge(&cn_tbl, &io_tbl);
2429	} else if (io_nodes_provided) {
2430		p_mixed_tbl = &io_tbl;
2431	}
2432	for (i = 0; i < adj_list_size - 1; i++) {
2433		if (sw_list[i] && sw_list[i]->sw) {
2434			sw = (osm_switch_t *)(sw_list[i]->sw);
2435			add_sw_endports_to_order_list(sw, p_mgr, p_mixed_tbl,
2436						      FALSE);
2437		} else {
2438			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2439				"ERR AD39: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2440			goto ERROR;
2441		}
2442	}
2443	/* last: add SP0 afterwards which have lower priority for balancing */
2444	for (i = 0; i < sw_list_size; i++) {
2445		if (sw_list[i] && sw_list[i]->sw) {
2446			sw = (osm_switch_t *)(sw_list[i]->sw);
2447			guid = cl_ntoh64(osm_node_get_node_guid(sw->p_node));
2448			add_guid_to_order_list(guid, p_mgr);
2449		} else {
2450			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2451				"ERR AD31: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2452			goto ERROR;
2453		}
2454	}
2455
2456	/* the intermediate array lived long enough */
2457	free(sw_list);
2458	sw_list = NULL;
2459	/* same is true for the compute node and I/O guid map */
2460	destroy_guid_map(&cn_tbl);
2461	cn_nodes_provided = FALSE;
2462	destroy_guid_map(&io_tbl);
2463	io_nodes_provided = FALSE;
2464
2465	/* do the routing for the each Hca in the subnet and each switch
2466	   in the subnet (to add the routes to base/enhanced SP0)
2467	 */
2468	qlist = &p_mgr->port_order_list;
2469	for (qlist_item = cl_qlist_head(qlist);
2470	     qlist_item != cl_qlist_end(qlist);
2471	     qlist_item = cl_qlist_next(qlist_item)) {
2472		port = (osm_port_t *)cl_item_obj(qlist_item, port, list_item);
2473
2474		/* calculate shortest path with dijkstra from node to all switches/Hca */
2475		if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_CA) {
2476			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2477				"Processing Hca with GUID 0x%" PRIx64 "\n",
2478				cl_ntoh64(osm_node_get_node_guid
2479					  (port->p_node)));
2480		} else if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_SWITCH) {
2481			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2482				"Processing switch with GUID 0x%" PRIx64 "\n",
2483				cl_ntoh64(osm_node_get_node_guid
2484					  (port->p_node)));
2485		} else {
2486			/* we don't handle routers, in case they show up */
2487			continue;
2488		}
2489
2490		/* distribute the LID range across the ports that can reach those LIDs
2491		   to have disjoint paths for one destination port with lmc>0;
2492		   for switches with bsp0: min=max; with esp0: max>min if lmc>0
2493		 */
2494		osm_port_get_lid_range_ho(port, &min_lid_ho,
2495					  &max_lid_ho);
2496		for (lid = min_lid_ho; lid <= max_lid_ho; lid++) {
2497			/* do dijkstra from this Hca/LID/SP0 to each switch */
2498			err =
2499			    dijkstra(p_mgr, adj_list, adj_list_size, port, lid);
2500			if (err)
2501				goto ERROR;
2502			if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG))
2503				print_routes(p_mgr, adj_list, adj_list_size,
2504					     port);
2505
2506			/* make an update for the linear forwarding tables of the switches */
2507			err =
2508			    update_lft(p_mgr, adj_list, adj_list_size, port, lid);
2509			if (err)
2510				goto ERROR;
2511
2512			/* add weights for calculated routes to adjust the weights for the next cycle */
2513			update_weights(p_mgr, adj_list, adj_list_size);
2514
2515			if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG))
2516				dfsssp_print_graph(p_mgr, adj_list,
2517						   adj_list_size);
2518		}
2519	}
2520
2521	/* try deadlock removal only for the dfsssp routing (not for the sssp case, which is a subset of the dfsssp algorithm) */
2522	if (dfsssp_ctx->routing_type == OSM_ROUTING_ENGINE_TYPE_DFSSSP) {
2523		/* remove potential deadlocks by assigning different virtual lanes to src/dest paths and balance the lanes */
2524		err = dfsssp_remove_deadlocks(dfsssp_ctx);
2525		if (err)
2526			goto ERROR;
2527	} else if (dfsssp_ctx->routing_type == OSM_ROUTING_ENGINE_TYPE_SSSP) {
2528		OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2529			"SSSP routing specified -> skipping deadlock removal thru dfsssp_remove_deadlocks(...)\n");
2530	} else {
2531		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2532			"ERR AD28: wrong routing engine specified in dfsssp_ctx\n");
2533		goto ERROR;
2534	}
2535
2536	/* list not needed after the dijkstra steps and deadlock removal */
2537	cl_qlist_remove_all(&p_mgr->port_order_list);
2538
2539	/* print the new_lft for each switch after routing is done */
2540	if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG)) {
2541		for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
2542		     item = cl_qmap_next(item)) {
2543			sw = (osm_switch_t *) item;
2544			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2545				"Summary of the (new) LFT for switch 0x%" PRIx64
2546				" (%s):\n",
2547				cl_ntoh64(osm_node_get_node_guid(sw->p_node)),
2548				sw->p_node->print_desc);
2549			for (i = 0; i < sw->max_lid_ho + 1; i++)
2550				if (sw->new_lft[i] != OSM_NO_PATH) {
2551					OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2552						"   for LID=%" PRIu32
2553						" use port=%" PRIu8 "\n", i,
2554						sw->new_lft[i]);
2555				}
2556		}
2557	}
2558
2559	OSM_LOG_EXIT(p_mgr->p_log);
2560	return 0;
2561
2562ERROR:
2563	if (!cl_is_qlist_empty(&p_mgr->port_order_list))
2564		cl_qlist_remove_all(&p_mgr->port_order_list);
2565	if (cn_nodes_provided)
2566		destroy_guid_map(&cn_tbl);
2567	if (io_nodes_provided)
2568		destroy_guid_map(&io_tbl);
2569	if (sw_list)
2570		free(sw_list);
2571	return -1;
2572}
2573
2574/* meta function which calls subfunctions for finding the optimal switch
2575   for the spanning tree, performing a dijkstra step with this sw as root,
2576   and calculating the mcast table for MLID
2577*/
2578static ib_api_status_t dfsssp_do_mcast_routing(void * context,
2579					       osm_mgrp_box_t * mbox)
2580{
2581	dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2582	osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
2583	osm_sm_t *sm = (osm_sm_t *) p_mgr->sm;
2584	vertex_t *adj_list = (vertex_t *) dfsssp_ctx->adj_list;
2585	uint32_t adj_list_size = dfsssp_ctx->adj_list_size;
2586	cl_qlist_t mcastgrp_port_list;
2587	cl_qmap_t mcastgrp_port_map;
2588	osm_switch_t *root_sw = NULL, *p_sw = NULL;
2589	osm_port_t *port = NULL;
2590	ib_net16_t lid = 0;
2591	uint32_t err = 0, num_ports = 0, i = 0;
2592	ib_net64_t guid = 0;
2593	ib_api_status_t status = IB_SUCCESS;
2594
2595	OSM_LOG_ENTER(sm->p_log);
2596
2597	/* using the ucast cache feature with dfsssp might mean that a leaf sw
2598	   got removed (and got back) without calling dfsssp_build_graph
2599	   and therefore the adj_list (and pointers to osm's internal switches)
2600	   could be outdated (here we have no knowledge if it has happened, so
2601	   unfortunately a check is necessary... still better than rebuilding
2602	   adj_list every time we arrive here)
2603	 */
2604	if (p_mgr->p_subn->opt.use_ucast_cache && p_mgr->cache_valid) {
2605		for (i = 1; i < adj_list_size; i++) {
2606			guid = cl_hton64(adj_list[i].guid);
2607			p_sw = osm_get_switch_by_guid(p_mgr->p_subn, guid);
2608			if (p_sw) {
2609				/* check if switch came back from the dead */
2610				if (adj_list[i].dropped)
2611					adj_list[i].dropped = FALSE;
2612
2613				/* verify that sw object has not been moved
2614				   (this can happen for a leaf switch, if it
2615				   was dropped and came back later without a
2616				   rerouting), otherwise we have to update
2617				   dfsssp's internal switch list with the new
2618				   sw pointer
2619				 */
2620				if (p_sw == adj_list[i].sw)
2621					continue;
2622				else
2623					adj_list[i].sw = p_sw;
2624			} else {
2625				/* if a switch from adj_list is not in the
2626				   sw_guid_tbl anymore, then the only reason is
2627				   that it was a leaf switch and opensm dropped
2628				   it without calling a rerouting
2629				   -> calling dijkstra is no problem, since it
2630				      is a leaf and different from root_sw
2631				   -> only update_mcft and reset_mgrp_membership
2632				      need to be aware of these dropped switches
2633				 */
2634				if (!adj_list[i].dropped)
2635					adj_list[i].dropped = TRUE;
2636			}
2637		}
2638	}
2639
2640	/* create a map and a list of all ports which are member in the mcast
2641	   group; map for searching elements and list for iteration
2642	 */
2643	if (osm_mcast_make_port_list_and_map(&mcastgrp_port_list,
2644					     &mcastgrp_port_map, mbox)) {
2645		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD50: "
2646			"Insufficient memory to make port list\n");
2647		status = IB_ERROR;
2648		goto Exit;
2649	}
2650
2651	num_ports = cl_qlist_count(&mcastgrp_port_list);
2652	if (num_ports < 2) {
2653		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
2654			"MLID 0x%X has %u members - nothing to do\n",
2655			mbox->mlid, num_ports);
2656		goto Exit;
2657	}
2658
2659	/* find the root switch for the spanning tree, which has the smallest
2660	   hops count to all LIDs in the mcast group
2661	 */
2662	root_sw = osm_mcast_mgr_find_root_switch(sm, &mcastgrp_port_list);
2663	if (!root_sw) {
2664		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD51: "
2665			"Unable to locate a suitable switch for group 0x%X\n",
2666			mbox->mlid);
2667		status = IB_ERROR;
2668		goto Exit;
2669	}
2670
2671	/* a) start one dijkstra step from the root switch to generate a
2672	   spanning tree
2673	   b) this might be a bit of an overkill to span the whole
2674	   network, if there are only a few ports in the mcast group, but
2675	   its only one dijkstra step for each mcast group and we did many
2676	   steps before in the ucast routing for each LID in the subnet;
2677	   c) we can use the subnet structure from the ucast routing, and
2678	   don't even have to reset the link weights (=> therefore the mcast
2679	   spanning tree will use less 'growded' links in the network)
2680	   d) the mcast dfsssp algorithm will not change the link weights
2681	 */
2682	lid = osm_node_get_base_lid(root_sw->p_node, 0);
2683	port = osm_get_port_by_lid(sm->p_subn, lid);
2684	err = dijkstra(p_mgr, adj_list, adj_list_size, port, lid);
2685	if (err) {
2686		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD52: "
2687			"Dijkstra step for mcast failed for group 0x%X\n",
2688			mbox->mlid);
2689		status = IB_ERROR;
2690		goto Exit;
2691	}
2692
2693	/* set mcast group membership again for update_mcft
2694	   (unfortunately: osm_mcast_mgr_find_root_switch resets it)
2695	 */
2696	update_mgrp_membership(&mcastgrp_port_list);
2697
2698	/* update the mcast forwarding tables of the switches */
2699	err = update_mcft(sm, adj_list, adj_list_size, mbox->mlid,
2700			  &mcastgrp_port_map, root_sw);
2701	if (err) {
2702		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD53: "
2703			"Update of mcast forwarding tables failed for group 0x%X\n",
2704			mbox->mlid);
2705		status = IB_ERROR;
2706		goto Exit;
2707	}
2708
2709Exit:
2710	reset_mgrp_membership(adj_list, adj_list_size);
2711	osm_mcast_drop_port_list(&mcastgrp_port_list);
2712	OSM_LOG_EXIT(sm->p_log);
2713	return status;
2714}
2715
2716/* called from extern in QP creation process to gain the the service level and
2717   the virtual lane respectively for a <s,d> pair
2718*/
2719static uint8_t get_dfsssp_sl(void *context, uint8_t hint_for_default_sl,
2720			     const ib_net16_t slid, const ib_net16_t dlid)
2721{
2722	dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2723	osm_port_t *src_port, *dest_port;
2724	vltable_t *srcdest2vl_table = NULL;
2725	uint8_t *vl_split_count = NULL;
2726	osm_ucast_mgr_t *p_mgr = NULL;
2727	int32_t res = 0;
2728
2729	if (dfsssp_ctx
2730	    && dfsssp_ctx->routing_type == OSM_ROUTING_ENGINE_TYPE_DFSSSP) {
2731		p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
2732		srcdest2vl_table = (vltable_t *) (dfsssp_ctx->srcdest2vl_table);
2733		vl_split_count = (uint8_t *) (dfsssp_ctx->vl_split_count);
2734	}
2735	else
2736		return hint_for_default_sl;
2737
2738	src_port = osm_get_port_by_lid(p_mgr->p_subn, slid);
2739	if (!src_port)
2740		return hint_for_default_sl;
2741
2742	dest_port = osm_get_port_by_lid(p_mgr->p_subn, dlid);
2743	if (!dest_port)
2744		return hint_for_default_sl;
2745
2746	if (!srcdest2vl_table)
2747		return hint_for_default_sl;
2748
2749	res = vltable_get_vl(srcdest2vl_table, slid, dlid);
2750
2751	/* we will randomly distribute the traffic over multiple VLs if
2752	   necessary for good balancing; therefore vl_split_count provides
2753	   the number of VLs to use for certain traffic
2754	 */
2755	if (res > -1) {
2756		if (vl_split_count[res] > 1)
2757			return (uint8_t) (res + rand()%(vl_split_count[res]));
2758		else
2759			return (uint8_t) res;
2760	} else
2761		return hint_for_default_sl;
2762}
2763
2764static dfsssp_context_t *dfsssp_context_create(osm_opensm_t * p_osm,
2765					       osm_routing_engine_type_t
2766					       routing_type)
2767{
2768	dfsssp_context_t *dfsssp_ctx = NULL;
2769
2770	/* allocate memory */
2771	dfsssp_ctx = (dfsssp_context_t *) malloc(sizeof(dfsssp_context_t));
2772	if (dfsssp_ctx) {
2773		/* set initial values */
2774		dfsssp_ctx->routing_type = routing_type;
2775		dfsssp_ctx->p_mgr = (osm_ucast_mgr_t *) & (p_osm->sm.ucast_mgr);
2776		dfsssp_ctx->adj_list = NULL;
2777		dfsssp_ctx->adj_list_size = 0;
2778		dfsssp_ctx->srcdest2vl_table = NULL;
2779		dfsssp_ctx->vl_split_count = NULL;
2780	} else {
2781		OSM_LOG(p_osm->sm.ucast_mgr.p_log, OSM_LOG_ERROR,
2782			"ERR AD04: cannot allocate memory for dfsssp_ctx in dfsssp_context_create\n");
2783		return NULL;
2784	}
2785
2786	return dfsssp_ctx;
2787}
2788
2789static void dfsssp_context_destroy(void *context)
2790{
2791	dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2792	vertex_t *adj_list = (vertex_t *) (dfsssp_ctx->adj_list);
2793	uint32_t i = 0;
2794	link_t *link = NULL, *tmp = NULL;
2795
2796	/* free adj_list */
2797	for (i = 0; i < dfsssp_ctx->adj_list_size; i++) {
2798		link = adj_list[i].links;
2799		while (link) {
2800			tmp = link;
2801			link = link->next;
2802			free(tmp);
2803		}
2804	}
2805	free(adj_list);
2806	dfsssp_ctx->adj_list = NULL;
2807	dfsssp_ctx->adj_list_size = 0;
2808
2809	/* free srcdest2vl table and the split count information table
2810	   (can be done, because dfsssp_context_destroy is called after
2811	    osm_get_dfsssp_sl)
2812	 */
2813	vltable_dealloc(&(dfsssp_ctx->srcdest2vl_table));
2814	dfsssp_ctx->srcdest2vl_table = NULL;
2815
2816	if (dfsssp_ctx->vl_split_count) {
2817		free(dfsssp_ctx->vl_split_count);
2818		dfsssp_ctx->vl_split_count = NULL;
2819	}
2820}
2821
2822static void delete(void *context)
2823{
2824	if (!context)
2825		return;
2826	dfsssp_context_destroy(context);
2827
2828	free(context);
2829}
2830
2831int osm_ucast_dfsssp_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
2832{
2833	/* create context container and add ucast management object */
2834	dfsssp_context_t *dfsssp_context =
2835	    dfsssp_context_create(p_osm, OSM_ROUTING_ENGINE_TYPE_DFSSSP);
2836	if (!dfsssp_context) {
2837		return 1;	/* alloc failed -> skip this routing */
2838	}
2839
2840	/* reset function pointers to dfsssp routines */
2841	r->context = (void *)dfsssp_context;
2842	r->build_lid_matrices = dfsssp_build_graph;
2843	r->ucast_build_fwd_tables = dfsssp_do_dijkstra_routing;
2844	r->mcast_build_stree = dfsssp_do_mcast_routing;
2845	r->path_sl = get_dfsssp_sl;
2846	r->destroy = delete;
2847
2848	/* we initialize with the current time to achieve a 'good' randomized
2849	   assignment in get_dfsssp_sl(...)
2850	 */
2851	srand(time(NULL));
2852
2853	return 0;
2854}
2855
2856int osm_ucast_sssp_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
2857{
2858	/* create context container and add ucast management object */
2859	dfsssp_context_t *dfsssp_context =
2860	    dfsssp_context_create(p_osm, OSM_ROUTING_ENGINE_TYPE_SSSP);
2861	if (!dfsssp_context) {
2862		return 1;	/* alloc failed -> skip this routing */
2863	}
2864
2865	/* reset function pointers to sssp routines */
2866	r->context = (void *)dfsssp_context;
2867	r->build_lid_matrices = dfsssp_build_graph;
2868	r->ucast_build_fwd_tables = dfsssp_do_dijkstra_routing;
2869	r->mcast_build_stree = dfsssp_do_mcast_routing;
2870	r->destroy = delete;
2871
2872	return 0;
2873}
2874