1/* 2 * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved. 3 * Copyright (c) 2002-2007 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the 10 * OpenIB.org BSD license below: 11 * 12 * Redistribution and use in source and binary forms, with or 13 * without modification, are permitted provided that the following 14 * conditions are met: 15 * 16 * - Redistributions of source code must retain the above 17 * copyright notice, this list of conditions and the following 18 * disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials 23 * provided with the distribution. 24 * 25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 32 * SOFTWARE. 33 * 34 */ 35 36/* 37 * Abstract: 38 * Implementation of OpenSM FatTree routing 39 */ 40 41#if HAVE_CONFIG_H 42# include <config.h> 43#endif 44 45#include <stdlib.h> 46#include <string.h> 47#include <ctype.h> 48#include <errno.h> 49#include <iba/ib_types.h> 50#include <complib/cl_qmap.h> 51#include <complib/cl_debug.h> 52#include <opensm/osm_opensm.h> 53#include <opensm/osm_switch.h> 54 55/* 56 * FatTree rank is bounded between 2 and 8: 57 * - Tree of rank 1 has only trivial routing paths, 58 * so no need to use FatTree routing. 59 * - Why maximum rank is 8: 60 * Each node (switch) is assigned a unique tuple. 61 * Switches are stored in two cl_qmaps - one is 62 * ordered by guid, and the other by a key that is 63 * generated from tuple. Since cl_qmap supports only 64 * a 64-bit key, the maximal tuple lenght is 8 bytes. 65 * which means that maximal tree rank is 8. 66 * Note that the above also implies that each switch 67 * can have at max 255 up/down ports. 68 */ 69 70#define FAT_TREE_MIN_RANK 2 71#define FAT_TREE_MAX_RANK 8 72 73typedef enum { 74 FTREE_DIRECTION_DOWN = -1, 75 FTREE_DIRECTION_SAME, 76 FTREE_DIRECTION_UP 77} ftree_direction_t; 78 79/*************************************************** 80 ** 81 ** Forward references 82 ** 83 ***************************************************/ 84 85struct ftree_sw_t_; 86struct ftree_hca_t_; 87struct ftree_port_t_; 88struct ftree_port_group_t_; 89struct ftree_fabric_t_; 90 91/*************************************************** 92 ** 93 ** ftree_tuple_t definition 94 ** 95 ***************************************************/ 96 97#define FTREE_TUPLE_BUFF_LEN 1024 98#define FTREE_TUPLE_LEN 8 99 100typedef uint8_t ftree_tuple_t[FTREE_TUPLE_LEN]; 101typedef uint64_t ftree_tuple_key_t; 102 103struct guid_list_item { 104 cl_list_item_t list; 105 uint64_t guid; 106}; 107 108/*************************************************** 109 ** 110 ** ftree_sw_table_element_t definition 111 ** 112 ***************************************************/ 113 114typedef struct { 115 cl_map_item_t map_item; 116 struct ftree_sw_t_ *p_sw; 117} ftree_sw_tbl_element_t; 118 119/*************************************************** 120 ** 121 ** ftree_port_t definition 122 ** 123 ***************************************************/ 124 125typedef struct ftree_port_t_ { 126 cl_map_item_t map_item; 127 uint8_t port_num; /* port number on the current node */ 128 uint8_t remote_port_num; /* port number on the remote node */ 129 uint32_t counter_up; /* number of allocated routs upwards */ 130 uint32_t counter_down; /* number of allocated routs downwards */ 131} ftree_port_t; 132 133/*************************************************** 134 ** 135 ** ftree_port_group_t definition 136 ** 137 ***************************************************/ 138 139typedef union ftree_hca_or_sw_ { 140 struct ftree_hca_t_ *p_hca; 141 struct ftree_sw_t_ *p_sw; 142} ftree_hca_or_sw; 143 144typedef struct ftree_port_group_t_ { 145 cl_map_item_t map_item; 146 ib_net16_t base_lid; /* base lid of the current node */ 147 ib_net16_t remote_base_lid; /* base lid of the remote node */ 148 ib_net64_t port_guid; /* port guid of this port */ 149 ib_net64_t node_guid; /* this node's guid */ 150 uint8_t node_type; /* this node's type */ 151 ib_net64_t remote_port_guid; /* port guid of the remote port */ 152 ib_net64_t remote_node_guid; /* node guid of the remote node */ 153 uint8_t remote_node_type; /* IB_NODE_TYPE_{CA,SWITCH,ROUTER,...} */ 154 ftree_hca_or_sw hca_or_sw; /* pointer to this hca/switch */ 155 ftree_hca_or_sw remote_hca_or_sw; /* pointer to remote hca/switch */ 156 cl_ptr_vector_t ports; /* vector of ports to the same lid */ 157 boolean_t is_cn; /* whether this port is a compute node */ 158 uint32_t counter_down; /* number of allocated routs downwards */ 159} ftree_port_group_t; 160 161/*************************************************** 162 ** 163 ** ftree_sw_t definition 164 ** 165 ***************************************************/ 166 167typedef struct ftree_sw_t_ { 168 cl_map_item_t map_item; 169 osm_switch_t *p_osm_sw; 170 uint32_t rank; 171 ftree_tuple_t tuple; 172 ib_net16_t base_lid; 173 ftree_port_group_t **down_port_groups; 174 uint8_t down_port_groups_num; 175 ftree_port_group_t **up_port_groups; 176 uint8_t up_port_groups_num; 177 boolean_t is_leaf; 178 int down_port_groups_idx; 179} ftree_sw_t; 180 181/*************************************************** 182 ** 183 ** ftree_hca_t definition 184 ** 185 ***************************************************/ 186 187typedef struct ftree_hca_t_ { 188 cl_map_item_t map_item; 189 osm_node_t *p_osm_node; 190 ftree_port_group_t **up_port_groups; 191 uint16_t up_port_groups_num; 192 unsigned cn_num; 193} ftree_hca_t; 194 195/*************************************************** 196 ** 197 ** ftree_fabric_t definition 198 ** 199 ***************************************************/ 200 201typedef struct ftree_fabric_t_ { 202 osm_opensm_t *p_osm; 203 cl_qmap_t hca_tbl; 204 cl_qmap_t sw_tbl; 205 cl_qmap_t sw_by_tuple_tbl; 206 cl_qlist_t root_guid_list; 207 cl_qmap_t cn_guid_tbl; 208 unsigned cn_num; 209 uint8_t leaf_switch_rank; 210 uint8_t max_switch_rank; 211 ftree_sw_t **leaf_switches; 212 uint32_t leaf_switches_num; 213 uint16_t max_cn_per_leaf; 214 uint16_t lft_max_lid_ho; 215 boolean_t fabric_built; 216} ftree_fabric_t; 217 218/*************************************************** 219 ** 220 ** comparators 221 ** 222 ***************************************************/ 223 224static int OSM_CDECL __osm_ftree_compare_switches_by_index(IN const void *p1, 225 IN const void *p2) 226{ 227 ftree_sw_t **pp_sw1 = (ftree_sw_t **) p1; 228 ftree_sw_t **pp_sw2 = (ftree_sw_t **) p2; 229 230 uint16_t i; 231 for (i = 0; i < FTREE_TUPLE_LEN; i++) { 232 if ((*pp_sw1)->tuple[i] > (*pp_sw2)->tuple[i]) 233 return 1; 234 if ((*pp_sw1)->tuple[i] < (*pp_sw2)->tuple[i]) 235 return -1; 236 } 237 return 0; 238} 239 240/***************************************************/ 241 242static int OSM_CDECL 243__osm_ftree_compare_port_groups_by_remote_switch_index(IN const void *p1, 244 IN const void *p2) 245{ 246 ftree_port_group_t **pp_g1 = (ftree_port_group_t **) p1; 247 ftree_port_group_t **pp_g2 = (ftree_port_group_t **) p2; 248 249 return 250 __osm_ftree_compare_switches_by_index(& 251 ((*pp_g1)->remote_hca_or_sw. 252 p_sw), 253 &((*pp_g2)->remote_hca_or_sw. 254 p_sw)); 255} 256 257/*************************************************** 258 ** 259 ** ftree_tuple_t functions 260 ** 261 ***************************************************/ 262 263static void __osm_ftree_tuple_init(IN ftree_tuple_t tuple) 264{ 265 memset(tuple, 0xFF, FTREE_TUPLE_LEN); 266} 267 268/***************************************************/ 269 270static inline boolean_t __osm_ftree_tuple_assigned(IN ftree_tuple_t tuple) 271{ 272 return (tuple[0] != 0xFF); 273} 274 275/***************************************************/ 276 277#define FTREE_TUPLE_BUFFERS_NUM 6 278 279static char *__osm_ftree_tuple_to_str(IN ftree_tuple_t tuple) 280{ 281 static char buffer[FTREE_TUPLE_BUFFERS_NUM][FTREE_TUPLE_BUFF_LEN]; 282 static uint8_t ind = 0; 283 char *ret_buffer; 284 uint32_t i; 285 286 if (!__osm_ftree_tuple_assigned(tuple)) 287 return "INDEX.NOT.ASSIGNED"; 288 289 buffer[ind][0] = '\0'; 290 291 for (i = 0; (i < FTREE_TUPLE_LEN) && (tuple[i] != 0xFF); i++) { 292 if ((strlen(buffer[ind]) + 10) > FTREE_TUPLE_BUFF_LEN) 293 return "INDEX.TOO.LONG"; 294 if (i != 0) 295 strcat(buffer[ind], "."); 296 sprintf(&buffer[ind][strlen(buffer[ind])], "%u", tuple[i]); 297 } 298 299 ret_buffer = buffer[ind]; 300 ind = (ind + 1) % FTREE_TUPLE_BUFFERS_NUM; 301 return ret_buffer; 302} /* __osm_ftree_tuple_to_str() */ 303 304/***************************************************/ 305 306static inline ftree_tuple_key_t __osm_ftree_tuple_to_key(IN ftree_tuple_t tuple) 307{ 308 ftree_tuple_key_t key; 309 memcpy(&key, tuple, FTREE_TUPLE_LEN); 310 return key; 311} 312 313/***************************************************/ 314 315static inline void __osm_ftree_tuple_from_key(IN ftree_tuple_t tuple, 316 IN ftree_tuple_key_t key) 317{ 318 memcpy(tuple, &key, FTREE_TUPLE_LEN); 319} 320 321/*************************************************** 322 ** 323 ** ftree_sw_tbl_element_t functions 324 ** 325 ***************************************************/ 326 327static ftree_sw_tbl_element_t *__osm_ftree_sw_tbl_element_create(IN ftree_sw_t * 328 p_sw) 329{ 330 ftree_sw_tbl_element_t *p_element = 331 (ftree_sw_tbl_element_t *) malloc(sizeof(ftree_sw_tbl_element_t)); 332 if (!p_element) 333 return NULL; 334 memset(p_element, 0, sizeof(ftree_sw_tbl_element_t)); 335 336 p_element->p_sw = p_sw; 337 return p_element; 338} 339 340/***************************************************/ 341 342static void __osm_ftree_sw_tbl_element_destroy(IN ftree_sw_tbl_element_t * 343 p_element) 344{ 345 if (!p_element) 346 return; 347 free(p_element); 348} 349 350/*************************************************** 351 ** 352 ** ftree_port_t functions 353 ** 354 ***************************************************/ 355 356static ftree_port_t *__osm_ftree_port_create(IN uint8_t port_num, 357 IN uint8_t remote_port_num) 358{ 359 ftree_port_t *p_port = (ftree_port_t *) malloc(sizeof(ftree_port_t)); 360 if (!p_port) 361 return NULL; 362 memset(p_port, 0, sizeof(ftree_port_t)); 363 364 p_port->port_num = port_num; 365 p_port->remote_port_num = remote_port_num; 366 367 return p_port; 368} 369 370/***************************************************/ 371 372static void __osm_ftree_port_destroy(IN ftree_port_t * p_port) 373{ 374 if (p_port) 375 free(p_port); 376} 377 378/*************************************************** 379 ** 380 ** ftree_port_group_t functions 381 ** 382 ***************************************************/ 383 384static ftree_port_group_t * 385__osm_ftree_port_group_create(IN ib_net16_t base_lid, 386 IN ib_net16_t remote_base_lid, 387 IN ib_net64_t port_guid, 388 IN ib_net64_t node_guid, 389 IN uint8_t node_type, 390 IN void *p_hca_or_sw, 391 IN ib_net64_t remote_port_guid, 392 IN ib_net64_t remote_node_guid, 393 IN uint8_t remote_node_type, 394 IN void *p_remote_hca_or_sw, 395 IN boolean_t is_cn) 396{ 397 ftree_port_group_t *p_group = 398 (ftree_port_group_t *) malloc(sizeof(ftree_port_group_t)); 399 if (p_group == NULL) 400 return NULL; 401 memset(p_group, 0, sizeof(ftree_port_group_t)); 402 403 p_group->base_lid = base_lid; 404 p_group->remote_base_lid = remote_base_lid; 405 memcpy(&p_group->port_guid, &port_guid, sizeof(ib_net64_t)); 406 memcpy(&p_group->node_guid, &node_guid, sizeof(ib_net64_t)); 407 memcpy(&p_group->remote_port_guid, &remote_port_guid, 408 sizeof(ib_net64_t)); 409 memcpy(&p_group->remote_node_guid, &remote_node_guid, 410 sizeof(ib_net64_t)); 411 412 p_group->node_type = node_type; 413 switch (node_type) { 414 case IB_NODE_TYPE_CA: 415 p_group->hca_or_sw.p_hca = (ftree_hca_t *) p_hca_or_sw; 416 break; 417 case IB_NODE_TYPE_SWITCH: 418 p_group->hca_or_sw.p_sw = (ftree_sw_t *) p_hca_or_sw; 419 break; 420 default: 421 /* we shouldn't get here - port is created only in hca or switch */ 422 CL_ASSERT(0); 423 } 424 425 p_group->remote_node_type = remote_node_type; 426 switch (remote_node_type) { 427 case IB_NODE_TYPE_CA: 428 p_group->remote_hca_or_sw.p_hca = 429 (ftree_hca_t *) p_remote_hca_or_sw; 430 break; 431 case IB_NODE_TYPE_SWITCH: 432 p_group->remote_hca_or_sw.p_sw = 433 (ftree_sw_t *) p_remote_hca_or_sw; 434 break; 435 default: 436 /* we shouldn't get here - port is created only in hca or switch */ 437 CL_ASSERT(0); 438 } 439 440 cl_ptr_vector_init(&p_group->ports, 0, /* min size */ 441 8); /* grow size */ 442 p_group->is_cn = is_cn; 443 return p_group; 444} /* __osm_ftree_port_group_create() */ 445 446/***************************************************/ 447 448static void __osm_ftree_port_group_destroy(IN ftree_port_group_t * p_group) 449{ 450 uint32_t i; 451 uint32_t size; 452 ftree_port_t *p_port; 453 454 if (!p_group) 455 return; 456 457 /* remove all the elements of p_group->ports vector */ 458 size = cl_ptr_vector_get_size(&p_group->ports); 459 for (i = 0; i < size; i++) { 460 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port); 461 __osm_ftree_port_destroy(p_port); 462 } 463 cl_ptr_vector_destroy(&p_group->ports); 464 free(p_group); 465} /* __osm_ftree_port_group_destroy() */ 466 467/***************************************************/ 468 469static void 470__osm_ftree_port_group_dump(IN ftree_fabric_t * p_ftree, 471 IN ftree_port_group_t * p_group, 472 IN ftree_direction_t direction) 473{ 474 ftree_port_t *p_port; 475 uint32_t size; 476 uint32_t i; 477 char buff[10 * 1024]; 478 479 if (!p_group) 480 return; 481 482 if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG)) 483 return; 484 485 size = cl_ptr_vector_get_size(&p_group->ports); 486 buff[0] = '\0'; 487 488 for (i = 0; i < size; i++) { 489 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port); 490 CL_ASSERT(p_port); 491 492 if (i != 0) 493 strcat(buff, ", "); 494 sprintf(buff + strlen(buff), "%u", p_port->port_num); 495 } 496 497 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 498 " Port Group of size %u, port(s): %s, direction: %s\n" 499 " Local <--> Remote GUID (LID):" 500 "0x%016" PRIx64 " (0x%04x) <--> 0x%016" PRIx64 " (0x%04x)\n", 501 size, 502 buff, 503 (direction == FTREE_DIRECTION_DOWN) ? "DOWN" : "UP", 504 cl_ntoh64(p_group->port_guid), 505 cl_ntoh16(p_group->base_lid), 506 cl_ntoh64(p_group->remote_port_guid), 507 cl_ntoh16(p_group->remote_base_lid)); 508 509} /* __osm_ftree_port_group_dump() */ 510 511/***************************************************/ 512 513static void 514__osm_ftree_port_group_add_port(IN ftree_port_group_t * p_group, 515 IN uint8_t port_num, IN uint8_t remote_port_num) 516{ 517 uint16_t i; 518 ftree_port_t *p_port; 519 520 for (i = 0; i < cl_ptr_vector_get_size(&p_group->ports); i++) { 521 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port); 522 if (p_port->port_num == port_num) 523 return; 524 } 525 526 p_port = __osm_ftree_port_create(port_num, remote_port_num); 527 cl_ptr_vector_insert(&p_group->ports, p_port, NULL); 528} 529 530/*************************************************** 531 ** 532 ** ftree_sw_t functions 533 ** 534 ***************************************************/ 535 536static ftree_sw_t *__osm_ftree_sw_create(IN ftree_fabric_t * p_ftree, 537 IN osm_switch_t * p_osm_sw) 538{ 539 ftree_sw_t *p_sw; 540 uint8_t ports_num; 541 542 /* make sure that the switch has ports */ 543 if (p_osm_sw->num_ports == 1) 544 return NULL; 545 546 p_sw = (ftree_sw_t *) malloc(sizeof(ftree_sw_t)); 547 if (p_sw == NULL) 548 return NULL; 549 memset(p_sw, 0, sizeof(ftree_sw_t)); 550 551 p_sw->p_osm_sw = p_osm_sw; 552 p_sw->rank = 0xFFFFFFFF; 553 __osm_ftree_tuple_init(p_sw->tuple); 554 555 p_sw->base_lid = osm_node_get_base_lid(p_sw->p_osm_sw->p_node, 0); 556 557 ports_num = osm_node_get_num_physp(p_sw->p_osm_sw->p_node); 558 p_sw->down_port_groups = 559 (ftree_port_group_t **) malloc(ports_num * 560 sizeof(ftree_port_group_t *)); 561 p_sw->up_port_groups = 562 (ftree_port_group_t **) malloc(ports_num * 563 sizeof(ftree_port_group_t *)); 564 if (!p_sw->down_port_groups || !p_sw->up_port_groups) 565 return NULL; 566 p_sw->down_port_groups_num = 0; 567 p_sw->up_port_groups_num = 0; 568 569 /* initialize lft buffer */ 570 memset(p_osm_sw->new_lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1); 571 572 p_sw->down_port_groups_idx = -1; 573 574 return p_sw; 575} /* __osm_ftree_sw_create() */ 576 577/***************************************************/ 578 579static void __osm_ftree_sw_destroy(IN ftree_fabric_t * p_ftree, 580 IN ftree_sw_t * p_sw) 581{ 582 uint8_t i; 583 584 if (!p_sw) 585 return; 586 587 for (i = 0; i < p_sw->down_port_groups_num; i++) 588 __osm_ftree_port_group_destroy(p_sw->down_port_groups[i]); 589 for (i = 0; i < p_sw->up_port_groups_num; i++) 590 __osm_ftree_port_group_destroy(p_sw->up_port_groups[i]); 591 if (p_sw->down_port_groups) 592 free(p_sw->down_port_groups); 593 if (p_sw->up_port_groups) 594 free(p_sw->up_port_groups); 595 596 free(p_sw); 597} /* __osm_ftree_sw_destroy() */ 598 599/***************************************************/ 600 601static uint64_t __osm_ftree_sw_get_guid_no(IN ftree_sw_t * p_sw) 602{ 603 if (!p_sw) 604 return 0; 605 return osm_node_get_node_guid(p_sw->p_osm_sw->p_node); 606} 607 608/***************************************************/ 609 610static uint64_t __osm_ftree_sw_get_guid_ho(IN ftree_sw_t * p_sw) 611{ 612 return cl_ntoh64(__osm_ftree_sw_get_guid_no(p_sw)); 613} 614 615/***************************************************/ 616 617static void __osm_ftree_sw_dump(IN ftree_fabric_t * p_ftree, 618 IN ftree_sw_t * p_sw) 619{ 620 uint32_t i; 621 622 if (!p_sw) 623 return; 624 625 if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG)) 626 return; 627 628 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 629 "Switch index: %s, GUID: 0x%016" PRIx64 630 ", Ports: %u DOWN, %u UP\n", 631 __osm_ftree_tuple_to_str(p_sw->tuple), 632 __osm_ftree_sw_get_guid_ho(p_sw), p_sw->down_port_groups_num, 633 p_sw->up_port_groups_num); 634 635 for (i = 0; i < p_sw->down_port_groups_num; i++) 636 __osm_ftree_port_group_dump(p_ftree, 637 p_sw->down_port_groups[i], 638 FTREE_DIRECTION_DOWN); 639 for (i = 0; i < p_sw->up_port_groups_num; i++) 640 __osm_ftree_port_group_dump(p_ftree, p_sw->up_port_groups[i], 641 FTREE_DIRECTION_UP); 642 643} /* __osm_ftree_sw_dump() */ 644 645/***************************************************/ 646 647static boolean_t __osm_ftree_sw_ranked(IN ftree_sw_t * p_sw) 648{ 649 return (p_sw->rank != 0xFFFFFFFF); 650} 651 652/***************************************************/ 653 654static ftree_port_group_t * 655__osm_ftree_sw_get_port_group_by_remote_lid(IN ftree_sw_t * p_sw, 656 IN ib_net16_t remote_base_lid, 657 IN ftree_direction_t direction) 658{ 659 uint32_t i; 660 uint32_t size; 661 ftree_port_group_t **port_groups; 662 663 if (direction == FTREE_DIRECTION_UP) { 664 port_groups = p_sw->up_port_groups; 665 size = p_sw->up_port_groups_num; 666 } else { 667 port_groups = p_sw->down_port_groups; 668 size = p_sw->down_port_groups_num; 669 } 670 671 for (i = 0; i < size; i++) 672 if (remote_base_lid == port_groups[i]->remote_base_lid) 673 return port_groups[i]; 674 675 return NULL; 676} /* __osm_ftree_sw_get_port_group_by_remote_lid() */ 677 678/***************************************************/ 679 680static void 681__osm_ftree_sw_add_port(IN ftree_sw_t * p_sw, 682 IN uint8_t port_num, 683 IN uint8_t remote_port_num, 684 IN ib_net16_t base_lid, 685 IN ib_net16_t remote_base_lid, 686 IN ib_net64_t port_guid, 687 IN ib_net64_t remote_port_guid, 688 IN ib_net64_t remote_node_guid, 689 IN uint8_t remote_node_type, 690 IN void *p_remote_hca_or_sw, 691 IN ftree_direction_t direction) 692{ 693 ftree_port_group_t *p_group = 694 __osm_ftree_sw_get_port_group_by_remote_lid(p_sw, remote_base_lid, 695 direction); 696 697 if (!p_group) { 698 p_group = __osm_ftree_port_group_create(base_lid, 699 remote_base_lid, 700 port_guid, 701 __osm_ftree_sw_get_guid_no 702 (p_sw), 703 IB_NODE_TYPE_SWITCH, 704 p_sw, remote_port_guid, 705 remote_node_guid, 706 remote_node_type, 707 p_remote_hca_or_sw, 708 FALSE); 709 CL_ASSERT(p_group); 710 711 if (direction == FTREE_DIRECTION_UP) 712 p_sw->up_port_groups[p_sw->up_port_groups_num++] = 713 p_group; 714 else 715 p_sw->down_port_groups[p_sw->down_port_groups_num++] = 716 p_group; 717 } 718 __osm_ftree_port_group_add_port(p_group, port_num, remote_port_num); 719 720} /* __osm_ftree_sw_add_port() */ 721 722/***************************************************/ 723 724static inline cl_status_t 725__osm_ftree_sw_set_hops(IN ftree_sw_t * p_sw, 726 IN uint16_t lid_ho, IN uint8_t port_num, 727 IN uint8_t hops) 728{ 729 /* set local min hop table(LID) */ 730 return osm_switch_set_hops(p_sw->p_osm_sw, lid_ho, port_num, hops); 731} 732 733/*************************************************** 734 ** 735 ** ftree_hca_t functions 736 ** 737 ***************************************************/ 738 739static ftree_hca_t *__osm_ftree_hca_create(IN osm_node_t * p_osm_node) 740{ 741 ftree_hca_t *p_hca = (ftree_hca_t *) malloc(sizeof(ftree_hca_t)); 742 if (p_hca == NULL) 743 return NULL; 744 memset(p_hca, 0, sizeof(ftree_hca_t)); 745 746 p_hca->p_osm_node = p_osm_node; 747 p_hca->up_port_groups = (ftree_port_group_t **) 748 malloc(osm_node_get_num_physp(p_hca->p_osm_node) * 749 sizeof(ftree_port_group_t *)); 750 if (!p_hca->up_port_groups) 751 return NULL; 752 p_hca->up_port_groups_num = 0; 753 return p_hca; 754} 755 756/***************************************************/ 757 758static void __osm_ftree_hca_destroy(IN ftree_hca_t * p_hca) 759{ 760 uint32_t i; 761 762 if (!p_hca) 763 return; 764 765 for (i = 0; i < p_hca->up_port_groups_num; i++) 766 __osm_ftree_port_group_destroy(p_hca->up_port_groups[i]); 767 768 if (p_hca->up_port_groups) 769 free(p_hca->up_port_groups); 770 771 free(p_hca); 772} 773 774/***************************************************/ 775 776static uint64_t __osm_ftree_hca_get_guid_no(IN ftree_hca_t * p_hca) 777{ 778 if (!p_hca) 779 return 0; 780 return osm_node_get_node_guid(p_hca->p_osm_node); 781} 782 783/***************************************************/ 784 785static uint64_t __osm_ftree_hca_get_guid_ho(IN ftree_hca_t * p_hca) 786{ 787 return cl_ntoh64(__osm_ftree_hca_get_guid_no(p_hca)); 788} 789 790/***************************************************/ 791 792static void __osm_ftree_hca_dump(IN ftree_fabric_t * p_ftree, 793 IN ftree_hca_t * p_hca) 794{ 795 uint32_t i; 796 797 if (!p_hca) 798 return; 799 800 if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG)) 801 return; 802 803 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 804 "CA GUID: 0x%016" PRIx64 ", Ports: %u UP\n", 805 __osm_ftree_hca_get_guid_ho(p_hca), p_hca->up_port_groups_num); 806 807 for (i = 0; i < p_hca->up_port_groups_num; i++) 808 __osm_ftree_port_group_dump(p_ftree, p_hca->up_port_groups[i], 809 FTREE_DIRECTION_UP); 810} 811 812/***************************************************/ 813 814static ftree_port_group_t * 815__osm_ftree_hca_get_port_group_by_remote_lid(IN ftree_hca_t * p_hca, 816 IN ib_net16_t remote_base_lid) 817{ 818 uint32_t i; 819 for (i = 0; i < p_hca->up_port_groups_num; i++) 820 if (remote_base_lid == 821 p_hca->up_port_groups[i]->remote_base_lid) 822 return p_hca->up_port_groups[i]; 823 824 return NULL; 825} 826 827/***************************************************/ 828 829static void 830__osm_ftree_hca_add_port(IN ftree_hca_t * p_hca, 831 IN uint8_t port_num, 832 IN uint8_t remote_port_num, 833 IN ib_net16_t base_lid, 834 IN ib_net16_t remote_base_lid, 835 IN ib_net64_t port_guid, 836 IN ib_net64_t remote_port_guid, 837 IN ib_net64_t remote_node_guid, 838 IN uint8_t remote_node_type, 839 IN void *p_remote_hca_or_sw, IN boolean_t is_cn) 840{ 841 ftree_port_group_t *p_group; 842 843 /* this function is supposed to be called only for adding ports 844 in hca's that lead to switches */ 845 CL_ASSERT(remote_node_type == IB_NODE_TYPE_SWITCH); 846 847 p_group = 848 __osm_ftree_hca_get_port_group_by_remote_lid(p_hca, 849 remote_base_lid); 850 851 if (!p_group) { 852 p_group = __osm_ftree_port_group_create(base_lid, 853 remote_base_lid, 854 port_guid, 855 __osm_ftree_hca_get_guid_no 856 (p_hca), 857 IB_NODE_TYPE_CA, p_hca, 858 remote_port_guid, 859 remote_node_guid, 860 remote_node_type, 861 p_remote_hca_or_sw, 862 is_cn); 863 p_hca->up_port_groups[p_hca->up_port_groups_num++] = p_group; 864 } 865 __osm_ftree_port_group_add_port(p_group, port_num, remote_port_num); 866 867} /* __osm_ftree_hca_add_port() */ 868 869/*************************************************** 870 ** 871 ** ftree_fabric_t functions 872 ** 873 ***************************************************/ 874 875static ftree_fabric_t *__osm_ftree_fabric_create() 876{ 877 ftree_fabric_t *p_ftree = 878 (ftree_fabric_t *) malloc(sizeof(ftree_fabric_t)); 879 if (p_ftree == NULL) 880 return NULL; 881 882 memset(p_ftree, 0, sizeof(ftree_fabric_t)); 883 884 cl_qmap_init(&p_ftree->hca_tbl); 885 cl_qmap_init(&p_ftree->sw_tbl); 886 cl_qmap_init(&p_ftree->sw_by_tuple_tbl); 887 cl_qmap_init(&p_ftree->cn_guid_tbl); 888 889 cl_qlist_init(&p_ftree->root_guid_list); 890 891 return p_ftree; 892} 893 894/***************************************************/ 895 896static void __osm_ftree_fabric_clear(ftree_fabric_t * p_ftree) 897{ 898 ftree_hca_t *p_hca; 899 ftree_hca_t *p_next_hca; 900 ftree_sw_t *p_sw; 901 ftree_sw_t *p_next_sw; 902 ftree_sw_tbl_element_t *p_element; 903 ftree_sw_tbl_element_t *p_next_element; 904 name_map_item_t *p_guid_element, *p_next_guid_element; 905 906 if (!p_ftree) 907 return; 908 909 /* remove all the elements of hca_tbl */ 910 911 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 912 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) { 913 p_hca = p_next_hca; 914 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item); 915 __osm_ftree_hca_destroy(p_hca); 916 } 917 cl_qmap_remove_all(&p_ftree->hca_tbl); 918 919 /* remove all the elements of sw_tbl */ 920 921 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 922 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 923 p_sw = p_next_sw; 924 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 925 __osm_ftree_sw_destroy(p_ftree, p_sw); 926 } 927 cl_qmap_remove_all(&p_ftree->sw_tbl); 928 929 /* remove all the elements of sw_by_tuple_tbl */ 930 931 p_next_element = 932 (ftree_sw_tbl_element_t *) cl_qmap_head(&p_ftree->sw_by_tuple_tbl); 933 while (p_next_element != 934 (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree-> 935 sw_by_tuple_tbl)) { 936 p_element = p_next_element; 937 p_next_element = 938 (ftree_sw_tbl_element_t *) cl_qmap_next(&p_element-> 939 map_item); 940 __osm_ftree_sw_tbl_element_destroy(p_element); 941 } 942 cl_qmap_remove_all(&p_ftree->sw_by_tuple_tbl); 943 944 /* remove all the elements of cn_guid_tbl */ 945 p_next_guid_element = 946 (name_map_item_t *) cl_qmap_head(&p_ftree->cn_guid_tbl); 947 while (p_next_guid_element != 948 (name_map_item_t *) cl_qmap_end(&p_ftree->cn_guid_tbl)) { 949 p_guid_element = p_next_guid_element; 950 p_next_guid_element = 951 (name_map_item_t *) cl_qmap_next(&p_guid_element->item); 952 free(p_guid_element); 953 } 954 cl_qmap_remove_all(&p_ftree->cn_guid_tbl); 955 956 /* remove all the elements of root_guid_list */ 957 while (!cl_is_qlist_empty(&p_ftree->root_guid_list)) 958 free(cl_qlist_remove_head(&p_ftree->root_guid_list)); 959 960 /* free the leaf switches array */ 961 if ((p_ftree->leaf_switches_num > 0) && (p_ftree->leaf_switches)) 962 free(p_ftree->leaf_switches); 963 964 p_ftree->leaf_switches_num = 0; 965 p_ftree->cn_num = 0; 966 p_ftree->leaf_switch_rank = 0; 967 p_ftree->max_switch_rank = 0; 968 p_ftree->max_cn_per_leaf = 0; 969 p_ftree->lft_max_lid_ho = 0; 970 p_ftree->leaf_switches = NULL; 971 p_ftree->fabric_built = FALSE; 972 973} /* __osm_ftree_fabric_destroy() */ 974 975/***************************************************/ 976 977static void __osm_ftree_fabric_destroy(ftree_fabric_t * p_ftree) 978{ 979 if (!p_ftree) 980 return; 981 __osm_ftree_fabric_clear(p_ftree); 982 free(p_ftree); 983} 984 985/***************************************************/ 986 987static uint8_t __osm_ftree_fabric_get_rank(ftree_fabric_t * p_ftree) 988{ 989 return p_ftree->leaf_switch_rank + 1; 990} 991 992/***************************************************/ 993 994static void __osm_ftree_fabric_add_hca(ftree_fabric_t * p_ftree, 995 osm_node_t * p_osm_node) 996{ 997 ftree_hca_t *p_hca = __osm_ftree_hca_create(p_osm_node); 998 999 CL_ASSERT(osm_node_get_type(p_osm_node) == IB_NODE_TYPE_CA); 1000 1001 cl_qmap_insert(&p_ftree->hca_tbl, p_osm_node->node_info.node_guid, 1002 &p_hca->map_item); 1003} 1004 1005/***************************************************/ 1006 1007static void __osm_ftree_fabric_add_sw(ftree_fabric_t * p_ftree, 1008 osm_switch_t * p_osm_sw) 1009{ 1010 ftree_sw_t *p_sw = __osm_ftree_sw_create(p_ftree, p_osm_sw); 1011 1012 CL_ASSERT(osm_node_get_type(p_osm_sw->p_node) == IB_NODE_TYPE_SWITCH); 1013 1014 cl_qmap_insert(&p_ftree->sw_tbl, p_osm_sw->p_node->node_info.node_guid, 1015 &p_sw->map_item); 1016 1017 /* track the max lid (in host order) that exists in the fabric */ 1018 if (cl_ntoh16(p_sw->base_lid) > p_ftree->lft_max_lid_ho) 1019 p_ftree->lft_max_lid_ho = cl_ntoh16(p_sw->base_lid); 1020} 1021 1022/***************************************************/ 1023 1024static void __osm_ftree_fabric_add_sw_by_tuple(IN ftree_fabric_t * p_ftree, 1025 IN ftree_sw_t * p_sw) 1026{ 1027 CL_ASSERT(__osm_ftree_tuple_assigned(p_sw->tuple)); 1028 1029 cl_qmap_insert(&p_ftree->sw_by_tuple_tbl, 1030 __osm_ftree_tuple_to_key(p_sw->tuple), 1031 &__osm_ftree_sw_tbl_element_create(p_sw)->map_item); 1032} 1033 1034/***************************************************/ 1035 1036static ftree_sw_t *__osm_ftree_fabric_get_sw_by_tuple(IN ftree_fabric_t * 1037 p_ftree, 1038 IN ftree_tuple_t tuple) 1039{ 1040 ftree_sw_tbl_element_t *p_element; 1041 1042 CL_ASSERT(__osm_ftree_tuple_assigned(tuple)); 1043 1044 __osm_ftree_tuple_to_key(tuple); 1045 1046 p_element = 1047 (ftree_sw_tbl_element_t *) cl_qmap_get(&p_ftree->sw_by_tuple_tbl, 1048 __osm_ftree_tuple_to_key 1049 (tuple)); 1050 if (p_element == 1051 (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree->sw_by_tuple_tbl)) 1052 return NULL; 1053 1054 return p_element->p_sw; 1055} 1056 1057/***************************************************/ 1058 1059static ftree_sw_t *__osm_ftree_fabric_get_sw_by_guid(IN ftree_fabric_t * 1060 p_ftree, IN uint64_t guid) 1061{ 1062 ftree_sw_t *p_sw; 1063 p_sw = (ftree_sw_t *) cl_qmap_get(&p_ftree->sw_tbl, guid); 1064 if (p_sw == (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) 1065 return NULL; 1066 return p_sw; 1067} 1068 1069/***************************************************/ 1070 1071static ftree_hca_t *__osm_ftree_fabric_get_hca_by_guid(IN ftree_fabric_t * 1072 p_ftree, 1073 IN uint64_t guid) 1074{ 1075 ftree_hca_t *p_hca; 1076 p_hca = (ftree_hca_t *) cl_qmap_get(&p_ftree->hca_tbl, guid); 1077 if (p_hca == (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) 1078 return NULL; 1079 return p_hca; 1080} 1081 1082/***************************************************/ 1083 1084static void __osm_ftree_fabric_dump(ftree_fabric_t * p_ftree) 1085{ 1086 uint32_t i; 1087 ftree_hca_t *p_hca; 1088 ftree_sw_t *p_sw; 1089 1090 if (!osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG)) 1091 return; 1092 1093 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n" 1094 " |-------------------------------|\n" 1095 " |- Full fabric topology dump -|\n" 1096 " |-------------------------------|\n\n"); 1097 1098 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "-- CAs:\n"); 1099 1100 for (p_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 1101 p_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl); 1102 p_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item)) { 1103 __osm_ftree_hca_dump(p_ftree, p_hca); 1104 } 1105 1106 for (i = 0; i < p_ftree->max_switch_rank; i++) { 1107 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 1108 "-- Rank %u switches\n", i); 1109 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 1110 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl); 1111 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) { 1112 if (p_sw->rank == i) 1113 __osm_ftree_sw_dump(p_ftree, p_sw); 1114 } 1115 } 1116 1117 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n" 1118 " |---------------------------------------|\n" 1119 " |- Full fabric topology dump completed -|\n" 1120 " |---------------------------------------|\n\n"); 1121} /* __osm_ftree_fabric_dump() */ 1122 1123/***************************************************/ 1124 1125static void __osm_ftree_fabric_dump_general_info(IN ftree_fabric_t * p_ftree) 1126{ 1127 uint32_t i, j; 1128 ftree_sw_t *p_sw; 1129 1130 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1131 "General fabric topology info\n"); 1132 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1133 "============================\n"); 1134 1135 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1136 " - FatTree rank (roots to leaf switches): %u\n", 1137 p_ftree->leaf_switch_rank + 1); 1138 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1139 " - FatTree max switch rank: %u\n", p_ftree->max_switch_rank); 1140 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1141 " - Fabric has %u CAs (%u of them CNs), %u switches\n", 1142 cl_qmap_count(&p_ftree->hca_tbl), p_ftree->cn_num, 1143 cl_qmap_count(&p_ftree->sw_tbl)); 1144 1145 CL_ASSERT(cl_qmap_count(&p_ftree->hca_tbl) >= p_ftree->cn_num); 1146 1147 for (i = 0; i <= p_ftree->max_switch_rank; i++) { 1148 j = 0; 1149 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 1150 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl); 1151 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) { 1152 if (p_sw->rank == i) 1153 j++; 1154 } 1155 if (i == 0) 1156 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1157 " - Fabric has %u switches at rank %u (roots)\n", 1158 j, i); 1159 else if (i == p_ftree->leaf_switch_rank) 1160 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1161 " - Fabric has %u switches at rank %u (%u of them leafs)\n", 1162 j, i, p_ftree->leaf_switches_num); 1163 else 1164 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1165 " - Fabric has %u switches at rank %u\n", j, 1166 i); 1167 } 1168 1169 if (osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_VERBOSE)) { 1170 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1171 " - Root switches:\n"); 1172 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 1173 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl); 1174 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) { 1175 if (p_sw->rank == 0) 1176 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1177 " GUID: 0x%016" PRIx64 1178 ", LID: %u, Index %s\n", 1179 __osm_ftree_sw_get_guid_ho(p_sw), 1180 cl_ntoh16(p_sw->base_lid), 1181 __osm_ftree_tuple_to_str(p_sw->tuple)); 1182 } 1183 1184 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1185 " - Leaf switches (sorted by index):\n"); 1186 for (i = 0; i < p_ftree->leaf_switches_num; i++) { 1187 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1188 " GUID: 0x%016" PRIx64 1189 ", LID: %u, Index %s\n", 1190 __osm_ftree_sw_get_guid_ho(p_ftree-> 1191 leaf_switches[i]), 1192 cl_ntoh16(p_ftree->leaf_switches[i]->base_lid), 1193 __osm_ftree_tuple_to_str(p_ftree-> 1194 leaf_switches[i]-> 1195 tuple)); 1196 } 1197 } 1198} /* __osm_ftree_fabric_dump_general_info() */ 1199 1200/***************************************************/ 1201 1202static void __osm_ftree_fabric_dump_hca_ordering(IN ftree_fabric_t * p_ftree) 1203{ 1204 ftree_hca_t *p_hca; 1205 ftree_sw_t *p_sw; 1206 ftree_port_group_t *p_group_on_sw; 1207 ftree_port_group_t *p_group_on_hca; 1208 uint32_t i; 1209 uint32_t j; 1210 unsigned printed_hcas_on_leaf; 1211 1212 char path[1024]; 1213 FILE *p_hca_ordering_file; 1214 char *filename = "opensm-ftree-ca-order.dump"; 1215 1216 snprintf(path, sizeof(path), "%s/%s", 1217 p_ftree->p_osm->subn.opt.dump_files_dir, filename); 1218 p_hca_ordering_file = fopen(path, "w"); 1219 if (!p_hca_ordering_file) { 1220 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB01: " 1221 "cannot open file \'%s\': %s\n", filename, 1222 strerror(errno)); 1223 return; 1224 } 1225 1226 /* for each leaf switch (in indexing order) */ 1227 for (i = 0; i < p_ftree->leaf_switches_num; i++) { 1228 p_sw = p_ftree->leaf_switches[i]; 1229 printed_hcas_on_leaf = 0; 1230 1231 /* for each real CA (CNs and not) connected to this switch */ 1232 for (j = 0; j < p_sw->down_port_groups_num; j++) { 1233 p_group_on_sw = p_sw->down_port_groups[j]; 1234 1235 if (p_group_on_sw->remote_node_type != IB_NODE_TYPE_CA) 1236 continue; 1237 1238 p_hca = p_group_on_sw->remote_hca_or_sw.p_hca; 1239 p_group_on_hca = 1240 __osm_ftree_hca_get_port_group_by_remote_lid(p_hca, 1241 p_group_on_sw-> 1242 base_lid); 1243 1244 /* treat non-compute nodes as dummies */ 1245 if (!p_group_on_hca->is_cn) 1246 continue; 1247 1248 fprintf(p_hca_ordering_file, "0x%04x\t%s\n", 1249 cl_ntoh16(p_group_on_hca->base_lid), 1250 p_hca->p_osm_node->print_desc); 1251 1252 printed_hcas_on_leaf++; 1253 } 1254 1255 /* now print missing HCAs */ 1256 for (j = 0; 1257 j < (p_ftree->max_cn_per_leaf - printed_hcas_on_leaf); j++) 1258 fprintf(p_hca_ordering_file, "0xFFFF\tDUMMY\n"); 1259 1260 } 1261 /* done going through all the leaf switches */ 1262 1263 fclose(p_hca_ordering_file); 1264} /* __osm_ftree_fabric_dump_hca_ordering() */ 1265 1266/***************************************************/ 1267 1268static void 1269__osm_ftree_fabric_assign_tuple(IN ftree_fabric_t * p_ftree, 1270 IN ftree_sw_t * p_sw, 1271 IN ftree_tuple_t new_tuple) 1272{ 1273 memcpy(p_sw->tuple, new_tuple, FTREE_TUPLE_LEN); 1274 __osm_ftree_fabric_add_sw_by_tuple(p_ftree, p_sw); 1275} 1276 1277/***************************************************/ 1278 1279static void __osm_ftree_fabric_assign_first_tuple(IN ftree_fabric_t * p_ftree, 1280 IN ftree_sw_t * p_sw) 1281{ 1282 uint8_t i; 1283 ftree_tuple_t new_tuple; 1284 1285 __osm_ftree_tuple_init(new_tuple); 1286 new_tuple[0] = (uint8_t) p_sw->rank; 1287 for (i = 1; i <= p_sw->rank; i++) 1288 new_tuple[i] = 0; 1289 1290 __osm_ftree_fabric_assign_tuple(p_ftree, p_sw, new_tuple); 1291} 1292 1293/***************************************************/ 1294 1295static void 1296__osm_ftree_fabric_get_new_tuple(IN ftree_fabric_t * p_ftree, 1297 OUT ftree_tuple_t new_tuple, 1298 IN ftree_tuple_t from_tuple, 1299 IN ftree_direction_t direction) 1300{ 1301 ftree_sw_t *p_sw; 1302 ftree_tuple_t temp_tuple; 1303 uint8_t var_index; 1304 uint8_t i; 1305 1306 __osm_ftree_tuple_init(new_tuple); 1307 memcpy(temp_tuple, from_tuple, FTREE_TUPLE_LEN); 1308 1309 if (direction == FTREE_DIRECTION_DOWN) { 1310 temp_tuple[0]++; 1311 var_index = from_tuple[0] + 1; 1312 } else { 1313 temp_tuple[0]--; 1314 var_index = from_tuple[0]; 1315 } 1316 1317 for (i = 0; i < 0xFF; i++) { 1318 temp_tuple[var_index] = i; 1319 p_sw = __osm_ftree_fabric_get_sw_by_tuple(p_ftree, temp_tuple); 1320 if (p_sw == NULL) /* found free tuple */ 1321 break; 1322 } 1323 1324 if (i == 0xFF) { 1325 /* new tuple not found - there are more than 255 ports in one direction */ 1326 return; 1327 } 1328 memcpy(new_tuple, temp_tuple, FTREE_TUPLE_LEN); 1329 1330} /* __osm_ftree_fabric_get_new_tuple() */ 1331 1332/***************************************************/ 1333 1334static inline boolean_t __osm_ftree_fabric_roots_provided(IN ftree_fabric_t * 1335 p_ftree) 1336{ 1337 return (p_ftree->p_osm->subn.opt.root_guid_file != NULL); 1338} 1339 1340/***************************************************/ 1341 1342static inline boolean_t __osm_ftree_fabric_cns_provided(IN ftree_fabric_t * 1343 p_ftree) 1344{ 1345 return (p_ftree->p_osm->subn.opt.cn_guid_file != NULL); 1346} 1347 1348/***************************************************/ 1349 1350static int __osm_ftree_fabric_mark_leaf_switches(IN ftree_fabric_t * p_ftree) 1351{ 1352 ftree_sw_t *p_sw; 1353 ftree_hca_t *p_hca; 1354 ftree_hca_t *p_next_hca; 1355 unsigned i; 1356 int res = 0; 1357 1358 OSM_LOG_ENTER(&p_ftree->p_osm->log); 1359 1360 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1361 "Marking leaf switches in fabric\n"); 1362 1363 /* Scan all the CAs, if they have CNs - find CN port and mark switch 1364 that is connected to this port as leaf switch. 1365 Also, ensure that this marked leaf has rank of p_ftree->leaf_switch_rank. */ 1366 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 1367 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) { 1368 p_hca = p_next_hca; 1369 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item); 1370 if (!p_hca->cn_num) 1371 continue; 1372 1373 for (i = 0; i < p_hca->up_port_groups_num; i++) { 1374 if (!p_hca->up_port_groups[i]->is_cn) 1375 continue; 1376 1377 /* In CAs, port group alway has one port, and since this 1378 port group is CN, we know that this port is compute node */ 1379 CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type == 1380 IB_NODE_TYPE_SWITCH); 1381 p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw; 1382 1383 /* check if this switch was already processed */ 1384 if (p_sw->is_leaf) 1385 continue; 1386 p_sw->is_leaf = TRUE; 1387 1388 /* ensure that this leaf switch is at the correct tree level */ 1389 if (p_sw->rank != p_ftree->leaf_switch_rank) { 1390 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 1391 "ERR AB26: CN port 0x%" PRIx64 1392 " is connected to switch 0x%" PRIx64 1393 " with rank %u, " 1394 "while FatTree leaf rank is %u\n", 1395 cl_ntoh64(p_hca->up_port_groups[i]-> 1396 port_guid), 1397 __osm_ftree_sw_get_guid_ho(p_sw), 1398 p_sw->rank, p_ftree->leaf_switch_rank); 1399 res = -1; 1400 goto Exit; 1401 1402 } 1403 } 1404 } 1405 1406Exit: 1407 OSM_LOG_EXIT(&p_ftree->p_osm->log); 1408 return res; 1409} /* __osm_ftree_fabric_mark_leaf_switches() */ 1410 1411/***************************************************/ 1412 1413static void __osm_ftree_fabric_make_indexing(IN ftree_fabric_t * p_ftree) 1414{ 1415 ftree_sw_t *p_remote_sw; 1416 ftree_sw_t *p_sw = NULL; 1417 ftree_sw_t *p_next_sw; 1418 ftree_tuple_t new_tuple; 1419 uint32_t i; 1420 cl_list_t bfs_list; 1421 ftree_sw_tbl_element_t *p_sw_tbl_element; 1422 1423 OSM_LOG_ENTER(&p_ftree->p_osm->log); 1424 1425 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1426 "Starting FatTree indexing\n"); 1427 1428 /* using the first leaf switch as a starting point for indexing algorithm. */ 1429 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 1430 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 1431 p_sw = p_next_sw; 1432 if (p_sw->is_leaf) 1433 break; 1434 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 1435 } 1436 1437 CL_ASSERT(p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)); 1438 1439 /* Assign the first tuple to the switch that is used as BFS starting point. 1440 The tuple will be as follows: [rank].0.0.0... 1441 This fuction also adds the switch it into the switch_by_tuple table. */ 1442 __osm_ftree_fabric_assign_first_tuple(p_ftree, p_sw); 1443 1444 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1445 "Indexing starting point:\n" 1446 " - Switch rank : %u\n" 1447 " - Switch index : %s\n" 1448 " - Node LID : %u\n" 1449 " - Node GUID : 0x%016" 1450 PRIx64 "\n", p_sw->rank, __osm_ftree_tuple_to_str(p_sw->tuple), 1451 cl_ntoh16(p_sw->base_lid), __osm_ftree_sw_get_guid_ho(p_sw)); 1452 1453 /* 1454 * Now run BFS and assign indexes to all switches 1455 * Pseudo code of the algorithm is as follows: 1456 * 1457 * * Add first switch to BFS queue 1458 * * While (BFS queue not empty) 1459 * - Pop the switch from the head of the queue 1460 * - Scan all the downward and upward ports 1461 * - For each port 1462 * + Get the remote switch 1463 * + Assign index to the remote switch 1464 * + Add remote switch to the BFS queue 1465 */ 1466 1467 cl_list_init(&bfs_list, cl_qmap_count(&p_ftree->sw_tbl)); 1468 cl_list_insert_tail(&bfs_list, 1469 &__osm_ftree_sw_tbl_element_create(p_sw)->map_item); 1470 1471 while (!cl_is_list_empty(&bfs_list)) { 1472 p_sw_tbl_element = 1473 (ftree_sw_tbl_element_t *) cl_list_remove_head(&bfs_list); 1474 p_sw = p_sw_tbl_element->p_sw; 1475 __osm_ftree_sw_tbl_element_destroy(p_sw_tbl_element); 1476 1477 /* Discover all the nodes from ports that are pointing down */ 1478 1479 if (p_sw->rank >= p_ftree->leaf_switch_rank) { 1480 /* whether downward ports are pointing to CAs or switches, 1481 we don't assign indexes to switches that are located 1482 lower than leaf switches */ 1483 } else { 1484 /* This is not the leaf switch */ 1485 for (i = 0; i < p_sw->down_port_groups_num; i++) { 1486 /* Work with port groups that are pointing to switches only. 1487 No need to assign indexing to HCAs */ 1488 if (p_sw->down_port_groups[i]-> 1489 remote_node_type != IB_NODE_TYPE_SWITCH) 1490 continue; 1491 1492 p_remote_sw = 1493 p_sw->down_port_groups[i]->remote_hca_or_sw. 1494 p_sw; 1495 if (__osm_ftree_tuple_assigned 1496 (p_remote_sw->tuple)) { 1497 /* this switch has been already indexed */ 1498 continue; 1499 } 1500 /* allocate new tuple */ 1501 __osm_ftree_fabric_get_new_tuple(p_ftree, 1502 new_tuple, 1503 p_sw->tuple, 1504 FTREE_DIRECTION_DOWN); 1505 /* Assign the new tuple to the remote switch. 1506 This fuction also adds the switch into the switch_by_tuple table. */ 1507 __osm_ftree_fabric_assign_tuple(p_ftree, 1508 p_remote_sw, 1509 new_tuple); 1510 1511 /* add the newly discovered switch to the BFS queue */ 1512 cl_list_insert_tail(&bfs_list, 1513 &__osm_ftree_sw_tbl_element_create 1514 (p_remote_sw)->map_item); 1515 } 1516 /* Done assigning indexes to all the remote switches 1517 that are pointed by the downgoing ports. 1518 Now sort port groups according to remote index. */ 1519 qsort(p_sw->down_port_groups, /* array */ 1520 p_sw->down_port_groups_num, /* number of elements */ 1521 sizeof(ftree_port_group_t *), /* size of each element */ 1522 __osm_ftree_compare_port_groups_by_remote_switch_index); /* comparator */ 1523 } 1524 1525 /* Done indexing switches from ports that go down. 1526 Now do the same with ports that are pointing up. */ 1527 1528 if (p_sw->rank != 0) { 1529 /* This is not the root switch, which means that all the ports 1530 that are pointing up are taking us to another switches. */ 1531 for (i = 0; i < p_sw->up_port_groups_num; i++) { 1532 p_remote_sw = 1533 p_sw->up_port_groups[i]->remote_hca_or_sw. 1534 p_sw; 1535 if (__osm_ftree_tuple_assigned 1536 (p_remote_sw->tuple)) 1537 continue; 1538 /* allocate new tuple */ 1539 __osm_ftree_fabric_get_new_tuple(p_ftree, 1540 new_tuple, 1541 p_sw->tuple, 1542 FTREE_DIRECTION_UP); 1543 /* Assign the new tuple to the remote switch. 1544 This fuction also adds the switch to the 1545 switch_by_tuple table. */ 1546 __osm_ftree_fabric_assign_tuple(p_ftree, 1547 p_remote_sw, 1548 new_tuple); 1549 /* add the newly discovered switch to the BFS queue */ 1550 cl_list_insert_tail(&bfs_list, 1551 &__osm_ftree_sw_tbl_element_create 1552 (p_remote_sw)->map_item); 1553 } 1554 /* Done assigning indexes to all the remote switches 1555 that are pointed by the upgoing ports. 1556 Now sort port groups according to remote index. */ 1557 qsort(p_sw->up_port_groups, /* array */ 1558 p_sw->up_port_groups_num, /* number of elements */ 1559 sizeof(ftree_port_group_t *), /* size of each element */ 1560 __osm_ftree_compare_port_groups_by_remote_switch_index); /* comparator */ 1561 } 1562 /* Done assigning indexes to all the switches that are directly connected 1563 to the current switch - go to the next switch in the BFS queue */ 1564 } 1565 cl_list_destroy(&bfs_list); 1566 1567 OSM_LOG_EXIT(&p_ftree->p_osm->log); 1568} /* __osm_ftree_fabric_make_indexing() */ 1569 1570/***************************************************/ 1571 1572static int __osm_ftree_fabric_create_leaf_switch_array(IN ftree_fabric_t * 1573 p_ftree) 1574{ 1575 ftree_sw_t *p_sw; 1576 ftree_sw_t *p_next_sw; 1577 ftree_sw_t **all_switches_at_leaf_level; 1578 unsigned i; 1579 unsigned all_leaf_idx = 0; 1580 unsigned first_leaf_idx; 1581 unsigned last_leaf_idx; 1582 int res = 0; 1583 1584 OSM_LOG_ENTER(&p_ftree->p_osm->log); 1585 1586 /* create array of ALL the switches that have leaf rank */ 1587 all_switches_at_leaf_level = (ftree_sw_t **) 1588 malloc(cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *)); 1589 if (!all_switches_at_leaf_level) { 1590 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 1591 "Fat-tree routing: Memory allocation failed\n"); 1592 res = -1; 1593 goto Exit; 1594 } 1595 memset(all_switches_at_leaf_level, 0, 1596 cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *)); 1597 1598 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 1599 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 1600 p_sw = p_next_sw; 1601 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 1602 if (p_sw->rank == p_ftree->leaf_switch_rank) { 1603 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 1604 "Adding switch 0x%" PRIx64 1605 " to full leaf switch array\n", 1606 __osm_ftree_sw_get_guid_ho(p_sw)); 1607 all_switches_at_leaf_level[all_leaf_idx++] = p_sw; 1608 1609 } 1610 } 1611 1612 /* quick-sort array of leaf switches by index */ 1613 qsort(all_switches_at_leaf_level, /* array */ 1614 all_leaf_idx, /* number of elements */ 1615 sizeof(ftree_sw_t *), /* size of each element */ 1616 __osm_ftree_compare_switches_by_index); /* comparator */ 1617 1618 /* check the first and the last REAL leaf (the one 1619 that has CNs) in the array of all the leafs */ 1620 1621 first_leaf_idx = all_leaf_idx; 1622 last_leaf_idx = 0; 1623 for (i = 0; i < all_leaf_idx; i++) { 1624 if (all_switches_at_leaf_level[i]->is_leaf) { 1625 if (i < first_leaf_idx) 1626 first_leaf_idx = i; 1627 last_leaf_idx = i; 1628 } 1629 } 1630 CL_ASSERT(first_leaf_idx < last_leaf_idx); 1631 1632 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 1633 "Full leaf array info: first_leaf_idx = %u, last_leaf_idx = %u\n", 1634 first_leaf_idx, last_leaf_idx); 1635 1636 /* Create array of REAL leaf switches, sorted by index. 1637 This array may contain switches at the same rank w/o CNs, 1638 in case this is the order of indexing. */ 1639 p_ftree->leaf_switches_num = last_leaf_idx - first_leaf_idx + 1; 1640 p_ftree->leaf_switches = (ftree_sw_t **) 1641 malloc(p_ftree->leaf_switches_num * sizeof(ftree_sw_t *)); 1642 if (!p_ftree->leaf_switches) { 1643 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 1644 "Fat-tree routing: Memory allocation failed\n"); 1645 res = -1; 1646 goto Exit; 1647 } 1648 1649 memcpy(p_ftree->leaf_switches, 1650 &(all_switches_at_leaf_level[first_leaf_idx]), 1651 p_ftree->leaf_switches_num * sizeof(ftree_sw_t *)); 1652 1653 free(all_switches_at_leaf_level); 1654 1655 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 1656 "Created array of %u leaf switches\n", 1657 p_ftree->leaf_switches_num); 1658 1659Exit: 1660 OSM_LOG_EXIT(&p_ftree->p_osm->log); 1661 return res; 1662} /* __osm_ftree_fabric_create_leaf_switch_array() */ 1663 1664/***************************************************/ 1665 1666static void __osm_ftree_fabric_set_max_cn_per_leaf(IN ftree_fabric_t * p_ftree) 1667{ 1668 unsigned i; 1669 unsigned j; 1670 unsigned cns_on_this_leaf; 1671 ftree_sw_t *p_sw; 1672 ftree_port_group_t *p_group; 1673 1674 for (i = 0; i < p_ftree->leaf_switches_num; i++) { 1675 p_sw = p_ftree->leaf_switches[i]; 1676 cns_on_this_leaf = 0; 1677 for (j = 0; j < p_sw->down_port_groups_num; j++) { 1678 p_group = p_sw->down_port_groups[j]; 1679 if (p_group->remote_node_type != IB_NODE_TYPE_CA) 1680 continue; 1681 cns_on_this_leaf += 1682 p_group->remote_hca_or_sw.p_hca->cn_num; 1683 } 1684 if (cns_on_this_leaf > p_ftree->max_cn_per_leaf) 1685 p_ftree->max_cn_per_leaf = cns_on_this_leaf; 1686 } 1687} /* __osm_ftree_fabric_set_max_cn_per_leaf() */ 1688 1689/***************************************************/ 1690 1691static boolean_t __osm_ftree_fabric_validate_topology(IN ftree_fabric_t * 1692 p_ftree) 1693{ 1694 ftree_port_group_t *p_group; 1695 ftree_port_group_t *p_ref_group; 1696 ftree_sw_t *p_sw; 1697 ftree_sw_t *p_next_sw; 1698 ftree_sw_t **reference_sw_arr; 1699 uint16_t tree_rank = __osm_ftree_fabric_get_rank(p_ftree); 1700 boolean_t res = TRUE; 1701 uint8_t i; 1702 1703 OSM_LOG_ENTER(&p_ftree->p_osm->log); 1704 1705 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1706 "Validating fabric topology\n"); 1707 1708 reference_sw_arr = 1709 (ftree_sw_t **) malloc(tree_rank * sizeof(ftree_sw_t *)); 1710 if (reference_sw_arr == NULL) { 1711 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 1712 "Fat-tree routing: Memory allocation failed\n"); 1713 return FALSE; 1714 } 1715 memset(reference_sw_arr, 0, tree_rank * sizeof(ftree_sw_t *)); 1716 1717 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 1718 while (res && p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 1719 p_sw = p_next_sw; 1720 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 1721 1722 if (!reference_sw_arr[p_sw->rank]) { 1723 /* This is the first switch in the current level that 1724 we're checking - use it as a reference */ 1725 reference_sw_arr[p_sw->rank] = p_sw; 1726 } else { 1727 /* compare this switch properties to the reference switch */ 1728 1729 if (reference_sw_arr[p_sw->rank]->up_port_groups_num != 1730 p_sw->up_port_groups_num) { 1731 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 1732 "ERR AB09: Different number of upward port groups on switches:\n" 1733 " GUID 0x%016" PRIx64 1734 ", LID %u, Index %s - %u groups\n" 1735 " GUID 0x%016" PRIx64 1736 ", LID %u, Index %s - %u groups\n", 1737 __osm_ftree_sw_get_guid_ho 1738 (reference_sw_arr[p_sw->rank]), 1739 cl_ntoh16(reference_sw_arr[p_sw->rank]-> 1740 base_lid), 1741 __osm_ftree_tuple_to_str 1742 (reference_sw_arr[p_sw->rank]->tuple), 1743 reference_sw_arr[p_sw->rank]-> 1744 up_port_groups_num, 1745 __osm_ftree_sw_get_guid_ho(p_sw), 1746 cl_ntoh16(p_sw->base_lid), 1747 __osm_ftree_tuple_to_str(p_sw->tuple), 1748 p_sw->up_port_groups_num); 1749 res = FALSE; 1750 break; 1751 } 1752 1753 if (p_sw->rank != (tree_rank - 1) && 1754 reference_sw_arr[p_sw->rank]-> 1755 down_port_groups_num != 1756 p_sw->down_port_groups_num) { 1757 /* we're allowing some hca's to be missing */ 1758 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 1759 "ERR AB0A: Different number of downward port groups on switches:\n" 1760 " GUID 0x%016" PRIx64 1761 ", LID %u, Index %s - %u port groups\n" 1762 " GUID 0x%016" PRIx64 1763 ", LID %u, Index %s - %u port groups\n", 1764 __osm_ftree_sw_get_guid_ho 1765 (reference_sw_arr[p_sw->rank]), 1766 cl_ntoh16(reference_sw_arr[p_sw->rank]-> 1767 base_lid), 1768 __osm_ftree_tuple_to_str 1769 (reference_sw_arr[p_sw->rank]->tuple), 1770 reference_sw_arr[p_sw->rank]-> 1771 down_port_groups_num, 1772 __osm_ftree_sw_get_guid_ho(p_sw), 1773 cl_ntoh16(p_sw->base_lid), 1774 __osm_ftree_tuple_to_str(p_sw->tuple), 1775 p_sw->down_port_groups_num); 1776 res = FALSE; 1777 break; 1778 } 1779 1780 if (reference_sw_arr[p_sw->rank]->up_port_groups_num != 1781 0) { 1782 p_ref_group = 1783 reference_sw_arr[p_sw->rank]-> 1784 up_port_groups[0]; 1785 for (i = 0; i < p_sw->up_port_groups_num; i++) { 1786 p_group = p_sw->up_port_groups[i]; 1787 if (cl_ptr_vector_get_size 1788 (&p_ref_group->ports) != 1789 cl_ptr_vector_get_size(&p_group-> 1790 ports)) { 1791 OSM_LOG(&p_ftree->p_osm->log, 1792 OSM_LOG_ERROR, 1793 "ERR AB0B: Different number of ports in an upward port group on switches:\n" 1794 " GUID 0x%016" 1795 PRIx64 1796 ", LID %u, Index %s - %u ports\n" 1797 " GUID 0x%016" 1798 PRIx64 1799 ", LID %u, Index %s - %u ports\n", 1800 __osm_ftree_sw_get_guid_ho 1801 (reference_sw_arr 1802 [p_sw->rank]), 1803 cl_ntoh16 1804 (reference_sw_arr 1805 [p_sw->rank]-> 1806 base_lid), 1807 __osm_ftree_tuple_to_str 1808 (reference_sw_arr 1809 [p_sw->rank]->tuple), 1810 cl_ptr_vector_get_size 1811 (&p_ref_group->ports), 1812 __osm_ftree_sw_get_guid_ho 1813 (p_sw), 1814 cl_ntoh16(p_sw-> 1815 base_lid), 1816 __osm_ftree_tuple_to_str 1817 (p_sw->tuple), 1818 cl_ptr_vector_get_size 1819 (&p_group->ports)); 1820 res = FALSE; 1821 break; 1822 } 1823 } 1824 } 1825 if (reference_sw_arr[p_sw->rank]-> 1826 down_port_groups_num != 0 1827 && p_sw->rank != (tree_rank - 1)) { 1828 /* we're allowing some hca's to be missing */ 1829 p_ref_group = 1830 reference_sw_arr[p_sw->rank]-> 1831 down_port_groups[0]; 1832 for (i = 0; i < p_sw->down_port_groups_num; i++) { 1833 p_group = p_sw->down_port_groups[0]; 1834 if (cl_ptr_vector_get_size 1835 (&p_ref_group->ports) != 1836 cl_ptr_vector_get_size(&p_group-> 1837 ports)) { 1838 OSM_LOG(&p_ftree->p_osm->log, 1839 OSM_LOG_ERROR, 1840 "ERR AB0C: Different number of ports in an downward port group on switches:\n" 1841 " GUID 0x%016" 1842 PRIx64 1843 ", LID %u, Index %s - %u ports\n" 1844 " GUID 0x%016" 1845 PRIx64 1846 ", LID %u, Index %s - %u ports\n", 1847 __osm_ftree_sw_get_guid_ho 1848 (reference_sw_arr 1849 [p_sw->rank]), 1850 cl_ntoh16 1851 (reference_sw_arr 1852 [p_sw->rank]-> 1853 base_lid), 1854 __osm_ftree_tuple_to_str 1855 (reference_sw_arr 1856 [p_sw->rank]->tuple), 1857 cl_ptr_vector_get_size 1858 (&p_ref_group->ports), 1859 __osm_ftree_sw_get_guid_ho 1860 (p_sw), 1861 cl_ntoh16(p_sw-> 1862 base_lid), 1863 __osm_ftree_tuple_to_str 1864 (p_sw->tuple), 1865 cl_ptr_vector_get_size 1866 (&p_group->ports)); 1867 res = FALSE; 1868 break; 1869 } 1870 } 1871 } 1872 } /* end of else */ 1873 } /* end of while */ 1874 1875 if (res == TRUE) 1876 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1877 "Fabric topology has been identified as FatTree\n"); 1878 else 1879 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 1880 "ERR AB0D: Fabric topology hasn't been identified as FatTree\n"); 1881 1882 free(reference_sw_arr); 1883 OSM_LOG_EXIT(&p_ftree->p_osm->log); 1884 return res; 1885} /* __osm_ftree_fabric_validate_topology() */ 1886 1887/*************************************************** 1888 ***************************************************/ 1889 1890static void __osm_ftree_set_sw_fwd_table(IN cl_map_item_t * const p_map_item, 1891 IN void *context) 1892{ 1893 ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item; 1894 ftree_fabric_t *p_ftree = (ftree_fabric_t *) context; 1895 1896 p_sw->p_osm_sw->max_lid_ho = p_ftree->lft_max_lid_ho; 1897 osm_ucast_mgr_set_fwd_table(&p_ftree->p_osm->sm.ucast_mgr, 1898 p_sw->p_osm_sw); 1899} 1900 1901/*************************************************** 1902 ***************************************************/ 1903 1904/* 1905 * Function: assign-up-going-port-by-descending-down 1906 * Given : a switch and a LID 1907 * Pseudo code: 1908 * foreach down-going-port-group (in indexing order) 1909 * skip this group if the LFT(LID) port is part of this group 1910 * find the least loaded port of the group (scan in indexing order) 1911 * r-port is the remote port connected to it 1912 * assign the remote switch node LFT(LID) to r-port 1913 * increase r-port usage counter 1914 * assign-up-going-port-by-descending-down to r-port node (recursion) 1915 */ 1916 1917static void 1918__osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree, 1919 IN ftree_sw_t * p_sw, 1920 IN ftree_sw_t * p_prev_sw, 1921 IN ib_net16_t target_lid, 1922 IN uint8_t target_rank, 1923 IN boolean_t is_real_lid, 1924 IN boolean_t is_main_path, 1925 IN uint8_t highest_rank_in_route) 1926{ 1927 ftree_sw_t *p_remote_sw; 1928 uint16_t ports_num; 1929 ftree_port_group_t *p_group; 1930 ftree_port_t *p_port; 1931 ftree_port_t *p_min_port; 1932 uint16_t i; 1933 uint16_t j; 1934 uint16_t k; 1935 1936 /* we shouldn't enter here if both real_lid and main_path are false */ 1937 CL_ASSERT(is_real_lid || is_main_path); 1938 1939 /* if there is no down-going ports */ 1940 if (p_sw->down_port_groups_num == 0) 1941 return; 1942 1943 /* promote the index that indicates which group should we 1944 start with when going through all the downgoing groups */ 1945 p_sw->down_port_groups_idx = 1946 (p_sw->down_port_groups_idx + 1) % p_sw->down_port_groups_num; 1947 1948 /* foreach down-going port group (in indexing order) */ 1949 i = p_sw->down_port_groups_idx; 1950 for (k = 0; k < p_sw->down_port_groups_num; k++) { 1951 1952 p_group = p_sw->down_port_groups[i]; 1953 i = (i + 1) % p_sw->down_port_groups_num; 1954 1955 /* Skip this port group unless it points to a switch */ 1956 if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH) 1957 continue; 1958 1959 if (p_prev_sw 1960 && (p_group->remote_base_lid == p_prev_sw->base_lid)) { 1961 /* This port group has a port that was used when we entered this switch, 1962 which means that the current group points to the switch where we were 1963 at the previous step of the algorithm (before going up). 1964 Skipping this group. */ 1965 continue; 1966 } 1967 1968 /* find the least loaded port of the group (in indexing order) */ 1969 p_min_port = NULL; 1970 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports); 1971 /* ToDo: no need to select a least loaded port for non-main path. 1972 Think about optimization. */ 1973 for (j = 0; j < ports_num; j++) { 1974 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port); 1975 if (!p_min_port) { 1976 /* first port that we're checking - set as port with the lowest load */ 1977 p_min_port = p_port; 1978 } else if (p_port->counter_up < p_min_port->counter_up) { 1979 /* this port is less loaded - use it as min */ 1980 p_min_port = p_port; 1981 } 1982 } 1983 /* At this point we have selected a port in this group with the 1984 lowest load of upgoing routes. 1985 Set on the remote switch how to get to the target_lid - 1986 set LFT(target_lid) on the remote switch to the remote port */ 1987 p_remote_sw = p_group->remote_hca_or_sw.p_sw; 1988 1989 if (osm_switch_get_least_hops(p_remote_sw->p_osm_sw, 1990 cl_ntoh16(target_lid)) != 1991 OSM_NO_PATH) { 1992 /* Loop in the fabric - we already routed the remote switch 1993 on our way UP, and now we see it again on our way DOWN */ 1994 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 1995 "Loop of lenght %d in the fabric:\n " 1996 "Switch %s (LID %u) closes loop through switch %s (LID %u)\n", 1997 (p_remote_sw->rank - highest_rank_in_route) * 2, 1998 __osm_ftree_tuple_to_str(p_remote_sw->tuple), 1999 cl_ntoh16(p_group->base_lid), 2000 __osm_ftree_tuple_to_str(p_sw->tuple), 2001 cl_ntoh16(p_group->remote_base_lid)); 2002 continue; 2003 } 2004 2005 /* Four possible cases: 2006 * 2007 * 1. is_real_lid == TRUE && is_main_path == TRUE: 2008 * - going DOWN(TRUE,TRUE) through ALL the groups 2009 * + promoting port counter 2010 * + setting path in remote switch fwd tbl 2011 * + setting hops in remote switch on all the ports of each group 2012 * 2013 * 2. is_real_lid == TRUE && is_main_path == FALSE: 2014 * - going DOWN(TRUE,FALSE) through ALL the groups but only if 2015 * the remote (lower) switch hasn't been already configured 2016 * for this target LID 2017 * + NOT promoting port counter 2018 * + setting path in remote switch fwd tbl if it hasn't been set yet 2019 * + setting hops in remote switch on all the ports of each group 2020 * if it hasn't been set yet 2021 * 2022 * 3. is_real_lid == FALSE && is_main_path == TRUE: 2023 * - going DOWN(FALSE,TRUE) through ALL the groups 2024 * + promoting port counter 2025 * + NOT setting path in remote switch fwd tbl 2026 * + NOT setting hops in remote switch 2027 * 2028 * 4. is_real_lid == FALSE && is_main_path == FALSE: 2029 * - illegal state - we shouldn't get here 2030 */ 2031 2032 /* second case: skip the port group if the remote (lower) 2033 switch has been already configured for this target LID */ 2034 if (is_real_lid && !is_main_path && 2035 p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] != OSM_NO_PATH) 2036 continue; 2037 2038 /* setting fwd tbl port only if this is real LID */ 2039 if (is_real_lid) { 2040 p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] = 2041 p_min_port->remote_port_num; 2042 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2043 "Switch %s: set path to CA LID %u through port %u\n", 2044 __osm_ftree_tuple_to_str(p_remote_sw->tuple), 2045 cl_ntoh16(target_lid), 2046 p_min_port->remote_port_num); 2047 2048 /* On the remote switch that is pointed by the p_group, 2049 set hops for ALL the ports in the remote group. */ 2050 2051 for (j = 0; j < ports_num; j++) { 2052 cl_ptr_vector_at(&p_group->ports, j, 2053 (void *)&p_port); 2054 2055 __osm_ftree_sw_set_hops(p_remote_sw, 2056 cl_ntoh16(target_lid), 2057 p_port->remote_port_num, 2058 ((target_rank - 2059 highest_rank_in_route) 2060 + (p_remote_sw->rank - 2061 highest_rank_in_route))); 2062 } 2063 2064 } 2065 2066 /* The number of upgoing routes is tracked in the 2067 p_port->counter_up counter of the port that belongs to 2068 the upper side of the link (on switch with lower rank). 2069 Counter is promoted only if we're routing LID on the main 2070 path (whether it's a real LID or a dummy one). */ 2071 if (is_main_path) 2072 p_min_port->counter_up++; 2073 2074 /* Recursion step: 2075 Assign upgoing ports by stepping down, starting on REMOTE switch */ 2076 __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_remote_sw, /* remote switch - used as a route-upgoing alg. start point */ 2077 NULL, /* prev. position - NULL to mark that we went down and not up */ 2078 target_lid, /* LID that we're routing to */ 2079 target_rank, /* rank of the LID that we're routing to */ 2080 is_real_lid, /* whether the target LID is real or dummy */ 2081 is_main_path, /* whether this is path to HCA that should by tracked by counters */ 2082 highest_rank_in_route); /* highest visited point in the tree before going down */ 2083 } 2084 /* done scanning all the down-going port groups */ 2085 2086} /* __osm_ftree_fabric_route_upgoing_by_going_down() */ 2087 2088/***************************************************/ 2089 2090/* 2091 * Function: assign-down-going-port-by-ascending-up 2092 * Given : a switch and a LID 2093 * Pseudo code: 2094 * find the least loaded port of all the upgoing groups (scan in indexing order) 2095 * assign the LFT(LID) of remote switch to that port 2096 * track that port usage 2097 * assign-up-going-port-by-descending-down on CURRENT switch 2098 * assign-down-going-port-by-ascending-up on REMOTE switch (recursion) 2099 */ 2100 2101static void 2102__osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree, 2103 IN ftree_sw_t * p_sw, 2104 IN ftree_sw_t * p_prev_sw, 2105 IN ib_net16_t target_lid, 2106 IN uint8_t target_rank, 2107 IN boolean_t is_real_lid, 2108 IN boolean_t is_main_path) 2109{ 2110 ftree_sw_t *p_remote_sw; 2111 uint16_t ports_num; 2112 ftree_port_group_t *p_group; 2113 ftree_port_t *p_port; 2114 ftree_port_group_t *p_min_group; 2115 ftree_port_t *p_min_port; 2116 uint16_t i; 2117 uint16_t j; 2118 2119 /* we shouldn't enter here if both real_lid and main_path are false */ 2120 CL_ASSERT(is_real_lid || is_main_path); 2121 2122 /* Assign upgoing ports by stepping down, starting on THIS switch */ 2123 __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_sw, /* local switch - used as a route-upgoing alg. start point */ 2124 p_prev_sw, /* switch that we went up from (NULL means that we went down) */ 2125 target_lid, /* LID that we're routing to */ 2126 target_rank, /* rank of the LID that we're routing to */ 2127 is_real_lid, /* whether this target LID is real or dummy */ 2128 is_main_path, /* whether this path to HCA should by tracked by counters */ 2129 p_sw->rank); /* the highest visited point in the tree before going down */ 2130 2131 /* recursion stop condition - if it's a root switch, */ 2132 if (p_sw->rank == 0) 2133 return; 2134 2135 /* Find the least loaded upgoing port group */ 2136 p_min_group = NULL; 2137 for (i = 0; i < p_sw->up_port_groups_num; i++) { 2138 p_group = p_sw->up_port_groups[i]; 2139 if (!p_min_group) { 2140 /* first group that we're checking - use 2141 it as a group with the lowest load */ 2142 p_min_group = p_group; 2143 } else if (p_group->counter_down < p_min_group->counter_down) { 2144 /* this group is less loaded - use it as min */ 2145 p_min_group = p_group; 2146 } 2147 } 2148 2149 /* Find the least loaded upgoing port in the selected group */ 2150 p_min_port = NULL; 2151 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_min_group->ports); 2152 for (j = 0; j < ports_num; j++) { 2153 cl_ptr_vector_at(&p_min_group->ports, j, (void *)&p_port); 2154 if (!p_min_port) { 2155 /* first port that we're checking - use 2156 it as a port with the lowest load */ 2157 p_min_port = p_port; 2158 } else if (p_port->counter_down < p_min_port->counter_down) { 2159 /* this port is less loaded - use it as min */ 2160 p_min_port = p_port; 2161 } 2162 } 2163 2164 /* At this point we have selected a group and port with the 2165 lowest load of downgoing routes. 2166 Set on the remote switch how to get to the target_lid - 2167 set LFT(target_lid) on the remote switch to the remote port */ 2168 p_remote_sw = p_min_group->remote_hca_or_sw.p_sw; 2169 2170 /* Four possible cases: 2171 * 2172 * 1. is_real_lid == TRUE && is_main_path == TRUE: 2173 * - going UP(TRUE,TRUE) on selected min_group and min_port 2174 * + promoting port counter 2175 * + setting path in remote switch fwd tbl 2176 * + setting hops in remote switch on all the ports of selected group 2177 * - going UP(TRUE,FALSE) on rest of the groups, each time on port 0 2178 * + NOT promoting port counter 2179 * + setting path in remote switch fwd tbl if it hasn't been set yet 2180 * + setting hops in remote switch on all the ports of each group 2181 * if it hasn't been set yet 2182 * 2183 * 2. is_real_lid == TRUE && is_main_path == FALSE: 2184 * - going UP(TRUE,FALSE) on ALL the groups, each time on port 0, 2185 * but only if the remote (upper) switch hasn't been already 2186 * configured for this target LID 2187 * + NOT promoting port counter 2188 * + setting path in remote switch fwd tbl if it hasn't been set yet 2189 * + setting hops in remote switch on all the ports of each group 2190 * if it hasn't been set yet 2191 * 2192 * 3. is_real_lid == FALSE && is_main_path == TRUE: 2193 * - going UP(FALSE,TRUE) ONLY on selected min_group and min_port 2194 * + promoting port counter 2195 * + NOT setting path in remote switch fwd tbl 2196 * + NOT setting hops in remote switch 2197 * 2198 * 4. is_real_lid == FALSE && is_main_path == FALSE: 2199 * - illegal state - we shouldn't get here 2200 */ 2201 2202 /* covering first half of case 1, and case 3 */ 2203 if (is_main_path) { 2204 if (p_sw->is_leaf) { 2205 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2206 " - Routing MAIN path for %s CA LID %u: %s --> %s\n", 2207 (is_real_lid) ? "real" : "DUMMY", 2208 cl_ntoh16(target_lid), 2209 __osm_ftree_tuple_to_str(p_sw->tuple), 2210 __osm_ftree_tuple_to_str(p_remote_sw->tuple)); 2211 } 2212 /* The number of downgoing routes is tracked in the 2213 p_group->counter_down p_port->counter_down counters of the 2214 group and port that belong to the lower side of the link 2215 (on switch with higher rank) */ 2216 p_min_group->counter_down++; 2217 p_min_port->counter_down++; 2218 if (is_real_lid) { 2219 p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] = 2220 p_min_port->remote_port_num; 2221 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2222 "Switch %s: set path to CA LID %u through port %u\n", 2223 __osm_ftree_tuple_to_str(p_remote_sw->tuple), 2224 cl_ntoh16(target_lid), 2225 p_min_port->remote_port_num); 2226 2227 /* On the remote switch that is pointed by the min_group, 2228 set hops for ALL the ports in the remote group. */ 2229 2230 ports_num = 2231 (uint16_t) cl_ptr_vector_get_size(&p_min_group-> 2232 ports); 2233 for (j = 0; j < ports_num; j++) { 2234 cl_ptr_vector_at(&p_min_group->ports, j, 2235 (void *)&p_port); 2236 __osm_ftree_sw_set_hops(p_remote_sw, 2237 cl_ntoh16(target_lid), 2238 p_port->remote_port_num, 2239 target_rank - 2240 p_remote_sw->rank); 2241 } 2242 } 2243 2244 /* Recursion step: 2245 Assign downgoing ports by stepping up, starting on REMOTE switch. */ 2246 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */ 2247 p_sw, /* this switch - prev. position switch for the function */ 2248 target_lid, /* LID that we're routing to */ 2249 target_rank, /* rank of the LID that we're routing to */ 2250 is_real_lid, /* whether this target LID is real or dummy */ 2251 is_main_path); /* whether this is path to HCA that should by tracked by counters */ 2252 } 2253 2254 /* we're done for the third case */ 2255 if (!is_real_lid) 2256 return; 2257 2258 /* What's left to do at this point: 2259 * 2260 * 1. is_real_lid == TRUE && is_main_path == TRUE: 2261 * - going UP(TRUE,FALSE) on rest of the groups, each time on port 0, 2262 * but only if the remote (upper) switch hasn't been already 2263 * configured for this target LID 2264 * + NOT promoting port counter 2265 * + setting path in remote switch fwd tbl if it hasn't been set yet 2266 * + setting hops in remote switch on all the ports of each group 2267 * if it hasn't been set yet 2268 * 2269 * 2. is_real_lid == TRUE && is_main_path == FALSE: 2270 * - going UP(TRUE,FALSE) on ALL the groups, each time on port 0, 2271 * but only if the remote (upper) switch hasn't been already 2272 * configured for this target LID 2273 * + NOT promoting port counter 2274 * + setting path in remote switch fwd tbl if it hasn't been set yet 2275 * + setting hops in remote switch on all the ports of each group 2276 * if it hasn't been set yet 2277 * 2278 * These two rules can be rephrased this way: 2279 * - foreach UP port group 2280 * + if remote switch has been set with the target LID 2281 * - skip this port group 2282 * + else 2283 * - select port 0 2284 * - do NOT promote port counter 2285 * - set path in remote switch fwd tbl 2286 * - set hops in remote switch on all the ports of this group 2287 * - go UP(TRUE,FALSE) to the remote switch 2288 */ 2289 2290 for (i = 0; i < p_sw->up_port_groups_num; i++) { 2291 p_group = p_sw->up_port_groups[i]; 2292 p_remote_sw = p_group->remote_hca_or_sw.p_sw; 2293 2294 /* skip if target lid has been already set on remote switch fwd tbl */ 2295 if (p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] != OSM_NO_PATH) 2296 continue; 2297 2298 if (p_sw->is_leaf) { 2299 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2300 " - Routing SECONDARY path for LID %u: %s --> %s\n", 2301 cl_ntoh16(target_lid), 2302 __osm_ftree_tuple_to_str(p_sw->tuple), 2303 __osm_ftree_tuple_to_str(p_remote_sw->tuple)); 2304 } 2305 2306 /* Routing REAL lids on SECONDARY path means routing 2307 switch-to-switch or switch-to-CA paths. 2308 We can safely assume that switch will initiate very 2309 few traffic, so there's no point waisting runtime on 2310 trying to balance these routes - always pick port 0. */ 2311 2312 cl_ptr_vector_at(&p_group->ports, 0, (void *)&p_port); 2313 p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] = 2314 p_port->remote_port_num; 2315 2316 /* On the remote switch that is pointed by the p_group, 2317 set hops for ALL the ports in the remote group. */ 2318 2319 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports); 2320 for (j = 0; j < ports_num; j++) { 2321 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port); 2322 2323 __osm_ftree_sw_set_hops(p_remote_sw, 2324 cl_ntoh16(target_lid), 2325 p_port->remote_port_num, 2326 target_rank - 2327 p_remote_sw->rank); 2328 } 2329 2330 /* Recursion step: 2331 Assign downgoing ports by stepping up, starting on REMOTE switch. */ 2332 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */ 2333 p_sw, /* this switch - prev. position switch for the function */ 2334 target_lid, /* LID that we're routing to */ 2335 target_rank, /* rank of the LID that we're routing to */ 2336 TRUE, /* whether the target LID is real or dummy */ 2337 FALSE); /* whether this is path to HCA that should by tracked by counters */ 2338 } 2339 2340} /* ftree_fabric_route_downgoing_by_going_up() */ 2341 2342/***************************************************/ 2343 2344/* 2345 * Pseudo code: 2346 * foreach leaf switch (in indexing order) 2347 * for each compute node (in indexing order) 2348 * obtain the LID of the compute node 2349 * set local LFT(LID) of the port connecting to compute node 2350 * call assign-down-going-port-by-ascending-up(TRUE,TRUE) on CURRENT switch 2351 * for each MISSING compute node 2352 * call assign-down-going-port-by-ascending-up(FALSE,TRUE) on CURRENT switch 2353 */ 2354 2355static void __osm_ftree_fabric_route_to_cns(IN ftree_fabric_t * p_ftree) 2356{ 2357 ftree_sw_t *p_sw; 2358 ftree_hca_t *p_hca; 2359 ftree_port_group_t *p_leaf_port_group; 2360 ftree_port_group_t *p_hca_port_group; 2361 ftree_port_t *p_port; 2362 uint32_t i; 2363 uint32_t j; 2364 ib_net16_t hca_lid; 2365 unsigned routed_targets_on_leaf; 2366 2367 OSM_LOG_ENTER(&p_ftree->p_osm->log); 2368 2369 /* for each leaf switch (in indexing order) */ 2370 for (i = 0; i < p_ftree->leaf_switches_num; i++) { 2371 p_sw = p_ftree->leaf_switches[i]; 2372 routed_targets_on_leaf = 0; 2373 2374 /* for each HCA connected to this switch */ 2375 for (j = 0; j < p_sw->down_port_groups_num; j++) { 2376 p_leaf_port_group = p_sw->down_port_groups[j]; 2377 2378 /* work with this port group only if the remote node is CA */ 2379 if (p_leaf_port_group->remote_node_type != 2380 IB_NODE_TYPE_CA) 2381 continue; 2382 2383 p_hca = p_leaf_port_group->remote_hca_or_sw.p_hca; 2384 2385 /* work with this port group only if remote HCA has CNs */ 2386 if (!p_hca->cn_num) 2387 continue; 2388 2389 p_hca_port_group = 2390 __osm_ftree_hca_get_port_group_by_remote_lid(p_hca, 2391 p_leaf_port_group-> 2392 base_lid); 2393 CL_ASSERT(p_hca_port_group); 2394 2395 /* work with this port group only if remote port is CN */ 2396 if (!p_hca_port_group->is_cn) 2397 continue; 2398 2399 /* obtain the LID of HCA port */ 2400 hca_lid = p_leaf_port_group->remote_base_lid; 2401 2402 /* set local LFT(LID) to the port that is connected to HCA */ 2403 cl_ptr_vector_at(&p_leaf_port_group->ports, 0, 2404 (void *)&p_port); 2405 p_sw->p_osm_sw->new_lft[cl_ntoh16(hca_lid)] = p_port->port_num; 2406 2407 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2408 "Switch %s: set path to CN LID %u through port %u\n", 2409 __osm_ftree_tuple_to_str(p_sw->tuple), 2410 cl_ntoh16(hca_lid), p_port->port_num); 2411 2412 /* set local min hop table(LID) to route to the CA */ 2413 __osm_ftree_sw_set_hops(p_sw, 2414 cl_ntoh16(hca_lid), 2415 p_port->port_num, 1); 2416 2417 /* Assign downgoing ports by stepping up. 2418 Since we're routing here only CNs, we're routing it as REAL 2419 LID and updating fat-tree balancing counters. */ 2420 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */ 2421 NULL, /* prev. position switch */ 2422 hca_lid, /* LID that we're routing to */ 2423 p_sw->rank + 1, /* rank of the LID that we're routing to */ 2424 TRUE, /* whether this HCA LID is real or dummy */ 2425 TRUE); /* whether this path to HCA should by tracked by counters */ 2426 2427 /* count how many real targets have been routed from this leaf switch */ 2428 routed_targets_on_leaf++; 2429 } 2430 2431 /* We're done with the real targets (all CNs) of this leaf switch. 2432 Now route the dummy HCAs that are missing or that are non-CNs. 2433 When routing to dummy HCAs we don't fill lid matrices. */ 2434 2435 if (p_ftree->max_cn_per_leaf > routed_targets_on_leaf) { 2436 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2437 "Routing %u dummy CAs\n", 2438 p_ftree->max_cn_per_leaf - 2439 p_sw->down_port_groups_num); 2440 for (j = 0; 2441 ((int)j) < 2442 (p_ftree->max_cn_per_leaf - 2443 routed_targets_on_leaf); j++) { 2444 /* assign downgoing ports by stepping up */ 2445 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */ 2446 NULL, /* prev. position switch */ 2447 0, /* LID that we're routing to - ignored for dummy HCA */ 2448 0, /* rank of the LID that we're routing to - ignored for dummy HCA */ 2449 FALSE, /* whether this HCA LID is real or dummy */ 2450 TRUE); /* whether this path to HCA should by tracked by counters */ 2451 } 2452 } 2453 } 2454 /* done going through all the leaf switches */ 2455 OSM_LOG_EXIT(&p_ftree->p_osm->log); 2456} /* __osm_ftree_fabric_route_to_cns() */ 2457 2458/***************************************************/ 2459 2460/* 2461 * Pseudo code: 2462 * foreach HCA non-CN port in fabric 2463 * obtain the LID of the HCA port 2464 * get switch that is connected to this HCA port 2465 * set switch LFT(LID) to the port connecting to compute node 2466 * call assign-down-going-port-by-ascending-up(TRUE,FALSE) on CURRENT switch 2467 * 2468 * Routing to these HCAs is routing a REAL hca lid on SECONDARY path. 2469 * However, we do want to allow load-leveling of the traffic to the non-CNs, 2470 * because such nodes may include IO nodes with heavy usage 2471 * - we should set fwd tables 2472 * - we should update port counters 2473 * Routing to non-CNs is done after routing to CNs, so updated port 2474 * counters will not affect CN-to-CN routing. 2475 */ 2476 2477static void __osm_ftree_fabric_route_to_non_cns(IN ftree_fabric_t * p_ftree) 2478{ 2479 ftree_sw_t *p_sw; 2480 ftree_hca_t *p_hca; 2481 ftree_hca_t *p_next_hca; 2482 ftree_port_t *p_hca_port; 2483 ftree_port_group_t *p_hca_port_group; 2484 ib_net16_t hca_lid; 2485 unsigned port_num_on_switch; 2486 unsigned i; 2487 2488 OSM_LOG_ENTER(&p_ftree->p_osm->log); 2489 2490 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 2491 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) { 2492 p_hca = p_next_hca; 2493 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item); 2494 2495 for (i = 0; i < p_hca->up_port_groups_num; i++) { 2496 p_hca_port_group = p_hca->up_port_groups[i]; 2497 2498 /* skip this port if it's CN, in which case it has been already routed */ 2499 if (p_hca_port_group->is_cn) 2500 continue; 2501 2502 /* skip this port if it is not connected to switch */ 2503 if (p_hca_port_group->remote_node_type != 2504 IB_NODE_TYPE_SWITCH) 2505 continue; 2506 2507 p_sw = p_hca_port_group->remote_hca_or_sw.p_sw; 2508 hca_lid = p_hca_port_group->base_lid; 2509 2510 /* set switches LFT(LID) to the port that is connected to HCA */ 2511 cl_ptr_vector_at(&p_hca_port_group->ports, 0, 2512 (void *)&p_hca_port); 2513 port_num_on_switch = p_hca_port->remote_port_num; 2514 p_sw->p_osm_sw->new_lft[cl_ntoh16(hca_lid)] = port_num_on_switch; 2515 2516 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2517 "Switch %s: set path to non-CN HCA LID %u through port %u\n", 2518 __osm_ftree_tuple_to_str(p_sw->tuple), 2519 cl_ntoh16(hca_lid), port_num_on_switch); 2520 2521 /* set local min hop table(LID) to route to the CA */ 2522 __osm_ftree_sw_set_hops(p_sw, cl_ntoh16(hca_lid), 2523 port_num_on_switch, /* port num */ 2524 1); /* hops */ 2525 2526 /* Assign downgoing ports by stepping up. 2527 We're routing REAL targets. They are not CNs and not included 2528 in the leafs array, but we treat them as MAIN path to allow load 2529 leveling, which means that the counters will be updated. */ 2530 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */ 2531 NULL, /* prev. position switch */ 2532 hca_lid, /* LID that we're routing to */ 2533 p_sw->rank + 1, /* rank of the LID that we're routing to */ 2534 TRUE, /* whether this HCA LID is real or dummy */ 2535 TRUE); /* whether this path to HCA should by tracked by counters */ 2536 } 2537 /* done with all the port groups of this HCA - go to next HCA */ 2538 } 2539 2540 OSM_LOG_EXIT(&p_ftree->p_osm->log); 2541} /* __osm_ftree_fabric_route_to_non_cns() */ 2542 2543/***************************************************/ 2544 2545/* 2546 * Pseudo code: 2547 * foreach switch in fabric 2548 * obtain its LID 2549 * set local LFT(LID) to port 0 2550 * call assign-down-going-port-by-ascending-up(TRUE,FALSE) on CURRENT switch 2551 * 2552 * Routing to switch is similar to routing a REAL hca lid on SECONDARY path: 2553 * - we should set fwd tables 2554 * - we should NOT update port counters 2555 */ 2556 2557static void __osm_ftree_fabric_route_to_switches(IN ftree_fabric_t * p_ftree) 2558{ 2559 ftree_sw_t *p_sw; 2560 ftree_sw_t *p_next_sw; 2561 2562 OSM_LOG_ENTER(&p_ftree->p_osm->log); 2563 2564 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 2565 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 2566 p_sw = p_next_sw; 2567 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 2568 2569 /* set local LFT(LID) to 0 (route to itself) */ 2570 p_sw->p_osm_sw->new_lft[cl_ntoh16(p_sw->base_lid)] = 0; 2571 2572 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2573 "Switch %s (LID %u): routing switch-to-switch paths\n", 2574 __osm_ftree_tuple_to_str(p_sw->tuple), 2575 cl_ntoh16(p_sw->base_lid)); 2576 2577 /* set min hop table of the switch to itself */ 2578 __osm_ftree_sw_set_hops(p_sw, cl_ntoh16(p_sw->base_lid), 2579 0, /* port_num */ 2580 0); /* hops */ 2581 2582 __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */ 2583 NULL, /* prev. position switch */ 2584 p_sw->base_lid, /* LID that we're routing to */ 2585 p_sw->rank, /* rank of the LID that we're routing to */ 2586 TRUE, /* whether the target LID is a real or dummy */ 2587 FALSE); /* whether this path should by tracked by counters */ 2588 } 2589 2590 OSM_LOG_EXIT(&p_ftree->p_osm->log); 2591} /* __osm_ftree_fabric_route_to_switches() */ 2592 2593/*************************************************** 2594 ***************************************************/ 2595 2596static int __osm_ftree_fabric_populate_nodes(IN ftree_fabric_t * p_ftree) 2597{ 2598 osm_node_t *p_osm_node; 2599 osm_node_t *p_next_osm_node; 2600 2601 OSM_LOG_ENTER(&p_ftree->p_osm->log); 2602 2603 p_next_osm_node = 2604 (osm_node_t *) cl_qmap_head(&p_ftree->p_osm->subn.node_guid_tbl); 2605 while (p_next_osm_node != 2606 (osm_node_t *) cl_qmap_end(&p_ftree->p_osm->subn. 2607 node_guid_tbl)) { 2608 p_osm_node = p_next_osm_node; 2609 p_next_osm_node = 2610 (osm_node_t *) cl_qmap_next(&p_osm_node->map_item); 2611 switch (osm_node_get_type(p_osm_node)) { 2612 case IB_NODE_TYPE_CA: 2613 __osm_ftree_fabric_add_hca(p_ftree, p_osm_node); 2614 break; 2615 case IB_NODE_TYPE_ROUTER: 2616 break; 2617 case IB_NODE_TYPE_SWITCH: 2618 __osm_ftree_fabric_add_sw(p_ftree, p_osm_node->sw); 2619 break; 2620 default: 2621 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB0E: " 2622 "Node GUID 0x%016" PRIx64 2623 " - Unknown node type: %s\n", 2624 cl_ntoh64(osm_node_get_node_guid(p_osm_node)), 2625 ib_get_node_type_str(osm_node_get_type 2626 (p_osm_node))); 2627 OSM_LOG_EXIT(&p_ftree->p_osm->log); 2628 return -1; 2629 } 2630 } 2631 2632 OSM_LOG_EXIT(&p_ftree->p_osm->log); 2633 return 0; 2634} /* __osm_ftree_fabric_populate_nodes() */ 2635 2636/*************************************************** 2637 ***************************************************/ 2638 2639static boolean_t __osm_ftree_sw_update_rank(IN ftree_sw_t * p_sw, 2640 IN uint32_t new_rank) 2641{ 2642 if (__osm_ftree_sw_ranked(p_sw) && p_sw->rank <= new_rank) 2643 return FALSE; 2644 p_sw->rank = new_rank; 2645 return TRUE; 2646 2647} 2648 2649/***************************************************/ 2650 2651static void 2652__osm_ftree_rank_switches_from_leafs(IN ftree_fabric_t * p_ftree, 2653 IN cl_list_t * p_ranking_bfs_list) 2654{ 2655 ftree_sw_t *p_sw; 2656 ftree_sw_t *p_remote_sw; 2657 osm_node_t *p_node; 2658 osm_node_t *p_remote_node; 2659 osm_physp_t *p_osm_port; 2660 uint8_t i; 2661 unsigned max_rank = 0; 2662 2663 while (!cl_is_list_empty(p_ranking_bfs_list)) { 2664 p_sw = (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list); 2665 p_node = p_sw->p_osm_sw->p_node; 2666 2667 /* note: skipping port 0 on switches */ 2668 for (i = 1; i < osm_node_get_num_physp(p_node); i++) { 2669 p_osm_port = osm_node_get_physp_ptr(p_node, i); 2670 if (!p_osm_port || !osm_link_is_healthy(p_osm_port)) 2671 continue; 2672 2673 p_remote_node = 2674 osm_node_get_remote_node(p_node, i, NULL); 2675 if (!p_remote_node) 2676 continue; 2677 if (osm_node_get_type(p_remote_node) != 2678 IB_NODE_TYPE_SWITCH) 2679 continue; 2680 2681 p_remote_sw = __osm_ftree_fabric_get_sw_by_guid(p_ftree, 2682 osm_node_get_node_guid 2683 (p_remote_node)); 2684 if (!p_remote_sw) { 2685 /* remote node is not a switch */ 2686 continue; 2687 } 2688 2689 /* if needed, rank the remote switch and add it to the BFS list */ 2690 if (__osm_ftree_sw_update_rank 2691 (p_remote_sw, p_sw->rank + 1)) { 2692 max_rank = p_remote_sw->rank; 2693 cl_list_insert_tail(p_ranking_bfs_list, 2694 p_remote_sw); 2695 } 2696 } 2697 } 2698 2699 /* set FatTree maximal switch rank */ 2700 p_ftree->max_switch_rank = max_rank; 2701 2702} /* __osm_ftree_rank_switches_from_leafs() */ 2703 2704/***************************************************/ 2705 2706static int 2707__osm_ftree_rank_leaf_switches(IN ftree_fabric_t * p_ftree, 2708 IN ftree_hca_t * p_hca, 2709 IN cl_list_t * p_ranking_bfs_list) 2710{ 2711 ftree_sw_t *p_sw; 2712 osm_node_t *p_osm_node = p_hca->p_osm_node; 2713 osm_node_t *p_remote_osm_node; 2714 osm_physp_t *p_osm_port; 2715 static uint8_t i = 0; 2716 int res = 0; 2717 2718 OSM_LOG_ENTER(&p_ftree->p_osm->log); 2719 2720 for (i = 0; i < osm_node_get_num_physp(p_osm_node); i++) { 2721 p_osm_port = osm_node_get_physp_ptr(p_osm_node, i); 2722 if (!p_osm_port || !osm_link_is_healthy(p_osm_port)) 2723 continue; 2724 2725 p_remote_osm_node = 2726 osm_node_get_remote_node(p_osm_node, i, NULL); 2727 if (!p_remote_osm_node) 2728 continue; 2729 2730 switch (osm_node_get_type(p_remote_osm_node)) { 2731 case IB_NODE_TYPE_CA: 2732 /* HCA connected directly to another HCA - not FatTree */ 2733 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB0F: " 2734 "CA conected directly to another CA: " 2735 "0x%016" PRIx64 " <---> 0x%016" PRIx64 "\n", 2736 __osm_ftree_hca_get_guid_ho(p_hca), 2737 cl_ntoh64(osm_node_get_node_guid 2738 (p_remote_osm_node))); 2739 res = -1; 2740 goto Exit; 2741 2742 case IB_NODE_TYPE_ROUTER: 2743 /* leaving this port - proceeding to the next one */ 2744 continue; 2745 2746 case IB_NODE_TYPE_SWITCH: 2747 /* continue with this port */ 2748 break; 2749 2750 default: 2751 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 2752 "ERR AB10: Node GUID 0x%016" PRIx64 2753 " - Unknown node type: %s\n", 2754 cl_ntoh64(osm_node_get_node_guid 2755 (p_remote_osm_node)), 2756 ib_get_node_type_str(osm_node_get_type 2757 (p_remote_osm_node))); 2758 res = -1; 2759 goto Exit; 2760 } 2761 2762 /* remote node is switch */ 2763 2764 p_sw = __osm_ftree_fabric_get_sw_by_guid(p_ftree, 2765 osm_node_get_node_guid 2766 (p_osm_port-> 2767 p_remote_physp-> 2768 p_node)); 2769 CL_ASSERT(p_sw); 2770 2771 /* if needed, rank the remote switch and add it to the BFS list */ 2772 2773 if (!__osm_ftree_sw_update_rank(p_sw, 0)) 2774 continue; 2775 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2776 "Marking rank of switch that is directly connected to CA:\n" 2777 " - CA guid : 0x%016" 2778 PRIx64 "\n" 2779 " - Switch guid: 0x%016" 2780 PRIx64 "\n" 2781 " - Switch LID : %u\n", 2782 __osm_ftree_hca_get_guid_ho(p_hca), 2783 __osm_ftree_sw_get_guid_ho(p_sw), 2784 cl_ntoh16(p_sw->base_lid)); 2785 cl_list_insert_tail(p_ranking_bfs_list, p_sw); 2786 } 2787 2788Exit: 2789 OSM_LOG_EXIT(&p_ftree->p_osm->log); 2790 return res; 2791} /* __osm_ftree_rank_leaf_switches() */ 2792 2793/***************************************************/ 2794 2795static void __osm_ftree_sw_reverse_rank(IN cl_map_item_t * const p_map_item, 2796 IN void *context) 2797{ 2798 ftree_fabric_t *p_ftree = (ftree_fabric_t *) context; 2799 ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item; 2800 p_sw->rank = p_ftree->max_switch_rank - p_sw->rank; 2801} 2802 2803/*************************************************** 2804 ***************************************************/ 2805 2806static int 2807__osm_ftree_fabric_construct_hca_ports(IN ftree_fabric_t * p_ftree, 2808 IN ftree_hca_t * p_hca) 2809{ 2810 ftree_sw_t *p_remote_sw; 2811 osm_node_t *p_node = p_hca->p_osm_node; 2812 osm_node_t *p_remote_node; 2813 uint8_t remote_node_type; 2814 ib_net64_t remote_node_guid; 2815 osm_physp_t *p_remote_osm_port; 2816 uint8_t i; 2817 uint8_t remote_port_num; 2818 boolean_t is_cn = FALSE; 2819 int res = 0; 2820 2821 for (i = 0; i < osm_node_get_num_physp(p_node); i++) { 2822 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i); 2823 if (!p_osm_port || !osm_link_is_healthy(p_osm_port)) 2824 continue; 2825 2826 p_remote_osm_port = osm_physp_get_remote(p_osm_port); 2827 p_remote_node = 2828 osm_node_get_remote_node(p_node, i, &remote_port_num); 2829 2830 if (!p_remote_osm_port) 2831 continue; 2832 2833 remote_node_type = osm_node_get_type(p_remote_node); 2834 remote_node_guid = osm_node_get_node_guid(p_remote_node); 2835 2836 switch (remote_node_type) { 2837 case IB_NODE_TYPE_ROUTER: 2838 /* leaving this port - proceeding to the next one */ 2839 continue; 2840 2841 case IB_NODE_TYPE_CA: 2842 /* HCA connected directly to another HCA - not FatTree */ 2843 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB11: " 2844 "CA conected directly to another CA: " 2845 "0x%016" PRIx64 " <---> 0x%016" PRIx64 "\n", 2846 cl_ntoh64(osm_node_get_node_guid(p_node)), 2847 cl_ntoh64(remote_node_guid)); 2848 res = -1; 2849 goto Exit; 2850 2851 case IB_NODE_TYPE_SWITCH: 2852 /* continue with this port */ 2853 break; 2854 2855 default: 2856 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 2857 "ERR AB12: Node GUID 0x%016" PRIx64 2858 " - Unknown node type: %s\n", 2859 cl_ntoh64(remote_node_guid), 2860 ib_get_node_type_str(remote_node_type)); 2861 res = -1; 2862 goto Exit; 2863 } 2864 2865 /* remote node is switch */ 2866 2867 p_remote_sw = 2868 __osm_ftree_fabric_get_sw_by_guid(p_ftree, 2869 remote_node_guid); 2870 CL_ASSERT(p_remote_sw); 2871 2872 /* If CN file is not supplied, then all the CAs considered as Compute Nodes. 2873 Otherwise all the CAs are not CNs, and only guids that are present in the 2874 CN file will be marked as compute nodes. */ 2875 if (!__osm_ftree_fabric_cns_provided(p_ftree)) { 2876 is_cn = TRUE; 2877 } else { 2878 name_map_item_t *p_elem = 2879 (name_map_item_t *) cl_qmap_get(&p_ftree-> 2880 cn_guid_tbl, 2881 cl_ntoh64(osm_physp_get_port_guid 2882 (p_osm_port))); 2883 if (p_elem != 2884 (name_map_item_t *) cl_qmap_end(&p_ftree-> 2885 cn_guid_tbl)) 2886 is_cn = TRUE; 2887 } 2888 2889 if (is_cn) { 2890 p_ftree->cn_num++; 2891 p_hca->cn_num++; 2892 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2893 "Marking CN port GUID 0x%016" PRIx64 "\n", 2894 cl_ntoh64(osm_physp_get_port_guid(p_osm_port))); 2895 } else { 2896 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2897 "Marking non-CN port GUID 0x%016" PRIx64 "\n", 2898 cl_ntoh64(osm_physp_get_port_guid(p_osm_port))); 2899 } 2900 2901 __osm_ftree_hca_add_port(p_hca, /* local ftree_hca object */ 2902 i, /* local port number */ 2903 remote_port_num, /* remote port number */ 2904 osm_node_get_base_lid(p_node, i), /* local lid */ 2905 osm_node_get_base_lid(p_remote_node, 0), /* remote lid */ 2906 osm_physp_get_port_guid(p_osm_port), /* local port guid */ 2907 osm_physp_get_port_guid(p_remote_osm_port), /* remote port guid */ 2908 remote_node_guid, /* remote node guid */ 2909 remote_node_type, /* remote node type */ 2910 (void *)p_remote_sw, /* remote ftree_hca/sw object */ 2911 is_cn); /* whether this port is compute node */ 2912 } 2913 2914Exit: 2915 return res; 2916} /* __osm_ftree_fabric_construct_hca_ports() */ 2917 2918/*************************************************** 2919 ***************************************************/ 2920static boolean_t __osm_invalid_link_rank_diff(const uint32_t val) 2921{ 2922 return (val != 1U && val != -1U); 2923} 2924 2925static int __osm_ftree_fabric_construct_sw_ports(IN ftree_fabric_t * p_ftree, 2926 IN ftree_sw_t * p_sw) 2927{ 2928 ftree_hca_t *p_remote_hca; 2929 ftree_sw_t *p_remote_sw; 2930 osm_node_t *p_node = p_sw->p_osm_sw->p_node; 2931 osm_node_t *p_remote_node; 2932 ib_net16_t remote_base_lid; 2933 uint8_t remote_node_type; 2934 ib_net64_t remote_node_guid; 2935 osm_physp_t *p_remote_osm_port; 2936 ftree_direction_t direction; 2937 void *p_remote_hca_or_sw; 2938 uint8_t i; 2939 uint8_t remote_port_num; 2940 int res = 0; 2941 2942 CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH); 2943 2944 for (i = 1; i < osm_node_get_num_physp(p_node); i++) { 2945 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i); 2946 if (!p_osm_port || !osm_link_is_healthy(p_osm_port)) 2947 continue; 2948 2949 p_remote_osm_port = osm_physp_get_remote(p_osm_port); 2950 if (!p_remote_osm_port) 2951 continue; 2952 2953 p_remote_node = 2954 osm_node_get_remote_node(p_node, i, &remote_port_num); 2955 2956 /* ignore any loopback connection on switch */ 2957 if (p_node == p_remote_node) { 2958 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2959 "Ignoring loopback on switch GUID 0x%016" PRIx64 2960 ", LID %u, rank %u\n", 2961 __osm_ftree_sw_get_guid_ho(p_sw), 2962 cl_ntoh16(p_sw->base_lid), 2963 p_sw->rank); 2964 continue; 2965 } 2966 2967 remote_node_type = osm_node_get_type(p_remote_node); 2968 remote_node_guid = osm_node_get_node_guid(p_remote_node); 2969 2970 switch (remote_node_type) { 2971 case IB_NODE_TYPE_ROUTER: 2972 /* leaving this port - proceeding to the next one */ 2973 continue; 2974 2975 case IB_NODE_TYPE_CA: 2976 /* switch connected to hca */ 2977 2978 p_remote_hca = 2979 __osm_ftree_fabric_get_hca_by_guid(p_ftree, 2980 remote_node_guid); 2981 CL_ASSERT(p_remote_hca); 2982 2983 p_remote_hca_or_sw = (void *)p_remote_hca; 2984 direction = FTREE_DIRECTION_DOWN; 2985 2986 remote_base_lid = 2987 osm_physp_get_base_lid(p_remote_osm_port); 2988 break; 2989 2990 case IB_NODE_TYPE_SWITCH: 2991 /* switch connected to another switch */ 2992 2993 p_remote_sw = 2994 __osm_ftree_fabric_get_sw_by_guid(p_ftree, 2995 remote_node_guid); 2996 CL_ASSERT(p_remote_sw); 2997 2998 p_remote_hca_or_sw = (void *)p_remote_sw; 2999 3000 if (__osm_invalid_link_rank_diff(p_sw->rank - p_remote_sw->rank)) { 3001 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3002 "ERR AB16: " 3003 "Illegal link between switches with ranks %u and %u:\n" 3004 " GUID 0x%016" PRIx64 3005 ", LID %u, rank %u\n" 3006 " GUID 0x%016" PRIx64 3007 ", LID %u, rank %u\n", p_sw->rank, 3008 p_remote_sw->rank, 3009 __osm_ftree_sw_get_guid_ho(p_sw), 3010 cl_ntoh16(p_sw->base_lid), p_sw->rank, 3011 __osm_ftree_sw_get_guid_ho(p_remote_sw), 3012 cl_ntoh16(p_remote_sw->base_lid), 3013 p_remote_sw->rank); 3014 res = -1; 3015 goto Exit; 3016 } 3017 3018 if (p_sw->rank > p_remote_sw->rank) 3019 direction = FTREE_DIRECTION_UP; 3020 else 3021 direction = FTREE_DIRECTION_DOWN; 3022 3023 /* switch LID is only in port 0 port_info structure */ 3024 remote_base_lid = 3025 osm_node_get_base_lid(p_remote_node, 0); 3026 3027 break; 3028 3029 default: 3030 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3031 "ERR AB13: Node GUID 0x%016" PRIx64 3032 " - Unknown node type: %s\n", 3033 cl_ntoh64(remote_node_guid), 3034 ib_get_node_type_str(remote_node_type)); 3035 res = -1; 3036 goto Exit; 3037 } 3038 __osm_ftree_sw_add_port(p_sw, /* local ftree_sw object */ 3039 i, /* local port number */ 3040 remote_port_num, /* remote port number */ 3041 p_sw->base_lid, /* local lid */ 3042 remote_base_lid, /* remote lid */ 3043 osm_physp_get_port_guid(p_osm_port), /* local port guid */ 3044 osm_physp_get_port_guid(p_remote_osm_port), /* remote port guid */ 3045 remote_node_guid, /* remote node guid */ 3046 remote_node_type, /* remote node type */ 3047 p_remote_hca_or_sw, /* remote ftree_hca/sw object */ 3048 direction); /* port direction (up or down) */ 3049 3050 /* Track the max lid (in host order) that exists in the fabric */ 3051 if (cl_ntoh16(remote_base_lid) > p_ftree->lft_max_lid_ho) 3052 p_ftree->lft_max_lid_ho = cl_ntoh16(remote_base_lid); 3053 } 3054 3055Exit: 3056 return res; 3057} /* __osm_ftree_fabric_construct_sw_ports() */ 3058 3059/*************************************************** 3060 ***************************************************/ 3061 3062static int __osm_ftree_fabric_rank_from_roots(IN ftree_fabric_t * p_ftree) 3063{ 3064 osm_node_t *p_osm_node; 3065 osm_node_t *p_remote_osm_node; 3066 osm_physp_t *p_osm_physp; 3067 ftree_sw_t *p_sw; 3068 ftree_sw_t *p_remote_sw; 3069 cl_list_t ranking_bfs_list; 3070 struct guid_list_item *item; 3071 int res = 0; 3072 unsigned num_roots; 3073 unsigned max_rank = 0; 3074 unsigned i; 3075 3076 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3077 cl_list_init(&ranking_bfs_list, 10); 3078 3079 /* Rank all the roots and add them to list */ 3080 for (item = (void *)cl_qlist_head(&p_ftree->root_guid_list); 3081 item != (void *)cl_qlist_end(&p_ftree->root_guid_list); 3082 item = (void *)cl_qlist_next(&item->list)) { 3083 p_sw = 3084 __osm_ftree_fabric_get_sw_by_guid(p_ftree, 3085 cl_hton64(item->guid)); 3086 if (!p_sw) { 3087 /* the specified root guid wasn't found in the fabric */ 3088 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB24: " 3089 "Root switch GUID 0x%" PRIx64 " not found\n", 3090 item->guid); 3091 continue; 3092 } 3093 3094 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3095 "Ranking root switch with GUID 0x%" PRIx64 "\n", 3096 item->guid); 3097 p_sw->rank = 0; 3098 cl_list_insert_tail(&ranking_bfs_list, p_sw); 3099 } 3100 3101 num_roots = cl_list_count(&ranking_bfs_list); 3102 if (!num_roots) { 3103 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB25: " 3104 "No valid roots supplied\n"); 3105 res = -1; 3106 goto Exit; 3107 } 3108 3109 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3110 "Ranked %u valid root switches\n", num_roots); 3111 3112 /* Now the list has all the roots. 3113 BFS the subnet and update rank on all the switches. */ 3114 3115 while (!cl_is_list_empty(&ranking_bfs_list)) { 3116 p_sw = (ftree_sw_t *) cl_list_remove_head(&ranking_bfs_list); 3117 p_osm_node = p_sw->p_osm_sw->p_node; 3118 3119 /* note: skipping port 0 on switches */ 3120 for (i = 1; i < osm_node_get_num_physp(p_osm_node); i++) { 3121 p_osm_physp = osm_node_get_physp_ptr(p_osm_node, i); 3122 if (!p_osm_physp || !osm_link_is_healthy(p_osm_physp)) 3123 continue; 3124 3125 p_remote_osm_node = 3126 osm_node_get_remote_node(p_osm_node, i, NULL); 3127 if (!p_remote_osm_node) 3128 continue; 3129 3130 if (osm_node_get_type(p_remote_osm_node) != 3131 IB_NODE_TYPE_SWITCH) 3132 continue; 3133 3134 p_remote_sw = __osm_ftree_fabric_get_sw_by_guid(p_ftree, 3135 osm_node_get_node_guid 3136 (p_remote_osm_node)); 3137 CL_ASSERT(p_remote_sw); 3138 3139 /* if needed, rank the remote switch and add it to the BFS list */ 3140 if (__osm_ftree_sw_update_rank 3141 (p_remote_sw, p_sw->rank + 1)) { 3142 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3143 "Ranking switch 0x%" PRIx64 3144 " with rank %u\n", 3145 __osm_ftree_sw_get_guid_ho(p_remote_sw), 3146 p_remote_sw->rank); 3147 max_rank = p_remote_sw->rank; 3148 cl_list_insert_tail(&ranking_bfs_list, 3149 p_remote_sw); 3150 } 3151 } 3152 /* done with ports of this switch - go to the next switch in the list */ 3153 } 3154 3155 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3156 "Subnet ranking completed. Max Node Rank = %u\n", max_rank); 3157 3158 /* set FatTree maximal switch rank */ 3159 p_ftree->max_switch_rank = max_rank; 3160 3161Exit: 3162 cl_list_destroy(&ranking_bfs_list); 3163 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3164 return res; 3165} /* __osm_ftree_fabric_rank_from_roots() */ 3166 3167/*************************************************** 3168 ***************************************************/ 3169 3170static int __osm_ftree_fabric_rank_from_hcas(IN ftree_fabric_t * p_ftree) 3171{ 3172 ftree_hca_t *p_hca; 3173 ftree_hca_t *p_next_hca; 3174 cl_list_t ranking_bfs_list; 3175 int res = 0; 3176 3177 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3178 3179 cl_list_init(&ranking_bfs_list, 10); 3180 3181 /* Mark REVERSED rank of all the switches in the subnet. 3182 Start from switches that are connected to hca's, and 3183 scan all the switches in the subnet. */ 3184 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 3185 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) { 3186 p_hca = p_next_hca; 3187 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item); 3188 if (__osm_ftree_rank_leaf_switches 3189 (p_ftree, p_hca, &ranking_bfs_list) != 0) { 3190 res = -1; 3191 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB14: " 3192 "Subnet ranking failed - subnet is not FatTree"); 3193 goto Exit; 3194 } 3195 } 3196 3197 /* Now rank rest of the switches in the fabric, while the 3198 list already contains all the ranked leaf switches */ 3199 __osm_ftree_rank_switches_from_leafs(p_ftree, &ranking_bfs_list); 3200 3201 /* fix ranking of the switches by reversing the ranking direction */ 3202 cl_qmap_apply_func(&p_ftree->sw_tbl, __osm_ftree_sw_reverse_rank, 3203 (void *)p_ftree); 3204 3205Exit: 3206 cl_list_destroy(&ranking_bfs_list); 3207 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3208 return res; 3209} /* __osm_ftree_fabric_rank_from_hcas() */ 3210 3211/*************************************************** 3212 ***************************************************/ 3213 3214static int __osm_ftree_fabric_rank(IN ftree_fabric_t * p_ftree) 3215{ 3216 int res = 0; 3217 3218 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3219 3220 if (__osm_ftree_fabric_roots_provided(p_ftree)) 3221 res = __osm_ftree_fabric_rank_from_roots(p_ftree); 3222 else 3223 res = __osm_ftree_fabric_rank_from_hcas(p_ftree); 3224 3225 if (res) 3226 goto Exit; 3227 3228 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 3229 "FatTree max switch rank is %u\n", p_ftree->max_switch_rank); 3230 3231Exit: 3232 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3233 return res; 3234} /* __osm_ftree_fabric_rank() */ 3235 3236/*************************************************** 3237 ***************************************************/ 3238 3239static void __osm_ftree_fabric_set_leaf_rank(IN ftree_fabric_t * p_ftree) 3240{ 3241 unsigned i; 3242 ftree_sw_t *p_sw; 3243 ftree_hca_t *p_hca = NULL; 3244 ftree_hca_t *p_next_hca; 3245 3246 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3247 3248 if (!__osm_ftree_fabric_roots_provided(p_ftree)) { 3249 /* If root file is not provided, the fabric has to be pure fat-tree 3250 in terms of ranking. Thus, leaf switches rank is the max rank. */ 3251 p_ftree->leaf_switch_rank = p_ftree->max_switch_rank; 3252 } else { 3253 /* Find the first CN and set the leaf_switch_rank to the rank 3254 of the switch that is connected to this CN. Later we will 3255 ensure that all the leaf switches have the same rank. */ 3256 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 3257 while (p_next_hca != 3258 (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) { 3259 p_hca = p_next_hca; 3260 if (p_hca->cn_num) 3261 break; 3262 p_next_hca = 3263 (ftree_hca_t *) cl_qmap_next(&p_hca->map_item); 3264 } 3265 /* we know that there are CNs in the fabric, so just to be sure... */ 3266 CL_ASSERT(p_next_hca != 3267 (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)); 3268 3269 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3270 "Selected CN port GUID 0x%" PRIx64 "\n", 3271 __osm_ftree_hca_get_guid_ho(p_hca)); 3272 3273 for (i = 0; (i < p_hca->up_port_groups_num) 3274 && (!p_hca->up_port_groups[i]->is_cn); i++) ; 3275 CL_ASSERT(i < p_hca->up_port_groups_num); 3276 CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type == 3277 IB_NODE_TYPE_SWITCH); 3278 3279 p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw; 3280 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3281 "Selected leaf switch GUID 0x%" PRIx64 ", rank %u\n", 3282 __osm_ftree_sw_get_guid_ho(p_sw), p_sw->rank); 3283 p_ftree->leaf_switch_rank = p_sw->rank; 3284 } 3285 3286 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 3287 "FatTree leaf switch rank is %u\n", p_ftree->leaf_switch_rank); 3288 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3289} /* __osm_ftree_fabric_set_leaf_rank() */ 3290 3291/*************************************************** 3292 ***************************************************/ 3293 3294static int __osm_ftree_fabric_populate_ports(IN ftree_fabric_t * p_ftree) 3295{ 3296 ftree_hca_t *p_hca; 3297 ftree_hca_t *p_next_hca; 3298 ftree_sw_t *p_sw; 3299 ftree_sw_t *p_next_sw; 3300 int res = 0; 3301 3302 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3303 3304 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 3305 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) { 3306 p_hca = p_next_hca; 3307 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item); 3308 if (__osm_ftree_fabric_construct_hca_ports(p_ftree, p_hca) != 0) { 3309 res = -1; 3310 goto Exit; 3311 } 3312 } 3313 3314 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 3315 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 3316 p_sw = p_next_sw; 3317 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 3318 if (__osm_ftree_fabric_construct_sw_ports(p_ftree, p_sw) != 0) { 3319 res = -1; 3320 goto Exit; 3321 } 3322 } 3323Exit: 3324 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3325 return res; 3326} /* __osm_ftree_fabric_populate_ports() */ 3327 3328/*************************************************** 3329 ***************************************************/ 3330static int add_guid_item_to_list(void *cxt, uint64_t guid, char *p) 3331{ 3332 cl_qlist_t *list = cxt; 3333 struct guid_list_item *item; 3334 3335 item = malloc(sizeof(*item)); 3336 if (!item) 3337 return -1; 3338 3339 item->guid = guid; 3340 cl_qlist_insert_tail(list, &item->list); 3341 3342 return 0; 3343} 3344 3345static int add_guid_item_to_map(void *cxt, uint64_t guid, char *p) 3346{ 3347 cl_qmap_t *map = cxt; 3348 name_map_item_t *item; 3349 3350 item = malloc(sizeof(*item)); 3351 if (!item) 3352 return -1; 3353 3354 item->guid = guid; 3355 cl_qmap_insert(map, guid, &item->item); 3356 3357 return 0; 3358} 3359 3360static int __osm_ftree_fabric_read_guid_files(IN ftree_fabric_t * p_ftree) 3361{ 3362 int status = 0; 3363 3364 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3365 3366 if (__osm_ftree_fabric_roots_provided(p_ftree)) { 3367 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3368 "Fetching root nodes from file %s\n", 3369 p_ftree->p_osm->subn.opt.root_guid_file); 3370 3371 if (parse_node_map(p_ftree->p_osm->subn.opt.root_guid_file, 3372 add_guid_item_to_list, 3373 &p_ftree->root_guid_list)) { 3374 status = -1; 3375 goto Exit; 3376 } 3377 3378 if (!cl_qlist_count(&p_ftree->root_guid_list)) { 3379 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB22: " 3380 "Root guids file has no valid guids\n"); 3381 status = -1; 3382 goto Exit; 3383 } 3384 } 3385 3386 if (__osm_ftree_fabric_cns_provided(p_ftree)) { 3387 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3388 "Fetching compute nodes from file %s\n", 3389 p_ftree->p_osm->subn.opt.cn_guid_file); 3390 3391 if (parse_node_map(p_ftree->p_osm->subn.opt.cn_guid_file, 3392 add_guid_item_to_map, 3393 &p_ftree->cn_guid_tbl)) { 3394 status = -1; 3395 goto Exit; 3396 } 3397 3398 if (!cl_qmap_count(&p_ftree->cn_guid_tbl)) { 3399 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB23: " 3400 "Compute node guids file has no valid guids\n"); 3401 status = -1; 3402 goto Exit; 3403 } 3404 } 3405 3406Exit: 3407 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3408 return status; 3409} /*__osm_ftree_fabric_read_guid_files() */ 3410 3411/*************************************************** 3412 ***************************************************/ 3413 3414static int __osm_ftree_construct_fabric(IN void *context) 3415{ 3416 ftree_fabric_t *p_ftree = context; 3417 int status = 0; 3418 3419 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3420 3421 __osm_ftree_fabric_clear(p_ftree); 3422 3423 if (p_ftree->p_osm->subn.opt.lmc > 0) { 3424 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3425 "LMC > 0 is not supported by fat-tree routing.\n" 3426 "Falling back to default routing\n"); 3427 status = -1; 3428 goto Exit; 3429 } 3430 3431 if (cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl) < 2) { 3432 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3433 "Fabric has %u switches - topology is not fat-tree.\n" 3434 "Falling back to default routing\n", 3435 cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)); 3436 status = -1; 3437 goto Exit; 3438 } 3439 3440 if ((cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl) - 3441 cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)) < 2) { 3442 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3443 "Fabric has %u nodes (%u switches) - topology is not fat-tree.\n" 3444 "Falling back to default routing\n", 3445 cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl), 3446 cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)); 3447 status = -1; 3448 goto Exit; 3449 } 3450 3451 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n" 3452 " |----------------------------------------|\n" 3453 " |- Starting FatTree fabric construction -|\n" 3454 " |----------------------------------------|\n\n"); 3455 3456 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3457 "Populating FatTree Switch and CA tables\n"); 3458 if (__osm_ftree_fabric_populate_nodes(p_ftree) != 0) { 3459 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3460 "Fabric topology is not fat-tree - " 3461 "falling back to default routing\n"); 3462 status = -1; 3463 goto Exit; 3464 } 3465 3466 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3467 "Reading guid files provided by user\n"); 3468 if (__osm_ftree_fabric_read_guid_files(p_ftree) != 0) { 3469 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3470 "Failed reading guid files - " 3471 "falling back to default routing\n"); 3472 status = -1; 3473 goto Exit; 3474 } 3475 3476 if (cl_qmap_count(&p_ftree->hca_tbl) < 2) { 3477 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3478 "Fabric has %u CAa - topology is not fat-tree.\n" 3479 "Falling back to default routing\n", 3480 cl_qmap_count(&p_ftree->hca_tbl)); 3481 status = -1; 3482 goto Exit; 3483 } 3484 3485 /* Rank all the switches in the fabric. 3486 After that we will know only fabric max switch rank. 3487 We will be able to check leaf switches rank and the 3488 whole tree rank after filling ports and marking CNs. */ 3489 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "Ranking FatTree\n"); 3490 if (__osm_ftree_fabric_rank(p_ftree) != 0) { 3491 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3492 "Failed ranking the tree\n"); 3493 status = -1; 3494 goto Exit; 3495 } 3496 3497 /* For each hca and switch, construct array of ports. 3498 This is done after the whole FatTree data structure is ready, 3499 because we want the ports to have pointers to ftree_{sw,hca}_t 3500 objects, and we need the switches to be already ranked because 3501 that's how the port direction is determined. */ 3502 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3503 "Populating CA & switch ports\n"); 3504 if (__osm_ftree_fabric_populate_ports(p_ftree) != 0) { 3505 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3506 "Fabric topology is not a fat-tree\n"); 3507 status = -1; 3508 goto Exit; 3509 } else if (p_ftree->cn_num == 0) { 3510 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3511 "Fabric has no valid compute nodes\n"); 3512 status = -1; 3513 goto Exit; 3514 } 3515 3516 /* Now that the CA ports have been created and CNs were marked, 3517 we can complete the fabric ranking - set leaf switches rank. */ 3518 __osm_ftree_fabric_set_leaf_rank(p_ftree); 3519 3520 if (__osm_ftree_fabric_get_rank(p_ftree) > FAT_TREE_MAX_RANK || 3521 __osm_ftree_fabric_get_rank(p_ftree) < FAT_TREE_MIN_RANK) { 3522 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3523 "Fabric rank is %u (should be between %u and %u)\n", 3524 __osm_ftree_fabric_get_rank(p_ftree), FAT_TREE_MIN_RANK, 3525 FAT_TREE_MAX_RANK); 3526 status = -1; 3527 goto Exit; 3528 } 3529 3530 /* Mark all the switches in the fabric with rank equal to 3531 p_ftree->leaf_switch_rank and that are also connected to CNs. 3532 As a by-product, this function also runs basic topology 3533 validation - it checks that all the CNs are at the same rank. */ 3534 if (__osm_ftree_fabric_mark_leaf_switches(p_ftree)) { 3535 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3536 "Fabric topology is not a fat-tree\n"); 3537 status = -1; 3538 goto Exit; 3539 } 3540 3541 /* Assign index to all the switches in the fabric. 3542 This function also sorts leaf switch array by the switch index, 3543 sorts all the port arrays of the indexed switches by remote 3544 switch index, and creates switch-by-tuple table (sw_by_tuple_tbl) */ 3545 __osm_ftree_fabric_make_indexing(p_ftree); 3546 3547 /* Create leaf switch array sorted by index. 3548 This array contains switches with rank equal to p_ftree->leaf_switch_rank 3549 and that are also connected to CNs (REAL leafs), and it may contain 3550 switches at the same leaf rank w/o CNs, if this is the order of indexing. 3551 In any case, the first and the last switches in the array are REAL leafs. */ 3552 if (__osm_ftree_fabric_create_leaf_switch_array(p_ftree)) { 3553 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3554 "Fabric topology is not a fat-tree\n"); 3555 status = -1; 3556 goto Exit; 3557 } 3558 3559 /* calculate and set ftree.max_cn_per_leaf field */ 3560 __osm_ftree_fabric_set_max_cn_per_leaf(p_ftree); 3561 3562 /* print general info about fabric topology */ 3563 __osm_ftree_fabric_dump_general_info(p_ftree); 3564 3565 /* dump full tree topology */ 3566 if (osm_log_is_active(&p_ftree->p_osm->log, OSM_LOG_DEBUG)) 3567 __osm_ftree_fabric_dump(p_ftree); 3568 3569 /* the fabric is required to be PURE fat-tree only if the root 3570 guid file hasn't been provided by user */ 3571 if (!__osm_ftree_fabric_roots_provided(p_ftree) && 3572 !__osm_ftree_fabric_validate_topology(p_ftree)) { 3573 osm_log(&p_ftree->p_osm->log, OSM_LOG_SYS, 3574 "Fabric topology is not a fat-tree\n"); 3575 status = -1; 3576 goto Exit; 3577 } 3578 3579 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3580 "Max LID in switch LFTs: %u\n", 3581 p_ftree->lft_max_lid_ho); 3582 3583Exit: 3584 if (status != 0) { 3585 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3586 "Clearing FatTree Fabric data structures\n"); 3587 __osm_ftree_fabric_clear(p_ftree); 3588 } else 3589 p_ftree->fabric_built = TRUE; 3590 3591 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n" 3592 " |--------------------------------------------------|\n" 3593 " |- Done constructing FatTree fabric (status = %d) -|\n" 3594 " |--------------------------------------------------|\n\n", 3595 status); 3596 3597 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3598 return status; 3599} /* __osm_ftree_construct_fabric() */ 3600 3601/*************************************************** 3602 ***************************************************/ 3603 3604static int __osm_ftree_do_routing(IN void *context) 3605{ 3606 ftree_fabric_t *p_ftree = context; 3607 int status = 0; 3608 3609 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3610 3611 if (!p_ftree->fabric_built) { 3612 status = -1; 3613 goto Exit; 3614 } 3615 3616 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3617 "Starting FatTree routing\n"); 3618 3619 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3620 "Filling switch forwarding tables for Compute Nodes\n"); 3621 __osm_ftree_fabric_route_to_cns(p_ftree); 3622 3623 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3624 "Filling switch forwarding tables for non-CN targets\n"); 3625 __osm_ftree_fabric_route_to_non_cns(p_ftree); 3626 3627 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3628 "Filling switch forwarding tables for switch-to-switch paths\n"); 3629 __osm_ftree_fabric_route_to_switches(p_ftree); 3630 3631 /* for each switch, set its fwd table */ 3632 cl_qmap_apply_func(&p_ftree->sw_tbl, __osm_ftree_set_sw_fwd_table, 3633 (void *)p_ftree); 3634 3635 /* write out hca ordering file */ 3636 __osm_ftree_fabric_dump_hca_ordering(p_ftree); 3637 3638 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3639 "FatTree routing is done\n"); 3640 3641Exit: 3642 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3643 return status; 3644} 3645 3646/*************************************************** 3647 ***************************************************/ 3648 3649static void __osm_ftree_delete(IN void *context) 3650{ 3651 if (!context) 3652 return; 3653 __osm_ftree_fabric_destroy((ftree_fabric_t *) context); 3654} 3655 3656/*************************************************** 3657 ***************************************************/ 3658 3659int osm_ucast_ftree_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm) 3660{ 3661 ftree_fabric_t *p_ftree = __osm_ftree_fabric_create(); 3662 if (!p_ftree) 3663 return -1; 3664 3665 p_ftree->p_osm = p_osm; 3666 3667 r->context = (void *)p_ftree; 3668 r->build_lid_matrices = __osm_ftree_construct_fabric; 3669 r->ucast_build_fwd_tables = __osm_ftree_do_routing; 3670 r->delete = __osm_ftree_delete; 3671 3672 return 0; 3673} 3674