1/* 2 * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved. 3 * Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. 5 * Copyright (c) 2008 Xsigo Systems Inc. All rights reserved. 6 * 7 * This software is available to you under a choice of one of two 8 * licenses. You may choose to be licensed under the terms of the GNU 9 * General Public License (GPL) Version 2, available from the file 10 * COPYING in the main directory of this source tree, or the 11 * OpenIB.org BSD license below: 12 * 13 * Redistribution and use in source and binary forms, with or 14 * without modification, are permitted provided that the following 15 * conditions are met: 16 * 17 * - Redistributions of source code must retain the above 18 * copyright notice, this list of conditions and the following 19 * disclaimer. 20 * 21 * - Redistributions in binary form must reproduce the above 22 * copyright notice, this list of conditions and the following 23 * disclaimer in the documentation and/or other materials 24 * provided with the distribution. 25 * 26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 27 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 28 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 29 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 30 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 31 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 32 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 33 * SOFTWARE. 34 * 35 */ 36 37/* 38 * Abstract: 39 * Implementation of osm_mcast_mgr_t. 40 * This file implements the Multicast Manager object. 41 */ 42 43#if HAVE_CONFIG_H 44# include <config.h> 45#endif /* HAVE_CONFIG_H */ 46 47#include <stdlib.h> 48#include <string.h> 49#include <iba/ib_types.h> 50#include <complib/cl_debug.h> 51#include <opensm/osm_opensm.h> 52#include <opensm/osm_sm.h> 53#include <opensm/osm_multicast.h> 54#include <opensm/osm_node.h> 55#include <opensm/osm_switch.h> 56#include <opensm/osm_helper.h> 57#include <opensm/osm_msgdef.h> 58 59/********************************************************************** 60 **********************************************************************/ 61typedef struct osm_mcast_work_obj { 62 cl_list_item_t list_item; 63 osm_port_t *p_port; 64} osm_mcast_work_obj_t; 65 66/********************************************************************** 67 **********************************************************************/ 68static osm_mcast_work_obj_t *__osm_mcast_work_obj_new(IN const osm_port_t * 69 const p_port) 70{ 71 /* 72 TO DO - get these objects from a lockpool. 73 */ 74 osm_mcast_work_obj_t *p_obj; 75 76 /* 77 clean allocated memory to avoid assertion when trying to insert to 78 qlist. 79 see cl_qlist_insert_tail(): CL_ASSERT(p_list_item->p_list != p_list) 80 */ 81 p_obj = malloc(sizeof(*p_obj)); 82 if (p_obj) { 83 memset(p_obj, 0, sizeof(*p_obj)); 84 p_obj->p_port = (osm_port_t *) p_port; 85 } 86 87 return (p_obj); 88} 89 90/********************************************************************** 91 **********************************************************************/ 92static void __osm_mcast_work_obj_delete(IN osm_mcast_work_obj_t * p_wobj) 93{ 94 free(p_wobj); 95} 96 97/********************************************************************** 98 Recursively remove nodes from the tree 99 *********************************************************************/ 100static void __osm_mcast_mgr_purge_tree_node(IN osm_mtree_node_t * p_mtn) 101{ 102 uint8_t i; 103 104 for (i = 0; i < p_mtn->max_children; i++) { 105 if (p_mtn->child_array[i] && 106 (p_mtn->child_array[i] != OSM_MTREE_LEAF)) 107 __osm_mcast_mgr_purge_tree_node(p_mtn->child_array[i]); 108 109 p_mtn->child_array[i] = NULL; 110 111 } 112 113 free(p_mtn); 114} 115 116/********************************************************************** 117 **********************************************************************/ 118static void 119__osm_mcast_mgr_purge_tree(osm_sm_t * sm, IN osm_mgrp_t * const p_mgrp) 120{ 121 OSM_LOG_ENTER(sm->p_log); 122 123 if (p_mgrp->p_root) 124 __osm_mcast_mgr_purge_tree_node(p_mgrp->p_root); 125 126 p_mgrp->p_root = NULL; 127 128 OSM_LOG_EXIT(sm->p_log); 129} 130 131/********************************************************************** 132 **********************************************************************/ 133static float 134osm_mcast_mgr_compute_avg_hops(osm_sm_t * sm, 135 const osm_mgrp_t * const p_mgrp, 136 const osm_switch_t * const p_sw) 137{ 138 float avg_hops = 0; 139 uint32_t hops = 0; 140 uint32_t num_ports = 0; 141 const osm_port_t *p_port; 142 const osm_mcm_port_t *p_mcm_port; 143 const cl_qmap_t *p_mcm_tbl; 144 145 OSM_LOG_ENTER(sm->p_log); 146 147 p_mcm_tbl = &p_mgrp->mcm_port_tbl; 148 149 /* 150 For each member of the multicast group, compute the 151 number of hops to its base LID. 152 */ 153 for (p_mcm_port = (osm_mcm_port_t *) cl_qmap_head(p_mcm_tbl); 154 p_mcm_port != (osm_mcm_port_t *) cl_qmap_end(p_mcm_tbl); 155 p_mcm_port = 156 (osm_mcm_port_t *) cl_qmap_next(&p_mcm_port->map_item)) { 157 /* 158 Acquire the port object for this port guid, then create 159 the new worker object to build the list. 160 */ 161 p_port = osm_get_port_by_guid(sm->p_subn, 162 ib_gid_get_guid(&p_mcm_port-> 163 port_gid)); 164 165 if (!p_port) { 166 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A18: " 167 "No port object for port 0x%016" PRIx64 "\n", 168 cl_ntoh64(ib_gid_get_guid 169 (&p_mcm_port->port_gid))); 170 continue; 171 } 172 173 hops += osm_switch_get_port_least_hops(p_sw, p_port); 174 num_ports++; 175 } 176 177 /* 178 We should be here if there aren't any ports in the group. 179 */ 180 CL_ASSERT(num_ports); 181 182 if (num_ports != 0) 183 avg_hops = (float)(hops / num_ports); 184 185 OSM_LOG_EXIT(sm->p_log); 186 return (avg_hops); 187} 188 189/********************************************************************** 190 Calculate the maximal "min hops" from the given switch to any 191 of the group HCAs 192 **********************************************************************/ 193static float 194osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, 195 const osm_mgrp_t * const p_mgrp, 196 const osm_switch_t * const p_sw) 197{ 198 uint32_t max_hops = 0; 199 uint32_t hops = 0; 200 const osm_port_t *p_port; 201 const osm_mcm_port_t *p_mcm_port; 202 const cl_qmap_t *p_mcm_tbl; 203 204 OSM_LOG_ENTER(sm->p_log); 205 206 p_mcm_tbl = &p_mgrp->mcm_port_tbl; 207 208 /* 209 For each member of the multicast group, compute the 210 number of hops to its base LID. 211 */ 212 for (p_mcm_port = (osm_mcm_port_t *) cl_qmap_head(p_mcm_tbl); 213 p_mcm_port != (osm_mcm_port_t *) cl_qmap_end(p_mcm_tbl); 214 p_mcm_port = 215 (osm_mcm_port_t *) cl_qmap_next(&p_mcm_port->map_item)) { 216 /* 217 Acquire the port object for this port guid, then create 218 the new worker object to build the list. 219 */ 220 p_port = osm_get_port_by_guid(sm->p_subn, 221 ib_gid_get_guid(&p_mcm_port-> 222 port_gid)); 223 224 if (!p_port) { 225 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A1A: " 226 "No port object for port 0x%016" PRIx64 "\n", 227 cl_ntoh64(ib_gid_get_guid 228 (&p_mcm_port->port_gid))); 229 continue; 230 } 231 232 hops = osm_switch_get_port_least_hops(p_sw, p_port); 233 if (hops > max_hops) 234 max_hops = hops; 235 } 236 237 if (max_hops == 0) { 238 /* 239 We should be here if there aren't any ports in the group. 240 */ 241 max_hops = 10001; /* see later - we use it to realize no hops */ 242 } 243 244 OSM_LOG_EXIT(sm->p_log); 245 return (float)(max_hops); 246} 247 248/********************************************************************** 249 This function attempts to locate the optimal switch for the 250 center of the spanning tree. The current algorithm chooses 251 a switch with the lowest average hop count to the members 252 of the multicast group. 253**********************************************************************/ 254static osm_switch_t *__osm_mcast_mgr_find_optimal_switch(osm_sm_t * sm, 255 const osm_mgrp_t * 256 const p_mgrp) 257{ 258 cl_qmap_t *p_sw_tbl; 259 const osm_switch_t *p_sw; 260 const osm_switch_t *p_best_sw = NULL; 261 float hops = 0; 262 float best_hops = 10000; /* any big # will do */ 263#ifdef OSM_VENDOR_INTF_ANAFA 264 boolean_t use_avg_hops = TRUE; /* anafa2 - bug hca on switch *//* use max hops for root */ 265#else 266 boolean_t use_avg_hops = FALSE; /* use max hops for root */ 267#endif 268 269 OSM_LOG_ENTER(sm->p_log); 270 271 p_sw_tbl = &sm->p_subn->sw_guid_tbl; 272 273 CL_ASSERT(!osm_mgrp_is_empty(p_mgrp)); 274 275 for (p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl); 276 p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl); 277 p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) { 278 if (!osm_switch_supports_mcast(p_sw)) 279 continue; 280 281 if (use_avg_hops) 282 hops = osm_mcast_mgr_compute_avg_hops(sm, p_mgrp, p_sw); 283 else 284 hops = osm_mcast_mgr_compute_max_hops(sm, p_mgrp, p_sw); 285 286 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 287 "Switch 0x%016" PRIx64 ", hops = %f\n", 288 cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)), hops); 289 290 if (hops < best_hops) { 291 p_best_sw = p_sw; 292 best_hops = hops; 293 } 294 } 295 296 if (p_best_sw) 297 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 298 "Best switch is 0x%" PRIx64 ", hops = %f\n", 299 cl_ntoh64(osm_node_get_node_guid(p_best_sw->p_node)), 300 best_hops); 301 else 302 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 303 "No multicast capable switches detected\n"); 304 305 OSM_LOG_EXIT(sm->p_log); 306 return ((osm_switch_t *) p_best_sw); 307} 308 309/********************************************************************** 310 This function returns the existing or optimal root swtich for the tree. 311**********************************************************************/ 312static osm_switch_t *__osm_mcast_mgr_find_root_switch(osm_sm_t * sm, 313 const osm_mgrp_t * 314 const p_mgrp) 315{ 316 const osm_switch_t *p_sw = NULL; 317 318 OSM_LOG_ENTER(sm->p_log); 319 320 /* 321 We always look for the best multicast tree root switch. 322 Otherwise since we always start with a a single join 323 the root will be always on the first switch attached to it. 324 - Very bad ... 325 */ 326 p_sw = __osm_mcast_mgr_find_optimal_switch(sm, p_mgrp); 327 328 OSM_LOG_EXIT(sm->p_log); 329 return ((osm_switch_t *) p_sw); 330} 331 332/********************************************************************** 333 **********************************************************************/ 334static osm_signal_t 335__osm_mcast_mgr_set_tbl(osm_sm_t * sm, IN osm_switch_t * const p_sw) 336{ 337 osm_node_t *p_node; 338 osm_dr_path_t *p_path; 339 osm_madw_context_t mad_context; 340 ib_api_status_t status; 341 uint32_t block_id_ho = 0; 342 int16_t block_num = 0; 343 uint32_t position = 0; 344 uint32_t max_position; 345 osm_mcast_tbl_t *p_tbl; 346 ib_net16_t block[IB_MCAST_BLOCK_SIZE]; 347 osm_signal_t signal = OSM_SIGNAL_DONE; 348 349 CL_ASSERT(sm); 350 351 OSM_LOG_ENTER(sm->p_log); 352 353 CL_ASSERT(p_sw); 354 355 p_node = p_sw->p_node; 356 357 CL_ASSERT(p_node); 358 359 p_path = osm_physp_get_dr_path_ptr(osm_node_get_physp_ptr(p_node, 0)); 360 361 /* 362 Send multicast forwarding table blocks to the switch 363 as long as the switch indicates it has blocks needing 364 configuration. 365 */ 366 367 mad_context.mft_context.node_guid = osm_node_get_node_guid(p_node); 368 mad_context.mft_context.set_method = TRUE; 369 370 p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw); 371 max_position = p_tbl->max_position; 372 373 while (osm_mcast_tbl_get_block(p_tbl, block_num, 374 (uint8_t) position, block)) { 375 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 376 "Writing MFT block 0x%X\n", block_id_ho); 377 378 block_id_ho = block_num + (position << 28); 379 380 status = osm_req_set(sm, p_path, (void *)block, sizeof(block), 381 IB_MAD_ATTR_MCAST_FWD_TBL, 382 cl_hton32(block_id_ho), 383 CL_DISP_MSGID_NONE, &mad_context); 384 385 if (status != IB_SUCCESS) { 386 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A02: " 387 "Sending multicast fwd. tbl. block failed (%s)\n", 388 ib_get_err_str(status)); 389 } 390 391 signal = OSM_SIGNAL_DONE_PENDING; 392 393 if (++position > max_position) { 394 position = 0; 395 block_num++; 396 } 397 } 398 399 OSM_LOG_EXIT(sm->p_log); 400 return (signal); 401} 402 403/********************************************************************** 404 This is part of the recursive function to compute the paths in the 405 spanning tree that eminate from this switch. On input, the p_list 406 contains the group members that must be routed from this switch. 407**********************************************************************/ 408static void 409__osm_mcast_mgr_subdivide(osm_sm_t * sm, 410 osm_mgrp_t * const p_mgrp, 411 osm_switch_t * const p_sw, 412 cl_qlist_t * const p_list, 413 cl_qlist_t * const list_array, 414 uint8_t const array_size) 415{ 416 uint8_t port_num; 417 uint16_t mlid_ho; 418 boolean_t ignore_existing; 419 osm_mcast_work_obj_t *p_wobj; 420 421 OSM_LOG_ENTER(sm->p_log); 422 423 mlid_ho = cl_ntoh16(osm_mgrp_get_mlid(p_mgrp)); 424 425 /* 426 For Multicast Groups, we want not to count on previous 427 configurations - since we can easily generate a storm 428 by loops. 429 */ 430 ignore_existing = TRUE; 431 432 /* 433 Subdivide the set of ports into non-overlapping subsets 434 that will be routed to other switches. 435 */ 436 while ((p_wobj = 437 (osm_mcast_work_obj_t *) cl_qlist_remove_head(p_list)) != 438 (osm_mcast_work_obj_t *) cl_qlist_end(p_list)) { 439 port_num = 440 osm_switch_recommend_mcast_path(p_sw, p_wobj->p_port, 441 mlid_ho, ignore_existing); 442 443 if (port_num == OSM_NO_PATH) { 444 /* 445 This typically occurs if the switch does not support 446 multicast and the multicast tree must branch at this 447 switch. 448 */ 449 uint64_t node_guid_ho = 450 cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)); 451 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A03: " 452 "Error routing MLID 0x%X through switch 0x%" 453 PRIx64 "\n" 454 "\t\t\t\tNo multicast paths from this switch for port " 455 "with LID %u\n", mlid_ho, node_guid_ho, 456 cl_ntoh16(osm_port_get_base_lid 457 (p_wobj->p_port))); 458 459 __osm_mcast_work_obj_delete(p_wobj); 460 continue; 461 } 462 463 if (port_num > array_size) { 464 uint64_t node_guid_ho = 465 cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)); 466 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A04: " 467 "Error routing MLID 0x%X through switch 0x%" 468 PRIx64 "\n" 469 "\t\t\t\tNo multicast paths from this switch to port " 470 "with LID %u\n", mlid_ho, node_guid_ho, 471 cl_ntoh16(osm_port_get_base_lid 472 (p_wobj->p_port))); 473 474 __osm_mcast_work_obj_delete(p_wobj); 475 476 /* This is means OpenSM has a bug. */ 477 CL_ASSERT(FALSE); 478 continue; 479 } 480 481 cl_qlist_insert_tail(&list_array[port_num], &p_wobj->list_item); 482 } 483 484 OSM_LOG_EXIT(sm->p_log); 485} 486 487/********************************************************************** 488 **********************************************************************/ 489static void __osm_mcast_mgr_purge_list(osm_sm_t * sm, cl_qlist_t * const p_list) 490{ 491 osm_mcast_work_obj_t *p_wobj; 492 493 OSM_LOG_ENTER(sm->p_log); 494 495 while ((p_wobj = (osm_mcast_work_obj_t *) cl_qlist_remove_head(p_list)) 496 != (osm_mcast_work_obj_t *) cl_qlist_end(p_list)) { 497 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A06: " 498 "Unable to route for port 0x%" PRIx64 "\n", 499 osm_port_get_guid(p_wobj->p_port)); 500 __osm_mcast_work_obj_delete(p_wobj); 501 } 502 503 OSM_LOG_EXIT(sm->p_log); 504} 505 506/********************************************************************** 507 This is the recursive function to compute the paths in the spanning 508 tree that emanate from this switch. On input, the p_list contains 509 the group members that must be routed from this switch. 510 511 The function returns the newly created mtree node element. 512**********************************************************************/ 513static osm_mtree_node_t *__osm_mcast_mgr_branch(osm_sm_t * sm, 514 osm_mgrp_t * const p_mgrp, 515 osm_switch_t * const p_sw, 516 cl_qlist_t * const p_list, 517 uint8_t depth, 518 uint8_t const upstream_port, 519 uint8_t * const p_max_depth) 520{ 521 uint8_t max_children; 522 osm_mtree_node_t *p_mtn = NULL; 523 cl_qlist_t *list_array = NULL; 524 uint8_t i; 525 ib_net64_t node_guid; 526 uint64_t node_guid_ho; 527 osm_mcast_work_obj_t *p_wobj; 528 cl_qlist_t *p_port_list; 529 size_t count; 530 uint16_t mlid_ho; 531 osm_mcast_tbl_t *p_tbl; 532 533 OSM_LOG_ENTER(sm->p_log); 534 535 CL_ASSERT(p_sw); 536 CL_ASSERT(p_list); 537 CL_ASSERT(p_max_depth); 538 539 node_guid = osm_node_get_node_guid(p_sw->p_node); 540 node_guid_ho = cl_ntoh64(node_guid); 541 mlid_ho = cl_ntoh16(osm_mgrp_get_mlid(p_mgrp)); 542 543 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 544 "Routing MLID 0x%X through switch 0x%" PRIx64 545 ", %u nodes at depth %u\n", 546 mlid_ho, node_guid_ho, cl_qlist_count(p_list), depth); 547 548 CL_ASSERT(cl_qlist_count(p_list) > 0); 549 550 depth++; 551 552 if (depth >= 64) { 553 OSM_LOG(sm->p_log, OSM_LOG_ERROR, 554 "Maximal hops number is reached for MLID 0x%x." 555 " Break processing.", mlid_ho); 556 __osm_mcast_mgr_purge_list(sm, p_list); 557 goto Exit; 558 } 559 560 if (depth > *p_max_depth) { 561 CL_ASSERT(depth == *p_max_depth + 1); 562 *p_max_depth = depth; 563 } 564 565 if (osm_switch_supports_mcast(p_sw) == FALSE) { 566 /* 567 This switch doesn't do multicast. Clean-up. 568 */ 569 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A14: " 570 "Switch 0x%" PRIx64 " does not support multicast\n", 571 node_guid_ho); 572 573 /* 574 Deallocate all the work objects on this branch of the tree. 575 */ 576 __osm_mcast_mgr_purge_list(sm, p_list); 577 goto Exit; 578 } 579 580 p_mtn = osm_mtree_node_new(p_sw); 581 if (p_mtn == NULL) { 582 /* 583 We are unable to continue routing down this 584 leg of the tree. Clean-up. 585 */ 586 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A15: " 587 "Insufficient memory to build multicast tree\n"); 588 589 /* 590 Deallocate all the work objects on this branch of the tree. 591 */ 592 __osm_mcast_mgr_purge_list(sm, p_list); 593 goto Exit; 594 } 595 596 max_children = osm_mtree_node_get_max_children(p_mtn); 597 598 CL_ASSERT(max_children > 1); 599 600 /* 601 Prepare an empty list for each port in the switch. 602 TO DO - this list array could probably be moved 603 inside the switch element to save on malloc thrashing. 604 */ 605 list_array = malloc(sizeof(cl_qlist_t) * max_children); 606 if (list_array == NULL) { 607 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A16: " 608 "Unable to allocate list array\n"); 609 __osm_mcast_mgr_purge_list(sm, p_list); 610 goto Exit; 611 } 612 613 memset(list_array, 0, sizeof(cl_qlist_t) * max_children); 614 615 for (i = 0; i < max_children; i++) 616 cl_qlist_init(&list_array[i]); 617 618 __osm_mcast_mgr_subdivide(sm, p_mgrp, p_sw, p_list, list_array, 619 max_children); 620 621 p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw); 622 623 /* 624 Add the upstream port to the forwarding table unless 625 we're at the root of the spanning tree. 626 */ 627 if (depth > 1) { 628 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 629 "Adding upstream port %u\n", upstream_port); 630 631 CL_ASSERT(upstream_port); 632 osm_mcast_tbl_set(p_tbl, mlid_ho, upstream_port); 633 } 634 635 /* 636 For each port that was allocated some routes, 637 recurse into this function to continue building the tree 638 if the node on the other end of that port is another switch. 639 Otherwise, the node is an endpoint, and we've found a leaf 640 of the tree. Mark leaves with our special pointer value. 641 */ 642 643 for (i = 0; i < max_children; i++) { 644 const osm_physp_t *p_physp; 645 const osm_physp_t *p_remote_physp; 646 osm_node_t *p_node; 647 const osm_node_t *p_remote_node; 648 649 p_port_list = &list_array[i]; 650 651 count = cl_qlist_count(p_port_list); 652 653 /* 654 There should be no children routed through the upstream port! 655 */ 656 CL_ASSERT((upstream_port == 0) || (i != upstream_port) || 657 ((i == upstream_port) && (count == 0))); 658 659 if (count == 0) 660 continue; /* No routes down this port. */ 661 662 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 663 "Routing %zu destinations via switch port %u\n", 664 count, i); 665 666 /* 667 This port routes frames for this mcast group. Therefore, 668 set the appropriate bit in the multicast forwarding 669 table for this switch. 670 */ 671 osm_mcast_tbl_set(p_tbl, mlid_ho, i); 672 if (i == 0) { 673 /* This means we are adding the switch to the MC group. 674 We do not need to continue looking at the remote port, just 675 needed to add the port to the table */ 676 CL_ASSERT(count == 1); 677 678 p_wobj = (osm_mcast_work_obj_t *) 679 cl_qlist_remove_head(p_port_list); 680 __osm_mcast_work_obj_delete(p_wobj); 681 continue; 682 } 683 684 p_node = p_sw->p_node; 685 p_remote_node = osm_node_get_remote_node(p_node, i, NULL); 686 if (!p_remote_node) 687 continue; 688 689 if (osm_node_get_type(p_remote_node) == IB_NODE_TYPE_SWITCH) { 690 /* 691 Acquire a pointer to the remote switch then recurse. 692 */ 693 CL_ASSERT(p_remote_node->sw); 694 695 p_physp = osm_node_get_physp_ptr(p_node, i); 696 CL_ASSERT(p_physp); 697 698 p_remote_physp = osm_physp_get_remote(p_physp); 699 CL_ASSERT(p_remote_physp); 700 701 p_mtn->child_array[i] = 702 __osm_mcast_mgr_branch(sm, p_mgrp, 703 p_remote_node->sw, 704 p_port_list, depth, 705 osm_physp_get_port_num 706 (p_remote_physp), 707 p_max_depth); 708 } else { 709 /* 710 The neighbor node is not a switch, so this 711 must be a leaf. 712 */ 713 CL_ASSERT(count == 1); 714 715 p_mtn->child_array[i] = OSM_MTREE_LEAF; 716 p_wobj = (osm_mcast_work_obj_t *) 717 cl_qlist_remove_head(p_port_list); 718 719 CL_ASSERT(cl_is_qlist_empty(p_port_list)); 720 721 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 722 "Found leaf for port 0x%016" PRIx64 723 " on switch port %u\n", 724 cl_ntoh64(osm_port_get_guid(p_wobj->p_port)), 725 i); 726 727 __osm_mcast_work_obj_delete(p_wobj); 728 } 729 } 730 731 free(list_array); 732Exit: 733 OSM_LOG_EXIT(sm->p_log); 734 return (p_mtn); 735} 736 737/********************************************************************** 738 **********************************************************************/ 739static ib_api_status_t 740__osm_mcast_mgr_build_spanning_tree(osm_sm_t * sm, osm_mgrp_t * const p_mgrp) 741{ 742 const cl_qmap_t *p_mcm_tbl; 743 const osm_port_t *p_port; 744 const osm_mcm_port_t *p_mcm_port; 745 uint32_t num_ports; 746 cl_qlist_t port_list; 747 osm_switch_t *p_sw; 748 osm_mcast_work_obj_t *p_wobj; 749 ib_api_status_t status = IB_SUCCESS; 750 uint8_t max_depth = 0; 751 uint32_t count; 752 753 OSM_LOG_ENTER(sm->p_log); 754 755 cl_qlist_init(&port_list); 756 757 /* 758 TO DO - for now, just blow away the old tree. 759 In the future we'll need to construct the tree based 760 on multicast forwarding table information if the user wants to 761 preserve existing multicast routes. 762 */ 763 __osm_mcast_mgr_purge_tree(sm, p_mgrp); 764 765 p_mcm_tbl = &p_mgrp->mcm_port_tbl; 766 num_ports = cl_qmap_count(p_mcm_tbl); 767 if (num_ports == 0) { 768 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 769 "MLID 0x%X has no members - nothing to do\n", 770 cl_ntoh16(osm_mgrp_get_mlid(p_mgrp))); 771 goto Exit; 772 } 773 774 /* 775 This function builds the single spanning tree recursively. 776 At each stage, the ports to be reached are divided into 777 non-overlapping subsets of member ports that can be reached through 778 a given switch port. Construction then moves down each 779 branch, and the process starts again with each branch computing 780 for its own subset of the member ports. 781 782 The maximum recursion depth is at worst the maximum hop count in the 783 subnet, which is spec limited to 64. 784 */ 785 786 /* 787 Locate the switch around which to create the spanning 788 tree for this multicast group. 789 */ 790 p_sw = __osm_mcast_mgr_find_root_switch(sm, p_mgrp); 791 if (p_sw == NULL) { 792 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A08: " 793 "Unable to locate a suitable switch for group 0x%X\n", 794 cl_ntoh16(osm_mgrp_get_mlid(p_mgrp))); 795 status = IB_ERROR; 796 goto Exit; 797 } 798 799 /* 800 Build the first "subset" containing all member ports. 801 */ 802 for (p_mcm_port = (osm_mcm_port_t *) cl_qmap_head(p_mcm_tbl); 803 p_mcm_port != (osm_mcm_port_t *) cl_qmap_end(p_mcm_tbl); 804 p_mcm_port = 805 (osm_mcm_port_t *) cl_qmap_next(&p_mcm_port->map_item)) { 806 /* 807 Acquire the port object for this port guid, then create 808 the new worker object to build the list. 809 */ 810 p_port = osm_get_port_by_guid(sm->p_subn, 811 ib_gid_get_guid(&p_mcm_port-> 812 port_gid)); 813 if (!p_port) { 814 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A09: " 815 "No port object for port 0x%016" PRIx64 "\n", 816 cl_ntoh64(ib_gid_get_guid 817 (&p_mcm_port->port_gid))); 818 continue; 819 } 820 821 p_wobj = __osm_mcast_work_obj_new(p_port); 822 if (p_wobj == NULL) { 823 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A10: " 824 "Insufficient memory to route port 0x%016" 825 PRIx64 "\n", 826 cl_ntoh64(osm_port_get_guid(p_port))); 827 continue; 828 } 829 830 cl_qlist_insert_tail(&port_list, &p_wobj->list_item); 831 } 832 833 count = cl_qlist_count(&port_list); 834 p_mgrp->p_root = __osm_mcast_mgr_branch(sm, p_mgrp, p_sw, 835 &port_list, 0, 0, &max_depth); 836 837 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 838 "Configured MLID 0x%X for %u ports, max tree depth = %u\n", 839 cl_ntoh16(osm_mgrp_get_mlid(p_mgrp)), count, max_depth); 840 841Exit: 842 OSM_LOG_EXIT(sm->p_log); 843 return (status); 844} 845 846#if 0 847/* unused */ 848/********************************************************************** 849 **********************************************************************/ 850void 851osm_mcast_mgr_set_table(osm_sm_t * sm, 852 IN const osm_mgrp_t * const p_mgrp, 853 IN const osm_mtree_node_t * const p_mtn) 854{ 855 uint8_t i; 856 uint8_t max_children; 857 osm_mtree_node_t *p_child_mtn; 858 uint16_t mlid_ho; 859 osm_mcast_tbl_t *p_tbl; 860 osm_switch_t *p_sw; 861 862 OSM_LOG_ENTER(sm->p_log); 863 864 mlid_ho = cl_ntoh16(osm_mgrp_get_mlid(p_mgrp)); 865 p_sw = osm_mtree_node_get_switch_ptr(p_mtn); 866 867 CL_ASSERT(p_sw); 868 869 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 870 "Configuring MLID 0x%X on switch 0x%" PRIx64 "\n", 871 mlid_ho, osm_node_get_node_guid(p_sw->p_node)); 872 873 /* 874 For every child of this tree node, set the corresponding 875 bit in the switch's mcast table. 876 */ 877 p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw); 878 max_children = osm_mtree_node_get_max_children(p_mtn); 879 880 CL_ASSERT(max_children <= osm_switch_get_num_ports(p_sw)); 881 882 osm_mcast_tbl_clear_mlid(p_tbl, mlid_ho); 883 884 for (i = 0; i < max_children; i++) { 885 p_child_mtn = osm_mtree_node_get_child(p_mtn, i); 886 if (p_child_mtn == NULL) 887 continue; 888 889 osm_mcast_tbl_set(p_tbl, mlid_ho, i); 890 } 891 892 OSM_LOG_EXIT(sm->p_log); 893} 894#endif 895 896/********************************************************************** 897 **********************************************************************/ 898static void __osm_mcast_mgr_clear(osm_sm_t * sm, IN osm_mgrp_t * const p_mgrp) 899{ 900 osm_switch_t *p_sw; 901 cl_qmap_t *p_sw_tbl; 902 osm_mcast_tbl_t *p_mcast_tbl; 903 904 OSM_LOG_ENTER(sm->p_log); 905 906 /* 907 Walk the switches and clear the routing entries for 908 this MLID. 909 */ 910 p_sw_tbl = &sm->p_subn->sw_guid_tbl; 911 p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl); 912 while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) { 913 p_mcast_tbl = osm_switch_get_mcast_tbl_ptr(p_sw); 914 osm_mcast_tbl_clear_mlid(p_mcast_tbl, cl_ntoh16(p_mgrp->mlid)); 915 p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item); 916 } 917 918 OSM_LOG_EXIT(sm->p_log); 919} 920 921#if 0 922/* TO DO - make this real -- at least update spanning tree */ 923/********************************************************************** 924 Lock must be held on entry. 925**********************************************************************/ 926ib_api_status_t 927osm_mcast_mgr_process_single(osm_sm_t * sm, 928 IN ib_net16_t const mlid, 929 IN ib_net64_t const port_guid, 930 IN uint8_t const join_state) 931{ 932 uint8_t port_num; 933 uint16_t mlid_ho; 934 ib_net64_t sw_guid; 935 osm_port_t *p_port; 936 osm_physp_t *p_physp; 937 osm_physp_t *p_remote_physp; 938 osm_node_t *p_remote_node; 939 osm_mcast_tbl_t *p_mcast_tbl; 940 ib_api_status_t status = IB_SUCCESS; 941 942 OSM_LOG_ENTER(sm->p_log); 943 944 CL_ASSERT(mlid); 945 CL_ASSERT(port_guid); 946 947 mlid_ho = cl_ntoh16(mlid); 948 949 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 950 "Attempting to add port 0x%" PRIx64 " to MLID 0x%X, " 951 "\n\t\t\t\tjoin state = 0x%X\n", 952 cl_ntoh64(port_guid), mlid_ho, join_state); 953 954 /* 955 Acquire the Port object. 956 */ 957 p_port = osm_get_port_by_guid(sm->p_subn, port_guid); 958 if (!p_port) { 959 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A01: " 960 "Unable to acquire port object for 0x%" PRIx64 "\n", 961 cl_ntoh64(port_guid)); 962 status = IB_ERROR; 963 goto Exit; 964 } 965 966 p_physp = p_port->p_physp; 967 if (p_physp == NULL) { 968 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A05: " 969 "Unable to acquire phsyical port object for 0x%" PRIx64 970 "\n", cl_ntoh64(port_guid)); 971 status = IB_ERROR; 972 goto Exit; 973 } 974 975 p_remote_physp = osm_physp_get_remote(p_physp); 976 if (p_remote_physp == NULL) { 977 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A11: " 978 "Unable to acquire remote phsyical port object " 979 "for 0x%" PRIx64 "\n", cl_ntoh64(port_guid)); 980 status = IB_ERROR; 981 goto Exit; 982 } 983 984 p_remote_node = osm_physp_get_node_ptr(p_remote_physp); 985 986 CL_ASSERT(p_remote_node); 987 988 sw_guid = osm_node_get_node_guid(p_remote_node); 989 990 if (osm_node_get_type(p_remote_node) != IB_NODE_TYPE_SWITCH) { 991 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A22: " 992 "Remote node not a switch node 0x%" PRIx64 "\n", 993 cl_ntoh64(sw_guid)); 994 status = IB_ERROR; 995 goto Exit; 996 } 997 998 if (!p_remote_node->sw) { 999 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A12: " 1000 "No switch object 0x%" PRIx64 "\n", cl_ntoh64(sw_guid)); 1001 status = IB_ERROR; 1002 goto Exit; 1003 } 1004 1005 if (osm_switch_is_in_mcast_tree(p_remote_node->sw, mlid_ho)) { 1006 /* 1007 We're in luck. The switch attached to this port 1008 is already in the multicast group, so we can just 1009 add the specified port as a new leaf of the tree. 1010 */ 1011 if (join_state & (IB_JOIN_STATE_FULL | IB_JOIN_STATE_NON)) { 1012 /* 1013 This node wants to receive multicast frames. 1014 Get the switch port number to which the new member port 1015 is attached, then configure this single mcast table. 1016 */ 1017 port_num = osm_physp_get_port_num(p_remote_physp); 1018 CL_ASSERT(port_num); 1019 1020 p_mcast_tbl = 1021 osm_switch_get_mcast_tbl_ptr(p_remote_node->sw); 1022 osm_mcast_tbl_set(p_mcast_tbl, mlid_ho, port_num); 1023 } else { 1024 if (join_state & IB_JOIN_STATE_SEND_ONLY) 1025 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 1026 "Success. Nothing to do for send" 1027 "only member\n"); 1028 else { 1029 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A13: " 1030 "Unknown join state 0x%X\n", 1031 join_state); 1032 status = IB_ERROR; 1033 goto Exit; 1034 } 1035 } 1036 } else 1037 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Unable to add port\n"); 1038 1039Exit: 1040 OSM_LOG_EXIT(sm->p_log); 1041 return (status); 1042} 1043#endif 1044 1045/********************************************************************** 1046 lock must already be held on entry 1047**********************************************************************/ 1048static ib_api_status_t 1049osm_mcast_mgr_process_tree(osm_sm_t * sm, 1050 IN osm_mgrp_t * const p_mgrp, 1051 IN osm_mcast_req_type_t req_type, 1052 ib_net64_t port_guid) 1053{ 1054 ib_api_status_t status = IB_SUCCESS; 1055 ib_net16_t mlid; 1056 1057 OSM_LOG_ENTER(sm->p_log); 1058 1059 mlid = osm_mgrp_get_mlid(p_mgrp); 1060 1061 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 1062 "Processing multicast group 0x%X\n", cl_ntoh16(mlid)); 1063 1064 /* 1065 If there are no switches in the subnet, then we have nothing to do. 1066 */ 1067 if (cl_qmap_count(&sm->p_subn->sw_guid_tbl) == 0) { 1068 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 1069 "No switches in subnet. Nothing to do\n"); 1070 goto Exit; 1071 } 1072 1073 /* 1074 Clear the multicast tables to start clean, then build 1075 the spanning tree which sets the mcast table bits for each 1076 port in the group. 1077 */ 1078 __osm_mcast_mgr_clear(sm, p_mgrp); 1079 1080 if (!p_mgrp->full_members) 1081 goto Exit; 1082 1083 status = __osm_mcast_mgr_build_spanning_tree(sm, p_mgrp); 1084 if (status != IB_SUCCESS) { 1085 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A17: " 1086 "Unable to create spanning tree (%s)\n", 1087 ib_get_err_str(status)); 1088 goto Exit; 1089 } 1090 1091Exit: 1092 OSM_LOG_EXIT(sm->p_log); 1093 return (status); 1094} 1095 1096/********************************************************************** 1097 Process the entire group. 1098 NOTE : The lock should be held externally! 1099 **********************************************************************/ 1100static ib_api_status_t 1101mcast_mgr_process_mgrp(osm_sm_t * sm, 1102 IN osm_mgrp_t * const p_mgrp, 1103 IN osm_mcast_req_type_t req_type, 1104 IN ib_net64_t port_guid) 1105{ 1106 ib_api_status_t status; 1107 1108 OSM_LOG_ENTER(sm->p_log); 1109 1110 status = osm_mcast_mgr_process_tree(sm, p_mgrp, req_type, port_guid); 1111 if (status != IB_SUCCESS) { 1112 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A19: " 1113 "Unable to create spanning tree (%s)\n", 1114 ib_get_err_str(status)); 1115 goto Exit; 1116 } 1117 p_mgrp->last_tree_id = p_mgrp->last_change_id; 1118 1119 /* remove MCGRP if it is marked for deletion */ 1120 if (p_mgrp->to_be_deleted) { 1121 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 1122 "Destroying mgrp with lid:0x%x\n", 1123 cl_ntoh16(p_mgrp->mlid)); 1124 sm->p_subn->mgroups[cl_ntoh16(p_mgrp->mlid) - IB_LID_MCAST_START_HO] = NULL; 1125 osm_mgrp_delete(p_mgrp); 1126 } 1127 1128Exit: 1129 OSM_LOG_EXIT(sm->p_log); 1130 return status; 1131} 1132 1133/********************************************************************** 1134 **********************************************************************/ 1135osm_signal_t osm_mcast_mgr_process(osm_sm_t * sm) 1136{ 1137 osm_signal_t signal; 1138 osm_switch_t *p_sw; 1139 cl_qmap_t *p_sw_tbl; 1140 cl_qlist_t *p_list = &sm->mgrp_list; 1141 osm_mgrp_t *p_mgrp; 1142 boolean_t pending_transactions = FALSE; 1143 int i; 1144 1145 OSM_LOG_ENTER(sm->p_log); 1146 1147 p_sw_tbl = &sm->p_subn->sw_guid_tbl; 1148 /* 1149 While holding the lock, iterate over all the established 1150 multicast groups, servicing each in turn. 1151 1152 Then, download the multicast tables to the switches. 1153 */ 1154 CL_PLOCK_EXCL_ACQUIRE(sm->p_lock); 1155 1156 for (i = 0; i <= sm->p_subn->max_mcast_lid_ho - IB_LID_MCAST_START_HO; 1157 i++) { 1158 /* 1159 We reached here due to some change that caused a heavy sweep 1160 of the subnet. Not due to a specific multicast request. 1161 So the request type is subnet_change and the port guid is 0. 1162 */ 1163 p_mgrp = sm->p_subn->mgroups[i]; 1164 if (p_mgrp) 1165 mcast_mgr_process_mgrp(sm, p_mgrp, 1166 OSM_MCAST_REQ_TYPE_SUBNET_CHANGE, 1167 0); 1168 } 1169 1170 /* 1171 Walk the switches and download the tables for each. 1172 */ 1173 p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl); 1174 while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) { 1175 signal = __osm_mcast_mgr_set_tbl(sm, p_sw); 1176 if (signal == OSM_SIGNAL_DONE_PENDING) 1177 pending_transactions = TRUE; 1178 p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item); 1179 } 1180 1181 while (!cl_is_qlist_empty(p_list)) { 1182 cl_list_item_t *p = cl_qlist_remove_head(p_list); 1183 free(p); 1184 } 1185 1186 CL_PLOCK_RELEASE(sm->p_lock); 1187 1188 OSM_LOG_EXIT(sm->p_log); 1189 1190 if (pending_transactions == TRUE) 1191 return (OSM_SIGNAL_DONE_PENDING); 1192 else 1193 return (OSM_SIGNAL_DONE); 1194} 1195 1196/********************************************************************** 1197 This is the function that is invoked during idle time to handle the 1198 process request for mcast groups where join/leave/delete was required. 1199 **********************************************************************/ 1200osm_signal_t osm_mcast_mgr_process_mgroups(osm_sm_t * sm) 1201{ 1202 cl_qlist_t *p_list = &sm->mgrp_list; 1203 osm_switch_t *p_sw; 1204 cl_qmap_t *p_sw_tbl; 1205 osm_mgrp_t *p_mgrp; 1206 ib_net16_t mlid; 1207 osm_signal_t ret, signal = OSM_SIGNAL_DONE; 1208 osm_mcast_mgr_ctxt_t *ctx; 1209 osm_mcast_req_type_t req_type; 1210 ib_net64_t port_guid; 1211 1212 OSM_LOG_ENTER(sm->p_log); 1213 1214 /* we need a lock to make sure the p_mgrp is not change other ways */ 1215 CL_PLOCK_EXCL_ACQUIRE(sm->p_lock); 1216 1217 while (!cl_is_qlist_empty(p_list)) { 1218 ctx = (osm_mcast_mgr_ctxt_t *) cl_qlist_remove_head(p_list); 1219 req_type = ctx->req_type; 1220 port_guid = ctx->port_guid; 1221 1222 /* nice copy no warning on size diff */ 1223 memcpy(&mlid, &ctx->mlid, sizeof(mlid)); 1224 1225 /* we can destroy the context now */ 1226 free(ctx); 1227 1228 /* since we delayed the execution we prefer to pass the 1229 mlid as the mgrp identifier and then find it or abort */ 1230 p_mgrp = osm_get_mgrp_by_mlid(sm->p_subn, mlid); 1231 if (!p_mgrp) 1232 continue; 1233 1234 /* if there was no change from the last time 1235 * we processed the group we can skip doing anything 1236 */ 1237 if (p_mgrp->last_change_id == p_mgrp->last_tree_id) { 1238 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 1239 "Skip processing mgrp with lid:0x%X change id:%u\n", 1240 cl_ntoh16(mlid), p_mgrp->last_change_id); 1241 continue; 1242 } 1243 1244 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 1245 "Processing mgrp with lid:0x%X change id:%u\n", 1246 cl_ntoh16(mlid), p_mgrp->last_change_id); 1247 mcast_mgr_process_mgrp(sm, p_mgrp, req_type, port_guid); 1248 } 1249 1250 /* 1251 Walk the switches and download the tables for each. 1252 */ 1253 p_sw_tbl = &sm->p_subn->sw_guid_tbl; 1254 p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl); 1255 while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) { 1256 ret = __osm_mcast_mgr_set_tbl(sm, p_sw); 1257 if (ret == OSM_SIGNAL_DONE_PENDING) 1258 signal = ret; 1259 p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item); 1260 } 1261 1262 osm_dump_mcast_routes(sm->p_subn->p_osm); 1263 1264 CL_PLOCK_RELEASE(sm->p_lock); 1265 OSM_LOG_EXIT(sm->p_log); 1266 return signal; 1267} 1268