1321936Shselasky/* 2321936Shselasky * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. 3321936Shselasky * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved. 4321936Shselasky * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. 5321936Shselasky * Copyright (c) 2009 HNR Consulting. All rights reserved. 6321936Shselasky * 7321936Shselasky * This software is available to you under a choice of one of two 8321936Shselasky * licenses. You may choose to be licensed under the terms of the GNU 9321936Shselasky * General Public License (GPL) Version 2, available from the file 10321936Shselasky * COPYING in the main directory of this source tree, or the 11321936Shselasky * OpenIB.org BSD license below: 12321936Shselasky * 13321936Shselasky * Redistribution and use in source and binary forms, with or 14321936Shselasky * without modification, are permitted provided that the following 15321936Shselasky * conditions are met: 16321936Shselasky * 17321936Shselasky * - Redistributions of source code must retain the above 18321936Shselasky * copyright notice, this list of conditions and the following 19321936Shselasky * disclaimer. 20321936Shselasky * 21321936Shselasky * - Redistributions in binary form must reproduce the above 22321936Shselasky * copyright notice, this list of conditions and the following 23321936Shselasky * disclaimer in the documentation and/or other materials 24321936Shselasky * provided with the distribution. 25321936Shselasky * 26321936Shselasky * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 27321936Shselasky * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 28321936Shselasky * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 29321936Shselasky * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 30321936Shselasky * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 31321936Shselasky * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 32321936Shselasky * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 33321936Shselasky * SOFTWARE. 34321936Shselasky * 35321936Shselasky */ 36321936Shselasky 37321936Shselasky/* 38321936Shselasky * Abstract: 39321936Shselasky * Implementation of osm_switch_t. 40321936Shselasky * This object represents an Infiniband switch. 41321936Shselasky * This object is part of the opensm family of objects. 42321936Shselasky */ 43321936Shselasky 44321936Shselasky#if HAVE_CONFIG_H 45321936Shselasky# include <config.h> 46321936Shselasky#endif /* HAVE_CONFIG_H */ 47321936Shselasky 48321936Shselasky#include <stdlib.h> 49321936Shselasky#include <string.h> 50321936Shselasky#include <complib/cl_math.h> 51321936Shselasky#include <iba/ib_types.h> 52321936Shselasky#include <opensm/osm_file_ids.h> 53321936Shselasky#define FILE_ID OSM_FILE_SWITCH_C 54321936Shselasky#include <opensm/osm_switch.h> 55321936Shselasky 56321936Shselaskystruct switch_port_path { 57321936Shselasky uint8_t port_num; 58321936Shselasky uint32_t path_count; 59321936Shselasky int found_sys_guid; 60321936Shselasky int found_node_guid; 61321936Shselasky uint32_t forwarded_to; 62321936Shselasky}; 63321936Shselasky 64321936Shselaskycl_status_t osm_switch_set_hops(IN osm_switch_t * p_sw, IN uint16_t lid_ho, 65321936Shselasky IN uint8_t port_num, IN uint8_t num_hops) 66321936Shselasky{ 67321936Shselasky if (!lid_ho || lid_ho > p_sw->max_lid_ho) 68321936Shselasky return -1; 69321936Shselasky if (port_num >= p_sw->num_ports) 70321936Shselasky return -1; 71321936Shselasky if (!p_sw->hops[lid_ho]) { 72321936Shselasky p_sw->hops[lid_ho] = malloc(p_sw->num_ports); 73321936Shselasky if (!p_sw->hops[lid_ho]) 74321936Shselasky return -1; 75321936Shselasky memset(p_sw->hops[lid_ho], OSM_NO_PATH, p_sw->num_ports); 76321936Shselasky } 77321936Shselasky 78321936Shselasky p_sw->hops[lid_ho][port_num] = num_hops; 79321936Shselasky if (p_sw->hops[lid_ho][0] > num_hops) 80321936Shselasky p_sw->hops[lid_ho][0] = num_hops; 81321936Shselasky 82321936Shselasky return 0; 83321936Shselasky} 84321936Shselasky 85321936Shselaskyvoid osm_switch_delete(IN OUT osm_switch_t ** pp_sw) 86321936Shselasky{ 87321936Shselasky osm_switch_t *p_sw = *pp_sw; 88321936Shselasky unsigned i; 89321936Shselasky 90321936Shselasky osm_mcast_tbl_destroy(&p_sw->mcast_tbl); 91321936Shselasky if (p_sw->p_prof) 92321936Shselasky free(p_sw->p_prof); 93321936Shselasky if (p_sw->search_ordering_ports) 94321936Shselasky free(p_sw->search_ordering_ports); 95321936Shselasky if (p_sw->lft) 96321936Shselasky free(p_sw->lft); 97321936Shselasky if (p_sw->new_lft) 98321936Shselasky free(p_sw->new_lft); 99321936Shselasky if (p_sw->hops) { 100321936Shselasky for (i = 0; i < p_sw->num_hops; i++) 101321936Shselasky if (p_sw->hops[i]) 102321936Shselasky free(p_sw->hops[i]); 103321936Shselasky free(p_sw->hops); 104321936Shselasky } 105321936Shselasky free(*pp_sw); 106321936Shselasky *pp_sw = NULL; 107321936Shselasky} 108321936Shselasky 109321936Shselaskyosm_switch_t *osm_switch_new(IN osm_node_t * p_node, 110321936Shselasky IN const osm_madw_t * p_madw) 111321936Shselasky{ 112321936Shselasky osm_switch_t *p_sw; 113321936Shselasky ib_switch_info_t *p_si; 114321936Shselasky ib_smp_t *p_smp; 115321936Shselasky uint8_t num_ports; 116321936Shselasky uint32_t port_num; 117321936Shselasky 118321936Shselasky CL_ASSERT(p_madw); 119321936Shselasky CL_ASSERT(p_node); 120321936Shselasky 121321936Shselasky p_smp = osm_madw_get_smp_ptr(p_madw); 122321936Shselasky p_si = ib_smp_get_payload_ptr(p_smp); 123321936Shselasky num_ports = osm_node_get_num_physp(p_node); 124321936Shselasky 125321936Shselasky CL_ASSERT(p_smp->attr_id == IB_MAD_ATTR_SWITCH_INFO); 126321936Shselasky 127321936Shselasky if (!p_si->lin_cap) /* The switch doesn't support LFT */ 128321936Shselasky return NULL; 129321936Shselasky 130321936Shselasky p_sw = malloc(sizeof(*p_sw)); 131321936Shselasky if (!p_sw) 132321936Shselasky return NULL; 133321936Shselasky 134321936Shselasky memset(p_sw, 0, sizeof(*p_sw)); 135321936Shselasky 136321936Shselasky p_sw->p_node = p_node; 137321936Shselasky p_sw->switch_info = *p_si; 138321936Shselasky p_sw->num_ports = num_ports; 139321936Shselasky p_sw->need_update = 2; 140321936Shselasky 141321936Shselasky p_sw->p_prof = malloc(sizeof(*p_sw->p_prof) * num_ports); 142321936Shselasky if (!p_sw->p_prof) 143321936Shselasky goto err; 144321936Shselasky 145321936Shselasky memset(p_sw->p_prof, 0, sizeof(*p_sw->p_prof) * num_ports); 146321936Shselasky 147321936Shselasky osm_mcast_tbl_init(&p_sw->mcast_tbl, osm_node_get_num_physp(p_node), 148321936Shselasky cl_ntoh16(p_si->mcast_cap)); 149321936Shselasky 150321936Shselasky for (port_num = 0; port_num < num_ports; port_num++) 151321936Shselasky osm_port_prof_construct(&p_sw->p_prof[port_num]); 152321936Shselasky 153321936Shselasky return p_sw; 154321936Shselasky 155321936Shselaskyerr: 156321936Shselasky osm_switch_delete(&p_sw); 157321936Shselasky return NULL; 158321936Shselasky} 159321936Shselasky 160321936Shselaskyboolean_t osm_switch_get_lft_block(IN const osm_switch_t * p_sw, 161321936Shselasky IN uint16_t block_id, OUT uint8_t * p_block) 162321936Shselasky{ 163321936Shselasky uint16_t base_lid_ho = block_id * IB_SMP_DATA_SIZE; 164321936Shselasky 165321936Shselasky CL_ASSERT(p_sw); 166321936Shselasky CL_ASSERT(p_block); 167321936Shselasky 168321936Shselasky if (base_lid_ho > p_sw->max_lid_ho) 169321936Shselasky return FALSE; 170321936Shselasky 171321936Shselasky CL_ASSERT(base_lid_ho + IB_SMP_DATA_SIZE - 1 <= IB_LID_UCAST_END_HO); 172321936Shselasky memcpy(p_block, &(p_sw->new_lft[base_lid_ho]), IB_SMP_DATA_SIZE); 173321936Shselasky return TRUE; 174321936Shselasky} 175321936Shselasky 176321936Shselaskystatic struct osm_remote_node * 177321936Shselaskyswitch_find_guid_common(IN const osm_switch_t * p_sw, 178321936Shselasky IN struct osm_remote_guids_count *r, 179321936Shselasky IN uint8_t port_num, IN int find_sys_guid, 180321936Shselasky IN int find_node_guid) 181321936Shselasky{ 182321936Shselasky struct osm_remote_node *p_remote_guid = NULL; 183321936Shselasky osm_physp_t *p_physp; 184321936Shselasky osm_physp_t *p_rem_physp; 185321936Shselasky osm_node_t *p_rem_node; 186321936Shselasky uint64_t sys_guid; 187321936Shselasky uint64_t node_guid; 188321936Shselasky unsigned int i; 189321936Shselasky 190321936Shselasky CL_ASSERT(p_sw); 191321936Shselasky 192321936Shselasky if (!r) 193321936Shselasky goto out; 194321936Shselasky 195321936Shselasky p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num); 196321936Shselasky if (!p_physp) 197321936Shselasky goto out; 198321936Shselasky 199321936Shselasky p_rem_physp = osm_physp_get_remote(p_physp); 200321936Shselasky p_rem_node = osm_physp_get_node_ptr(p_rem_physp); 201321936Shselasky sys_guid = p_rem_node->node_info.sys_guid; 202321936Shselasky node_guid = p_rem_node->node_info.node_guid; 203321936Shselasky 204321936Shselasky for (i = 0; i < r->count; i++) { 205321936Shselasky if ((!find_sys_guid 206321936Shselasky || r->guids[i].node->node_info.sys_guid == sys_guid) 207321936Shselasky && (!find_node_guid 208321936Shselasky || r->guids[i].node->node_info.node_guid == node_guid)) { 209321936Shselasky p_remote_guid = &r->guids[i]; 210321936Shselasky break; 211321936Shselasky } 212321936Shselasky } 213321936Shselasky 214321936Shselaskyout: 215321936Shselasky return p_remote_guid; 216321936Shselasky} 217321936Shselasky 218321936Shselaskystatic struct osm_remote_node * 219321936Shselaskyswitch_find_sys_guid_count(IN const osm_switch_t * p_sw, 220321936Shselasky IN struct osm_remote_guids_count *r, 221321936Shselasky IN uint8_t port_num) 222321936Shselasky{ 223321936Shselasky return switch_find_guid_common(p_sw, r, port_num, 1, 0); 224321936Shselasky} 225321936Shselasky 226321936Shselaskystatic struct osm_remote_node * 227321936Shselaskyswitch_find_node_guid_count(IN const osm_switch_t * p_sw, 228321936Shselasky IN struct osm_remote_guids_count *r, 229321936Shselasky IN uint8_t port_num) 230321936Shselasky{ 231321936Shselasky return switch_find_guid_common(p_sw, r, port_num, 0, 1); 232321936Shselasky} 233321936Shselasky 234321936Shselaskyuint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw, 235321936Shselasky IN osm_port_t * p_port, IN uint16_t lid_ho, 236321936Shselasky IN unsigned start_from, 237321936Shselasky IN boolean_t ignore_existing, 238321936Shselasky IN boolean_t routing_for_lmc, 239321936Shselasky IN boolean_t dor, 240321936Shselasky IN boolean_t port_shifting, 241321936Shselasky IN uint32_t scatter_ports, 242321936Shselasky IN osm_lft_type_enum lft_enum) 243321936Shselasky{ 244321936Shselasky /* 245321936Shselasky We support an enhanced LMC aware routing mode: 246321936Shselasky In the case of LMC > 0, we can track the remote side 247321936Shselasky system and node for all of the lids of the target 248321936Shselasky and try and avoid routing again through the same 249321936Shselasky system / node. 250321936Shselasky 251321936Shselasky Assume if routing_for_lmc is true that this procedure was 252321936Shselasky provided the tracking array and counter via p_port->priv, 253321936Shselasky and we can conduct this algorithm. 254321936Shselasky */ 255321936Shselasky uint16_t base_lid; 256321936Shselasky uint8_t hops; 257321936Shselasky uint8_t least_hops; 258321936Shselasky uint8_t port_num; 259321936Shselasky uint8_t num_ports; 260321936Shselasky uint32_t least_paths = 0xFFFFFFFF; 261321936Shselasky unsigned i; 262321936Shselasky /* 263321936Shselasky The following will track the least paths if the 264321936Shselasky route should go through a new system/node 265321936Shselasky */ 266321936Shselasky uint32_t least_paths_other_sys = 0xFFFFFFFF; 267321936Shselasky uint32_t least_paths_other_nodes = 0xFFFFFFFF; 268321936Shselasky uint32_t least_forwarded_to = 0xFFFFFFFF; 269321936Shselasky uint32_t check_count; 270321936Shselasky uint8_t best_port = 0; 271321936Shselasky /* 272321936Shselasky These vars track the best port if it connects to 273321936Shselasky not used system/node. 274321936Shselasky */ 275321936Shselasky uint8_t best_port_other_sys = 0; 276321936Shselasky uint8_t best_port_other_node = 0; 277321936Shselasky boolean_t port_found = FALSE; 278321936Shselasky osm_physp_t *p_physp; 279321936Shselasky osm_physp_t *p_rem_physp; 280321936Shselasky osm_node_t *p_rem_node; 281321936Shselasky osm_node_t *p_rem_node_first = NULL; 282321936Shselasky struct osm_remote_node *p_remote_guid = NULL; 283321936Shselasky struct osm_remote_node null_remote_node = {NULL, 0, 0}; 284321936Shselasky struct switch_port_path port_paths[IB_NODE_NUM_PORTS_MAX]; 285321936Shselasky unsigned int port_paths_total_paths = 0; 286321936Shselasky unsigned int port_paths_count = 0; 287321936Shselasky uint8_t scatter_possible_ports[IB_NODE_NUM_PORTS_MAX]; 288321936Shselasky unsigned int scatter_possible_ports_count = 0; 289321936Shselasky int found_sys_guid = 0; 290321936Shselasky int found_node_guid = 0; 291321936Shselasky 292321936Shselasky CL_ASSERT(lid_ho > 0); 293321936Shselasky 294321936Shselasky if (p_port->p_node->sw) { 295321936Shselasky if (p_port->p_node->sw == p_sw) 296321936Shselasky return 0; 297321936Shselasky base_lid = osm_port_get_base_lid(p_port); 298321936Shselasky } else { 299321936Shselasky p_physp = p_port->p_physp; 300321936Shselasky if (!p_physp || !p_physp->p_remote_physp || 301321936Shselasky !p_physp->p_remote_physp->p_node->sw) 302321936Shselasky return OSM_NO_PATH; 303321936Shselasky 304321936Shselasky if (p_physp->p_remote_physp->p_node->sw == p_sw) 305321936Shselasky return p_physp->p_remote_physp->port_num; 306321936Shselasky base_lid = 307321936Shselasky osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0); 308321936Shselasky } 309321936Shselasky base_lid = cl_ntoh16(base_lid); 310321936Shselasky 311321936Shselasky num_ports = p_sw->num_ports; 312321936Shselasky 313321936Shselasky least_hops = osm_switch_get_least_hops(p_sw, base_lid); 314321936Shselasky if (least_hops == OSM_NO_PATH) 315321936Shselasky return OSM_NO_PATH; 316321936Shselasky 317321936Shselasky /* 318321936Shselasky First, inquire with the forwarding table for an existing 319321936Shselasky route. If one is found, honor it unless: 320321936Shselasky 1. the ignore existing flag is set. 321321936Shselasky 2. the physical port is not a valid one or not healthy 322321936Shselasky 3. the physical port has a remote port (the link is up) 323321936Shselasky 4. the port has min-hops to the target (avoid loops) 324321936Shselasky */ 325321936Shselasky if (!ignore_existing) { 326321936Shselasky port_num = osm_switch_get_port_by_lid(p_sw, lid_ho, lft_enum); 327321936Shselasky 328321936Shselasky if (port_num != OSM_NO_PATH) { 329321936Shselasky CL_ASSERT(port_num < num_ports); 330321936Shselasky 331321936Shselasky p_physp = 332321936Shselasky osm_node_get_physp_ptr(p_sw->p_node, port_num); 333321936Shselasky /* 334321936Shselasky Don't be too trusting of the current forwarding table! 335321936Shselasky Verify that the port number is legal and that the 336321936Shselasky LID is reachable through this port. 337321936Shselasky */ 338321936Shselasky if (p_physp && osm_physp_is_healthy(p_physp) && 339321936Shselasky osm_physp_get_remote(p_physp)) { 340321936Shselasky hops = 341321936Shselasky osm_switch_get_hop_count(p_sw, base_lid, 342321936Shselasky port_num); 343321936Shselasky /* 344321936Shselasky If we aren't using pre-defined user routes 345321936Shselasky function, then we need to make sure that the 346321936Shselasky current path is the minimum one. In case of 347321936Shselasky having such a user function - this check will 348321936Shselasky not be done, and the old routing will be used. 349321936Shselasky Note: This means that it is the user's job to 350321936Shselasky clean all data in the forwarding tables that 351321936Shselasky he wants to be overridden by the minimum 352321936Shselasky hop function. 353321936Shselasky */ 354321936Shselasky if (hops == least_hops) 355321936Shselasky return port_num; 356321936Shselasky } 357321936Shselasky } 358321936Shselasky } 359321936Shselasky 360321936Shselasky /* 361321936Shselasky This algorithm selects a port based on a static load balanced 362321936Shselasky selection across equal hop-count ports. 363321936Shselasky There is lots of room for improved sophistication here, 364321936Shselasky possibly guided by user configuration info. 365321936Shselasky */ 366321936Shselasky 367321936Shselasky /* 368321936Shselasky OpenSM routing is "local" - not considering a full lid to lid 369321936Shselasky path. As such we can not guarantee a path will not loop if we 370321936Shselasky do not always follow least hops. 371321936Shselasky So we must abort if not least hops. 372321936Shselasky */ 373321936Shselasky 374321936Shselasky /* port number starts with one and num_ports is 1 + num phys ports */ 375321936Shselasky for (i = start_from; i < start_from + num_ports; i++) { 376321936Shselasky port_num = osm_switch_get_dimn_port(p_sw, i % num_ports); 377321936Shselasky if (!port_num || 378321936Shselasky osm_switch_get_hop_count(p_sw, base_lid, port_num) != 379321936Shselasky least_hops) 380321936Shselasky continue; 381321936Shselasky 382321936Shselasky /* let us make sure it is not down or unhealthy */ 383321936Shselasky p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num); 384321936Shselasky if (!p_physp || !osm_physp_is_healthy(p_physp) || 385321936Shselasky /* 386321936Shselasky we require all - non sma ports to be linked 387321936Shselasky to be routed through 388321936Shselasky */ 389321936Shselasky !osm_physp_get_remote(p_physp)) 390321936Shselasky continue; 391321936Shselasky 392321936Shselasky /* 393321936Shselasky We located a least-hop port, possibly one of many. 394321936Shselasky For this port, check the running total count of 395321936Shselasky the number of paths through this port. Select 396321936Shselasky the port routing the least number of paths. 397321936Shselasky */ 398321936Shselasky check_count = 399321936Shselasky osm_port_prof_path_count_get(&p_sw->p_prof[port_num]); 400321936Shselasky 401321936Shselasky 402321936Shselasky if (dor) { 403321936Shselasky /* Get the Remote Node */ 404321936Shselasky p_rem_physp = osm_physp_get_remote(p_physp); 405321936Shselasky p_rem_node = osm_physp_get_node_ptr(p_rem_physp); 406321936Shselasky /* use the first dimension, but spread traffic 407321936Shselasky * out among the group of ports representing 408321936Shselasky * that dimension */ 409321936Shselasky if (!p_rem_node_first) 410321936Shselasky p_rem_node_first = p_rem_node; 411321936Shselasky else if (p_rem_node != p_rem_node_first) 412321936Shselasky continue; 413321936Shselasky if (routing_for_lmc) { 414321936Shselasky struct osm_remote_guids_count *r = p_port->priv; 415321936Shselasky uint8_t rem_port = osm_physp_get_port_num(p_rem_physp); 416321936Shselasky unsigned int j; 417321936Shselasky 418321936Shselasky for (j = 0; j < r->count; j++) { 419321936Shselasky p_remote_guid = &r->guids[j]; 420321936Shselasky if ((p_remote_guid->node == p_rem_node) 421321936Shselasky && (p_remote_guid->port == rem_port)) 422321936Shselasky break; 423321936Shselasky } 424321936Shselasky if (j == r->count) 425321936Shselasky p_remote_guid = &null_remote_node; 426321936Shselasky } 427321936Shselasky /* 428321936Shselasky Advanced LMC routing requires tracking of the 429321936Shselasky best port by the node connected to the other side of 430321936Shselasky it. 431321936Shselasky */ 432321936Shselasky } else if (routing_for_lmc) { 433321936Shselasky /* Is the sys guid already used ? */ 434321936Shselasky p_remote_guid = switch_find_sys_guid_count(p_sw, 435321936Shselasky p_port->priv, 436321936Shselasky port_num); 437321936Shselasky 438321936Shselasky /* If not update the least hops for this case */ 439321936Shselasky if (!p_remote_guid) { 440321936Shselasky if (check_count < least_paths_other_sys) { 441321936Shselasky least_paths_other_sys = check_count; 442321936Shselasky best_port_other_sys = port_num; 443321936Shselasky least_forwarded_to = 0; 444321936Shselasky } 445321936Shselasky found_sys_guid = 0; 446321936Shselasky } else { /* same sys found - try node */ 447321936Shselasky 448321936Shselasky 449321936Shselasky /* Else is the node guid already used ? */ 450321936Shselasky p_remote_guid = switch_find_node_guid_count(p_sw, 451321936Shselasky p_port->priv, 452321936Shselasky port_num); 453321936Shselasky 454321936Shselasky /* If not update the least hops for this case */ 455321936Shselasky if (!p_remote_guid 456321936Shselasky && check_count < least_paths_other_nodes) { 457321936Shselasky least_paths_other_nodes = check_count; 458321936Shselasky best_port_other_node = port_num; 459321936Shselasky least_forwarded_to = 0; 460321936Shselasky } 461321936Shselasky /* else prior sys and node guid already used */ 462321936Shselasky 463321936Shselasky if (!p_remote_guid) 464321936Shselasky found_node_guid = 0; 465321936Shselasky else 466321936Shselasky found_node_guid = 1; 467321936Shselasky found_sys_guid = 1; 468321936Shselasky } /* same sys found */ 469321936Shselasky } 470321936Shselasky 471321936Shselasky port_paths[port_paths_count].port_num = port_num; 472321936Shselasky port_paths[port_paths_count].path_count = check_count; 473321936Shselasky if (routing_for_lmc) { 474321936Shselasky port_paths[port_paths_count].found_sys_guid = found_sys_guid; 475321936Shselasky port_paths[port_paths_count].found_node_guid = found_node_guid; 476321936Shselasky } 477321936Shselasky if (routing_for_lmc && p_remote_guid) 478321936Shselasky port_paths[port_paths_count].forwarded_to = p_remote_guid->forwarded_to; 479321936Shselasky else 480321936Shselasky port_paths[port_paths_count].forwarded_to = 0; 481321936Shselasky port_paths_total_paths += check_count; 482321936Shselasky port_paths_count++; 483321936Shselasky 484321936Shselasky /* routing for LMC mode */ 485321936Shselasky /* 486321936Shselasky the count is min but also lower then the max subscribed 487321936Shselasky */ 488321936Shselasky if (check_count < least_paths) { 489321936Shselasky port_found = TRUE; 490321936Shselasky best_port = port_num; 491321936Shselasky least_paths = check_count; 492321936Shselasky scatter_possible_ports_count = 0; 493321936Shselasky scatter_possible_ports[scatter_possible_ports_count++] = port_num; 494321936Shselasky if (routing_for_lmc 495321936Shselasky && p_remote_guid 496321936Shselasky && p_remote_guid->forwarded_to < least_forwarded_to) 497321936Shselasky least_forwarded_to = p_remote_guid->forwarded_to; 498321936Shselasky } else if (scatter_ports 499321936Shselasky && check_count == least_paths) { 500321936Shselasky scatter_possible_ports[scatter_possible_ports_count++] = port_num; 501321936Shselasky } else if (routing_for_lmc 502321936Shselasky && p_remote_guid 503321936Shselasky && check_count == least_paths 504321936Shselasky && p_remote_guid->forwarded_to < least_forwarded_to) { 505321936Shselasky least_forwarded_to = p_remote_guid->forwarded_to; 506321936Shselasky best_port = port_num; 507321936Shselasky } 508321936Shselasky } 509321936Shselasky 510321936Shselasky if (port_found == FALSE) 511321936Shselasky return OSM_NO_PATH; 512321936Shselasky 513321936Shselasky if (port_shifting && port_paths_count) { 514321936Shselasky /* In the port_paths[] array, we now have all the ports that we 515321936Shselasky * can route out of. Using some shifting math below, possibly 516321936Shselasky * select a different one so that lids won't align in LFTs 517321936Shselasky * 518321936Shselasky * If lmc > 0, we need to loop through these ports to find the 519321936Shselasky * least_forwarded_to port, best_port_other_sys, and 520321936Shselasky * best_port_other_node just like before but through the different 521321936Shselasky * ordering. 522321936Shselasky */ 523321936Shselasky 524321936Shselasky least_paths = 0xFFFFFFFF; 525321936Shselasky least_paths_other_sys = 0xFFFFFFFF; 526321936Shselasky least_paths_other_nodes = 0xFFFFFFFF; 527321936Shselasky least_forwarded_to = 0xFFFFFFFF; 528321936Shselasky best_port = 0; 529321936Shselasky best_port_other_sys = 0; 530321936Shselasky best_port_other_node = 0; 531321936Shselasky 532321936Shselasky for (i = 0; i < port_paths_count; i++) { 533321936Shselasky unsigned int idx; 534321936Shselasky 535321936Shselasky idx = (port_paths_total_paths/port_paths_count + i) % port_paths_count; 536321936Shselasky 537321936Shselasky if (routing_for_lmc) { 538321936Shselasky if (!port_paths[idx].found_sys_guid 539321936Shselasky && port_paths[idx].path_count < least_paths_other_sys) { 540321936Shselasky least_paths_other_sys = port_paths[idx].path_count; 541321936Shselasky best_port_other_sys = port_paths[idx].port_num; 542321936Shselasky least_forwarded_to = 0; 543321936Shselasky } 544321936Shselasky else if (!port_paths[idx].found_node_guid 545321936Shselasky && port_paths[idx].path_count < least_paths_other_nodes) { 546321936Shselasky least_paths_other_nodes = port_paths[idx].path_count; 547321936Shselasky best_port_other_node = port_paths[idx].port_num; 548321936Shselasky least_forwarded_to = 0; 549321936Shselasky } 550321936Shselasky } 551321936Shselasky 552321936Shselasky if (port_paths[idx].path_count < least_paths) { 553321936Shselasky best_port = port_paths[idx].port_num; 554321936Shselasky least_paths = port_paths[idx].path_count; 555321936Shselasky if (routing_for_lmc 556321936Shselasky && (port_paths[idx].found_sys_guid 557321936Shselasky || port_paths[idx].found_node_guid) 558321936Shselasky && port_paths[idx].forwarded_to < least_forwarded_to) 559321936Shselasky least_forwarded_to = port_paths[idx].forwarded_to; 560321936Shselasky } 561321936Shselasky else if (routing_for_lmc 562321936Shselasky && (port_paths[idx].found_sys_guid 563321936Shselasky || port_paths[idx].found_node_guid) 564321936Shselasky && port_paths[idx].path_count == least_paths 565321936Shselasky && port_paths[idx].forwarded_to < least_forwarded_to) { 566321936Shselasky least_forwarded_to = port_paths[idx].forwarded_to; 567321936Shselasky best_port = port_paths[idx].port_num; 568321936Shselasky } 569321936Shselasky 570321936Shselasky } 571321936Shselasky } 572321936Shselasky 573321936Shselasky /* 574321936Shselasky if we are in enhanced routing mode and the best port is not 575321936Shselasky the local port 0 576321936Shselasky */ 577321936Shselasky if (routing_for_lmc && best_port && !scatter_ports) { 578321936Shselasky /* Select the least hop port of the non used sys first */ 579321936Shselasky if (best_port_other_sys) 580321936Shselasky best_port = best_port_other_sys; 581321936Shselasky else if (best_port_other_node) 582321936Shselasky best_port = best_port_other_node; 583321936Shselasky } else if (scatter_ports) { 584321936Shselasky /* 585321936Shselasky * There is some danger that this random could "rebalance" the routes 586321936Shselasky * every time, to combat this there is a global srandom that 587321936Shselasky * occurs at the start of every sweep. 588321936Shselasky */ 589321936Shselasky unsigned int idx = random() % scatter_possible_ports_count; 590321936Shselasky best_port = scatter_possible_ports[idx]; 591321936Shselasky } 592321936Shselasky return best_port; 593321936Shselasky} 594321936Shselasky 595321936Shselaskyvoid osm_switch_clear_hops(IN osm_switch_t * p_sw) 596321936Shselasky{ 597321936Shselasky unsigned i; 598321936Shselasky 599321936Shselasky for (i = 0; i < p_sw->num_hops; i++) 600321936Shselasky if (p_sw->hops[i]) 601321936Shselasky memset(p_sw->hops[i], OSM_NO_PATH, p_sw->num_ports); 602321936Shselasky} 603321936Shselasky 604321936Shselaskystatic int alloc_lft(IN osm_switch_t * p_sw, uint16_t lids) 605321936Shselasky{ 606321936Shselasky uint16_t lft_size; 607321936Shselasky 608321936Shselasky /* Ensure LFT is in units of LFT block size */ 609321936Shselasky lft_size = (lids / IB_SMP_DATA_SIZE + 1) * IB_SMP_DATA_SIZE; 610321936Shselasky if (lft_size > p_sw->lft_size) { 611321936Shselasky uint8_t *new_lft = realloc(p_sw->lft, lft_size); 612321936Shselasky if (!new_lft) 613321936Shselasky return -1; 614321936Shselasky memset(new_lft + p_sw->lft_size, OSM_NO_PATH, 615321936Shselasky lft_size - p_sw->lft_size); 616321936Shselasky p_sw->lft = new_lft; 617321936Shselasky p_sw->lft_size = lft_size; 618321936Shselasky } 619321936Shselasky 620321936Shselasky return 0; 621321936Shselasky} 622321936Shselasky 623321936Shselaskyint osm_switch_prepare_path_rebuild(IN osm_switch_t * p_sw, IN uint16_t max_lids) 624321936Shselasky{ 625321936Shselasky uint8_t **hops; 626321936Shselasky uint8_t *new_lft; 627321936Shselasky unsigned i; 628321936Shselasky 629321936Shselasky if (alloc_lft(p_sw, max_lids)) 630321936Shselasky return -1; 631321936Shselasky 632321936Shselasky for (i = 0; i < p_sw->num_ports; i++) 633321936Shselasky osm_port_prof_construct(&p_sw->p_prof[i]); 634321936Shselasky 635321936Shselasky osm_switch_clear_hops(p_sw); 636321936Shselasky 637321936Shselasky if (!(new_lft = realloc(p_sw->new_lft, p_sw->lft_size))) 638321936Shselasky return -1; 639321936Shselasky 640321936Shselasky p_sw->new_lft = new_lft; 641321936Shselasky 642321936Shselasky memset(p_sw->new_lft, OSM_NO_PATH, p_sw->lft_size); 643321936Shselasky 644321936Shselasky if (!p_sw->hops) { 645321936Shselasky hops = malloc((max_lids + 1) * sizeof(hops[0])); 646321936Shselasky if (!hops) 647321936Shselasky return -1; 648321936Shselasky memset(hops, 0, (max_lids + 1) * sizeof(hops[0])); 649321936Shselasky p_sw->hops = hops; 650321936Shselasky p_sw->num_hops = max_lids + 1; 651321936Shselasky } else if (max_lids + 1 > p_sw->num_hops) { 652321936Shselasky hops = realloc(p_sw->hops, (max_lids + 1) * sizeof(hops[0])); 653321936Shselasky if (!hops) 654321936Shselasky return -1; 655321936Shselasky memset(hops + p_sw->num_hops, 0, 656321936Shselasky (max_lids + 1 - p_sw->num_hops) * sizeof(hops[0])); 657321936Shselasky p_sw->hops = hops; 658321936Shselasky p_sw->num_hops = max_lids + 1; 659321936Shselasky } 660321936Shselasky p_sw->max_lid_ho = max_lids; 661321936Shselasky 662321936Shselasky return 0; 663321936Shselasky} 664321936Shselasky 665321936Shselaskyuint8_t osm_switch_get_port_least_hops(IN const osm_switch_t * p_sw, 666321936Shselasky IN const osm_port_t * p_port) 667321936Shselasky{ 668321936Shselasky uint16_t lid; 669321936Shselasky 670321936Shselasky if (p_port->p_node->sw) { 671321936Shselasky if (p_port->p_node->sw == p_sw) 672321936Shselasky return 0; 673321936Shselasky lid = osm_node_get_base_lid(p_port->p_node, 0); 674321936Shselasky return osm_switch_get_least_hops(p_sw, cl_ntoh16(lid)); 675321936Shselasky } else { 676321936Shselasky osm_physp_t *p = p_port->p_physp; 677321936Shselasky uint8_t hops; 678321936Shselasky 679321936Shselasky if (!p || !p->p_remote_physp || !p->p_remote_physp->p_node->sw) 680321936Shselasky return OSM_NO_PATH; 681321936Shselasky if (p->p_remote_physp->p_node->sw == p_sw) 682321936Shselasky return 1; 683321936Shselasky lid = osm_node_get_base_lid(p->p_remote_physp->p_node, 0); 684321936Shselasky hops = osm_switch_get_least_hops(p_sw, cl_ntoh16(lid)); 685321936Shselasky return hops != OSM_NO_PATH ? hops + 1 : OSM_NO_PATH; 686321936Shselasky } 687321936Shselasky} 688321936Shselasky 689321936Shselaskyuint8_t osm_switch_recommend_mcast_path(IN osm_switch_t * p_sw, 690321936Shselasky IN osm_port_t * p_port, 691321936Shselasky IN uint16_t mlid_ho, 692321936Shselasky IN boolean_t ignore_existing) 693321936Shselasky{ 694321936Shselasky uint16_t base_lid; 695321936Shselasky uint8_t hops; 696321936Shselasky uint8_t port_num; 697321936Shselasky uint8_t num_ports; 698321936Shselasky uint8_t least_hops; 699321936Shselasky 700321936Shselasky CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO); 701321936Shselasky 702321936Shselasky if (p_port->p_node->sw) { 703321936Shselasky if (p_port->p_node->sw == p_sw) 704321936Shselasky return 0; 705321936Shselasky base_lid = osm_port_get_base_lid(p_port); 706321936Shselasky } else { 707321936Shselasky osm_physp_t *p_physp = p_port->p_physp; 708321936Shselasky if (!p_physp || !p_physp->p_remote_physp || 709321936Shselasky !p_physp->p_remote_physp->p_node->sw) 710321936Shselasky return OSM_NO_PATH; 711321936Shselasky if (p_physp->p_remote_physp->p_node->sw == p_sw) 712321936Shselasky return p_physp->p_remote_physp->port_num; 713321936Shselasky base_lid = 714321936Shselasky osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0); 715321936Shselasky } 716321936Shselasky base_lid = cl_ntoh16(base_lid); 717321936Shselasky num_ports = p_sw->num_ports; 718321936Shselasky 719321936Shselasky /* 720321936Shselasky If the user wants us to ignore existing multicast routes, 721321936Shselasky then simply return the shortest hop count path to the 722321936Shselasky target port. 723321936Shselasky 724321936Shselasky Otherwise, return the first port that has a path to the target, 725321936Shselasky picking from the ports that are already in the multicast group. 726321936Shselasky */ 727321936Shselasky if (!ignore_existing) { 728321936Shselasky for (port_num = 1; port_num < num_ports; port_num++) { 729321936Shselasky if (!osm_mcast_tbl_is_port 730321936Shselasky (&p_sw->mcast_tbl, mlid_ho, port_num)) 731321936Shselasky continue; 732321936Shselasky /* 733321936Shselasky Don't be too trusting of the current forwarding table! 734321936Shselasky Verify that the LID is reachable through this port. 735321936Shselasky */ 736321936Shselasky hops = 737321936Shselasky osm_switch_get_hop_count(p_sw, base_lid, port_num); 738321936Shselasky if (hops != OSM_NO_PATH) 739321936Shselasky return port_num; 740321936Shselasky } 741321936Shselasky } 742321936Shselasky 743321936Shselasky /* 744321936Shselasky Either no existing mcast paths reach this port or we are 745321936Shselasky ignoring existing paths. 746321936Shselasky 747321936Shselasky Determine the best multicast path to the target. Note that this 748321936Shselasky algorithm is slightly different from the one used for unicast route 749321936Shselasky recommendation. In this case (multicast), we must NOT 750321936Shselasky perform any sort of load balancing. We MUST take the FIRST 751321936Shselasky port found that has <= the lowest hop count path. This prevents 752321936Shselasky more than one multicast path to the same remote switch which 753321936Shselasky prevents a multicast loop. Multicast loops are bad since the same 754321936Shselasky multicast packet will go around and around, inevitably creating 755321936Shselasky a black hole that will destroy the Earth in a firey conflagration. 756321936Shselasky */ 757321936Shselasky least_hops = osm_switch_get_least_hops(p_sw, base_lid); 758321936Shselasky if (least_hops == OSM_NO_PATH) 759321936Shselasky return OSM_NO_PATH; 760321936Shselasky for (port_num = 1; port_num < num_ports; port_num++) 761321936Shselasky if (osm_switch_get_hop_count(p_sw, base_lid, port_num) == 762321936Shselasky least_hops) 763321936Shselasky break; 764321936Shselasky 765321936Shselasky CL_ASSERT(port_num < num_ports); 766321936Shselasky return port_num; 767321936Shselasky} 768