1/* 2 * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved. 3 * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the 10 * OpenIB.org BSD license below: 11 * 12 * Redistribution and use in source and binary forms, with or 13 * without modification, are permitted provided that the following 14 * conditions are met: 15 * 16 * - Redistributions of source code must retain the above 17 * copyright notice, this list of conditions and the following 18 * disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials 23 * provided with the distribution. 24 * 25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 32 * SOFTWARE. 33 * 34 */ 35 36/* 37 * Abstract: 38 * Implementation of osm_sm_state_mgr_t. 39 * This file implements the SM State Manager object. 40 */ 41 42#if HAVE_CONFIG_H 43# include <config.h> 44#endif /* HAVE_CONFIG_H */ 45 46#include <string.h> 47#include <time.h> 48#include <iba/ib_types.h> 49#include <complib/cl_passivelock.h> 50#include <complib/cl_debug.h> 51#include <opensm/osm_sm.h> 52#include <opensm/osm_madw.h> 53#include <opensm/osm_switch.h> 54#include <opensm/osm_log.h> 55#include <opensm/osm_subnet.h> 56#include <opensm/osm_helper.h> 57#include <opensm/osm_msgdef.h> 58#include <opensm/osm_node.h> 59#include <opensm/osm_port.h> 60#include <vendor/osm_vendor_api.h> 61#include <opensm/osm_helper.h> 62#include <opensm/osm_opensm.h> 63 64/********************************************************************** 65 **********************************************************************/ 66void osm_report_sm_state(osm_sm_t * sm) 67{ 68 char buf[64]; 69 const char *state_str = osm_get_sm_mgr_state_str(sm->p_subn->sm_state); 70 71 osm_log(sm->p_log, OSM_LOG_SYS, "Entering %s state\n", state_str); 72 snprintf(buf, sizeof(buf), "ENTERING SM %s STATE", state_str); 73 OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE, buf); 74} 75 76/********************************************************************** 77 **********************************************************************/ 78static void __osm_sm_state_mgr_send_master_sm_info_req(osm_sm_t * sm) 79{ 80 osm_madw_context_t context; 81 const osm_port_t *p_port; 82 ib_api_status_t status; 83 84 OSM_LOG_ENTER(sm->p_log); 85 86 memset(&context, 0, sizeof(context)); 87 if (sm->p_subn->sm_state == IB_SMINFO_STATE_STANDBY) { 88 /* 89 * We are in STANDBY state - this means we need to poll on the master 90 * SM (according to master_guid) 91 * Send a query of SubnGet(SMInfo) to the subn master_sm_base_lid object. 92 */ 93 p_port = osm_get_port_by_guid(sm->p_subn, sm->master_sm_guid); 94 } else { 95 /* 96 * We are not in STANDBY - this means we are in MASTER state - so we need 97 * to poll on the SM that is saved in p_polling_sm under sm. 98 * Send a query of SubnGet(SMInfo) to that SM. 99 */ 100 p_port = sm->p_polling_sm->p_port; 101 } 102 if (p_port == NULL) { 103 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3203: " 104 "No port object for GUID 0x%016" PRIx64 "\n", 105 cl_ntoh64(sm->master_sm_guid)); 106 goto Exit; 107 } 108 109 context.smi_context.port_guid = p_port->guid; 110 context.smi_context.set_method = FALSE; 111 112 status = osm_req_get(sm, osm_physp_get_dr_path_ptr(p_port->p_physp), 113 IB_MAD_ATTR_SM_INFO, 0, CL_DISP_MSGID_NONE, 114 &context); 115 116 if (status != IB_SUCCESS) 117 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3204: " 118 "Failure requesting SMInfo (%s)\n", 119 ib_get_err_str(status)); 120 121Exit: 122 OSM_LOG_EXIT(sm->p_log); 123} 124 125/********************************************************************** 126 **********************************************************************/ 127static void __osm_sm_state_mgr_start_polling(osm_sm_t * sm) 128{ 129 uint32_t timeout = sm->p_subn->opt.sminfo_polling_timeout; 130 cl_status_t cl_status; 131 132 OSM_LOG_ENTER(sm->p_log); 133 134 /* 135 * Init the retry_number back to zero - need to restart counting 136 */ 137 sm->retry_number = 0; 138 139 /* 140 * Send a SubnGet(SMInfo) query to the current (or new) master found. 141 */ 142 __osm_sm_state_mgr_send_master_sm_info_req(sm); 143 144 /* 145 * Start a timer that will wake up every sminfo_polling_timeout milliseconds. 146 * The callback of the timer will send a SubnGet(SMInfo) to the Master SM 147 * and restart the timer 148 */ 149 cl_status = cl_timer_start(&sm->polling_timer, timeout); 150 if (cl_status != CL_SUCCESS) 151 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3210: " 152 "Failed to start timer\n"); 153 154 OSM_LOG_EXIT(sm->p_log); 155} 156 157/********************************************************************** 158 **********************************************************************/ 159void osm_sm_state_mgr_polling_callback(IN void *context) 160{ 161 osm_sm_t *sm = context; 162 uint32_t timeout = sm->p_subn->opt.sminfo_polling_timeout; 163 cl_status_t cl_status; 164 165 OSM_LOG_ENTER(sm->p_log); 166 167 /* 168 * We can be here in one of two cases: 169 * 1. We are a STANDBY sm polling on the master SM. 170 * 2. We are a MASTER sm, waiting for a handover from a remote master sm. 171 * If we are not in one of these cases - don't need to restart the poller. 172 */ 173 if (!((sm->p_subn->sm_state == IB_SMINFO_STATE_MASTER && 174 sm->p_polling_sm != NULL) || 175 (sm->p_subn->sm_state == IB_SMINFO_STATE_STANDBY))) 176 goto Exit; 177 178 /* 179 * If we are a STANDBY sm and the osm_exit_flag is set, then let's 180 * signal the subnet_up. This is relevant for the case of running only 181 * once. In that case - the program is stuck until this signal is 182 * received. In other cases - it is not relevant whether or not the 183 * signal is on - since we are currently in exit flow 184 */ 185 if (sm->p_subn->sm_state == IB_SMINFO_STATE_STANDBY && osm_exit_flag) { 186 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 187 "Signalling subnet_up_event\n"); 188 cl_event_signal(&sm->subnet_up_event); 189 goto Exit; 190 } 191 192 /* 193 * Incr the retry number. 194 * If it reached the max_retry_number in the subnet opt - call 195 * osm_sm_state_mgr_process with signal OSM_SM_SIGNAL_POLLING_TIMEOUT 196 */ 197 sm->retry_number++; 198 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 199 "Retry number:%d\n", sm->retry_number); 200 201 if (sm->retry_number >= sm->p_subn->opt.polling_retry_number) { 202 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 203 "Reached polling_retry_number value in retry_number. " 204 "Go to DISCOVERY state\n"); 205 osm_sm_state_mgr_process(sm, OSM_SM_SIGNAL_POLLING_TIMEOUT); 206 goto Exit; 207 } 208 209 /* Send a SubnGet(SMInfo) request to the remote sm (depends on our state) */ 210 __osm_sm_state_mgr_send_master_sm_info_req(sm); 211 212 /* restart the timer */ 213 cl_status = cl_timer_start(&sm->polling_timer, timeout); 214 if (cl_status != CL_SUCCESS) 215 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3211: " 216 "Failed to restart timer\n"); 217 218Exit: 219 OSM_LOG_EXIT(sm->p_log); 220 return; 221} 222 223/********************************************************************** 224 **********************************************************************/ 225static void __osm_sm_state_mgr_signal_error(osm_sm_t * sm, 226 IN const osm_sm_signal_t signal) 227{ 228 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3207: " 229 "Invalid signal %s in state %s\n", 230 osm_get_sm_mgr_signal_str(signal), 231 osm_get_sm_mgr_state_str(sm->p_subn->sm_state)); 232} 233 234/********************************************************************** 235 **********************************************************************/ 236void osm_sm_state_mgr_signal_master_is_alive(osm_sm_t * sm) 237{ 238 OSM_LOG_ENTER(sm->p_log); 239 sm->retry_number = 0; 240 OSM_LOG_EXIT(sm->p_log); 241} 242 243/********************************************************************** 244 **********************************************************************/ 245ib_api_status_t osm_sm_state_mgr_process(osm_sm_t * sm, 246 IN osm_sm_signal_t signal) 247{ 248 ib_api_status_t status = IB_SUCCESS; 249 250 CL_ASSERT(sm); 251 252 OSM_LOG_ENTER(sm->p_log); 253 254 /* 255 * The state lock prevents many race conditions from screwing 256 * up the state transition process. 257 */ 258 cl_spinlock_acquire(&sm->state_lock); 259 260 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 261 "Received signal %s in state %s\n", 262 osm_get_sm_mgr_signal_str(signal), 263 osm_get_sm_mgr_state_str(sm->p_subn->sm_state)); 264 265 switch (sm->p_subn->sm_state) { 266 case IB_SMINFO_STATE_DISCOVERING: 267 switch (signal) { 268 case OSM_SM_SIGNAL_DISCOVERY_COMPLETED: 269 /* 270 * Update the state of the SM to MASTER 271 */ 272 /* Turn on the first_time_master_sweep flag */ 273 sm->p_subn->first_time_master_sweep = TRUE; 274 sm->p_subn->sm_state = IB_SMINFO_STATE_MASTER; 275 osm_report_sm_state(sm); 276 /* 277 * Make sure to set the subnet master_sm_base_lid 278 * to the sm_base_lid value 279 */ 280 sm->p_subn->master_sm_base_lid = 281 sm->p_subn->sm_base_lid; 282 break; 283 case OSM_SM_SIGNAL_MASTER_OR_HIGHER_SM_DETECTED: 284 /* 285 * Finished all discovery actions - move to STANDBY 286 * start the polling 287 */ 288 sm->p_subn->sm_state = IB_SMINFO_STATE_STANDBY; 289 osm_report_sm_state(sm); 290 /* 291 * Since another SM is doing the LFT config - we should not 292 * ignore the results of it 293 */ 294 sm->p_subn->ignore_existing_lfts = FALSE; 295 296 __osm_sm_state_mgr_start_polling(sm); 297 break; 298 case OSM_SM_SIGNAL_HANDOVER: 299 /* 300 * Do nothing. We will discover it later on. If we already discovered 301 * this SM, and got the HANDOVER - this means the remote SM is of 302 * lower priority. In this case we will stop polling it (since it is 303 * a lower priority SM in STANDBY state). 304 */ 305 break; 306 default: 307 __osm_sm_state_mgr_signal_error(sm, signal); 308 status = IB_INVALID_PARAMETER; 309 break; 310 } 311 break; 312 313 case IB_SMINFO_STATE_STANDBY: 314 switch (signal) { 315 case OSM_SM_SIGNAL_POLLING_TIMEOUT: 316 case OSM_SM_SIGNAL_DISCOVER: 317 /* 318 * case 1: Polling timeout occured - this means that the Master SM 319 * is no longer alive. 320 * case 2: Got a signal to move to DISCOVERING 321 * Move to DISCOVERING state and start sweeping 322 */ 323 sm->p_subn->sm_state = IB_SMINFO_STATE_DISCOVERING; 324 osm_report_sm_state(sm); 325 sm->p_subn->coming_out_of_standby = TRUE; 326 osm_sm_signal(sm, OSM_SIGNAL_SWEEP); 327 break; 328 case OSM_SM_SIGNAL_DISABLE: 329 /* 330 * Update the state to NOT_ACTIVE 331 */ 332 sm->p_subn->sm_state = IB_SMINFO_STATE_NOTACTIVE; 333 osm_report_sm_state(sm); 334 osm_vendor_set_sm(sm->mad_ctrl.h_bind, FALSE); 335 break; 336 case OSM_SM_SIGNAL_HANDOVER: 337 /* 338 * Update the state to MASTER, and start sweeping 339 * OPTIONAL: send ACKNOWLEDGE 340 */ 341 /* Turn on the first_time_master_sweep flag */ 342 sm->p_subn->first_time_master_sweep = TRUE; 343 /* Turn on the force_heavy_sweep - we want a 344 * heavy sweep to occur on the first sweep of this SM. */ 345 sm->p_subn->force_heavy_sweep = TRUE; 346 347 sm->p_subn->sm_state = IB_SMINFO_STATE_MASTER; 348 osm_report_sm_state(sm); 349 /* 350 * Make sure to set the subnet master_sm_base_lid 351 * to the sm_base_lid value 352 */ 353 sm->p_subn->master_sm_base_lid = 354 sm->p_subn->sm_base_lid; 355 sm->p_subn->coming_out_of_standby = TRUE; 356 osm_sm_signal(sm, OSM_SIGNAL_SWEEP); 357 break; 358 case OSM_SM_SIGNAL_ACKNOWLEDGE: 359 /* 360 * Do nothing - already moved to STANDBY 361 */ 362 break; 363 default: 364 __osm_sm_state_mgr_signal_error(sm, signal); 365 status = IB_INVALID_PARAMETER; 366 break; 367 } 368 break; 369 370 case IB_SMINFO_STATE_NOTACTIVE: 371 switch (signal) { 372 case OSM_SM_SIGNAL_STANDBY: 373 /* 374 * Update the state to STANDBY 375 * start the polling 376 */ 377 sm->p_subn->sm_state = IB_SMINFO_STATE_STANDBY; 378 osm_report_sm_state(sm); 379 __osm_sm_state_mgr_start_polling(sm); 380 break; 381 default: 382 __osm_sm_state_mgr_signal_error(sm, signal); 383 status = IB_INVALID_PARAMETER; 384 break; 385 } 386 break; 387 388 case IB_SMINFO_STATE_MASTER: 389 switch (signal) { 390 case OSM_SM_SIGNAL_POLLING_TIMEOUT: 391 /* 392 * we received a polling timeout - this means that we waited for 393 * a remote master sm to send us a handover, but didn't get it, and 394 * didn't get a response from that remote sm. 395 * We want to force a heavy sweep - hopefully this occurred because 396 * the remote sm died, and we'll find this out and configure the 397 * subnet after a heavy sweep. 398 * We also want to clear the p_polling_sm object - since we are 399 * done polling on that remote sm - we are sweeping again. 400 */ 401 case OSM_SM_SIGNAL_HANDOVER: 402 /* 403 * If we received a handover in a master state - then we want to 404 * force a heavy sweep. This means that either we are in a sweep 405 * currently - in this case - no change, or we are in idle state - 406 * since we recognized a master SM before - so we want to make a 407 * heavy sweep and reconfigure the new subnet. 408 * We also want to clear the p_polling_sm object - since we are 409 * done polling on that remote sm - we got a handover from it. 410 */ 411 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 412 "Forcing heavy sweep. " 413 "Received OSM_SM_SIGNAL_HANDOVER or OSM_SM_SIGNAL_POLLING_TIMEOUT\n"); 414 sm->p_polling_sm = NULL; 415 sm->p_subn->force_heavy_sweep = TRUE; 416 osm_sm_signal(sm, OSM_SIGNAL_SWEEP); 417 break; 418 case OSM_SM_SIGNAL_HANDOVER_SENT: 419 /* 420 * Just sent a HANDOVER signal - move to STANDBY 421 * start the polling 422 */ 423 sm->p_subn->sm_state = IB_SMINFO_STATE_STANDBY; 424 osm_report_sm_state(sm); 425 __osm_sm_state_mgr_start_polling(sm); 426 break; 427 case OSM_SM_SIGNAL_WAIT_FOR_HANDOVER: 428 /* 429 * We found a remote master SM, and we are waiting for it 430 * to handover the mastership to us. Need to start polling 431 * on that SM, to make sure it is alive, if it isn't - then 432 * we should move back to discovering, since something must 433 * have happened to it. 434 */ 435 __osm_sm_state_mgr_start_polling(sm); 436 break; 437 case OSM_SM_SIGNAL_DISCOVER: 438 sm->p_subn->sm_state = IB_SMINFO_STATE_DISCOVERING; 439 osm_report_sm_state(sm); 440 break; 441 default: 442 __osm_sm_state_mgr_signal_error(sm, signal); 443 status = IB_INVALID_PARAMETER; 444 break; 445 } 446 break; 447 448 default: 449 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3208: " 450 "Invalid state %s\n", 451 osm_get_sm_mgr_state_str(sm->p_subn->sm_state)); 452 453 } 454 455 cl_spinlock_release(&sm->state_lock); 456 457 OSM_LOG_EXIT(sm->p_log); 458 return (status); 459} 460 461/********************************************************************** 462 **********************************************************************/ 463ib_api_status_t osm_sm_state_mgr_check_legality(osm_sm_t * sm, 464 IN osm_sm_signal_t signal) 465{ 466 ib_api_status_t status = IB_SUCCESS; 467 468 CL_ASSERT(sm); 469 470 OSM_LOG_ENTER(sm->p_log); 471 472 /* 473 * The state lock prevents many race conditions from screwing 474 * up the state transition process. 475 */ 476 cl_spinlock_acquire(&sm->state_lock); 477 478 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 479 "Received signal %s in state %s\n", 480 osm_get_sm_mgr_signal_str(signal), 481 osm_get_sm_mgr_state_str(sm->p_subn->sm_state)); 482 483 switch (sm->p_subn->sm_state) { 484 case IB_SMINFO_STATE_DISCOVERING: 485 switch (signal) { 486 case OSM_SM_SIGNAL_DISCOVERY_COMPLETED: 487 case OSM_SM_SIGNAL_MASTER_OR_HIGHER_SM_DETECTED: 488 case OSM_SM_SIGNAL_HANDOVER: 489 status = IB_SUCCESS; 490 break; 491 default: 492 __osm_sm_state_mgr_signal_error(sm, signal); 493 status = IB_INVALID_PARAMETER; 494 break; 495 } 496 break; 497 498 case IB_SMINFO_STATE_STANDBY: 499 switch (signal) { 500 case OSM_SM_SIGNAL_POLLING_TIMEOUT: 501 case OSM_SM_SIGNAL_DISCOVER: 502 case OSM_SM_SIGNAL_DISABLE: 503 case OSM_SM_SIGNAL_HANDOVER: 504 case OSM_SM_SIGNAL_ACKNOWLEDGE: 505 status = IB_SUCCESS; 506 break; 507 default: 508 __osm_sm_state_mgr_signal_error(sm, signal); 509 status = IB_INVALID_PARAMETER; 510 break; 511 } 512 break; 513 514 case IB_SMINFO_STATE_NOTACTIVE: 515 switch (signal) { 516 case OSM_SM_SIGNAL_STANDBY: 517 status = IB_SUCCESS; 518 break; 519 default: 520 __osm_sm_state_mgr_signal_error(sm, signal); 521 status = IB_INVALID_PARAMETER; 522 break; 523 } 524 break; 525 526 case IB_SMINFO_STATE_MASTER: 527 switch (signal) { 528 case OSM_SM_SIGNAL_HANDOVER: 529 case OSM_SM_SIGNAL_HANDOVER_SENT: 530 status = IB_SUCCESS; 531 break; 532 default: 533 __osm_sm_state_mgr_signal_error(sm, signal); 534 status = IB_INVALID_PARAMETER; 535 break; 536 } 537 break; 538 539 default: 540 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3209: " 541 "Invalid state %s\n", 542 osm_get_sm_mgr_state_str(sm->p_subn->sm_state)); 543 status = IB_INVALID_PARAMETER; 544 545 } 546 547 cl_spinlock_release(&sm->state_lock); 548 549 OSM_LOG_EXIT(sm->p_log); 550 return (status); 551} 552