1/*
2 * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5 *
6 * This software is available to you under a choice of one of two
7 * licenses.  You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
11 *
12 *     Redistribution and use in source and binary forms, with or
13 *     without modification, are permitted provided that the following
14 *     conditions are met:
15 *
16 *      - Redistributions of source code must retain the above
17 *        copyright notice, this list of conditions and the following
18 *        disclaimer.
19 *
20 *      - Redistributions in binary form must reproduce the above
21 *        copyright notice, this list of conditions and the following
22 *        disclaimer in the documentation and/or other materials
23 *        provided with the distribution.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 * SOFTWARE.
33 *
34 */
35
36/*
37 * Abstract:
38 *    Implementation of osm_sm_state_mgr_t.
39 * This file implements the SM State Manager object.
40 */
41
42#if HAVE_CONFIG_H
43#  include <config.h>
44#endif				/* HAVE_CONFIG_H */
45
46#include <string.h>
47#include <time.h>
48#include <iba/ib_types.h>
49#include <complib/cl_passivelock.h>
50#include <complib/cl_debug.h>
51#include <opensm/osm_sm.h>
52#include <opensm/osm_madw.h>
53#include <opensm/osm_switch.h>
54#include <opensm/osm_log.h>
55#include <opensm/osm_subnet.h>
56#include <opensm/osm_helper.h>
57#include <opensm/osm_msgdef.h>
58#include <opensm/osm_node.h>
59#include <opensm/osm_port.h>
60#include <vendor/osm_vendor_api.h>
61#include <opensm/osm_helper.h>
62#include <opensm/osm_opensm.h>
63
64/**********************************************************************
65 **********************************************************************/
66void osm_report_sm_state(osm_sm_t * sm)
67{
68	char buf[64];
69	const char *state_str = osm_get_sm_mgr_state_str(sm->p_subn->sm_state);
70
71	osm_log(sm->p_log, OSM_LOG_SYS, "Entering %s state\n", state_str);
72	snprintf(buf, sizeof(buf), "ENTERING SM %s STATE", state_str);
73	OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE, buf);
74}
75
76/**********************************************************************
77 **********************************************************************/
78static void __osm_sm_state_mgr_send_master_sm_info_req(osm_sm_t * sm)
79{
80	osm_madw_context_t context;
81	const osm_port_t *p_port;
82	ib_api_status_t status;
83
84	OSM_LOG_ENTER(sm->p_log);
85
86	memset(&context, 0, sizeof(context));
87	if (sm->p_subn->sm_state == IB_SMINFO_STATE_STANDBY) {
88		/*
89		 * We are in STANDBY state - this means we need to poll on the master
90		 * SM (according to master_guid)
91		 * Send a query of SubnGet(SMInfo) to the subn master_sm_base_lid object.
92		 */
93		p_port = osm_get_port_by_guid(sm->p_subn, sm->master_sm_guid);
94	} else {
95		/*
96		 * We are not in STANDBY - this means we are in MASTER state - so we need
97		 * to poll on the SM that is saved in p_polling_sm under sm.
98		 * Send a query of SubnGet(SMInfo) to that SM.
99		 */
100		p_port = sm->p_polling_sm->p_port;
101	}
102	if (p_port == NULL) {
103		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3203: "
104			"No port object for GUID 0x%016" PRIx64 "\n",
105			cl_ntoh64(sm->master_sm_guid));
106		goto Exit;
107	}
108
109	context.smi_context.port_guid = p_port->guid;
110	context.smi_context.set_method = FALSE;
111
112	status = osm_req_get(sm, osm_physp_get_dr_path_ptr(p_port->p_physp),
113			     IB_MAD_ATTR_SM_INFO, 0, CL_DISP_MSGID_NONE,
114			     &context);
115
116	if (status != IB_SUCCESS)
117		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3204: "
118			"Failure requesting SMInfo (%s)\n",
119			ib_get_err_str(status));
120
121Exit:
122	OSM_LOG_EXIT(sm->p_log);
123}
124
125/**********************************************************************
126 **********************************************************************/
127static void __osm_sm_state_mgr_start_polling(osm_sm_t * sm)
128{
129	uint32_t timeout = sm->p_subn->opt.sminfo_polling_timeout;
130	cl_status_t cl_status;
131
132	OSM_LOG_ENTER(sm->p_log);
133
134	/*
135	 * Init the retry_number back to zero - need to restart counting
136	 */
137	sm->retry_number = 0;
138
139	/*
140	 * Send a SubnGet(SMInfo) query to the current (or new) master found.
141	 */
142	__osm_sm_state_mgr_send_master_sm_info_req(sm);
143
144	/*
145	 * Start a timer that will wake up every sminfo_polling_timeout milliseconds.
146	 * The callback of the timer will send a SubnGet(SMInfo) to the Master SM
147	 * and restart the timer
148	 */
149	cl_status = cl_timer_start(&sm->polling_timer, timeout);
150	if (cl_status != CL_SUCCESS)
151		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3210: "
152			"Failed to start timer\n");
153
154	OSM_LOG_EXIT(sm->p_log);
155}
156
157/**********************************************************************
158 **********************************************************************/
159void osm_sm_state_mgr_polling_callback(IN void *context)
160{
161	osm_sm_t *sm = context;
162	uint32_t timeout = sm->p_subn->opt.sminfo_polling_timeout;
163	cl_status_t cl_status;
164
165	OSM_LOG_ENTER(sm->p_log);
166
167	/*
168	 * We can be here in one of two cases:
169	 * 1. We are a STANDBY sm polling on the master SM.
170	 * 2. We are a MASTER sm, waiting for a handover from a remote master sm.
171	 * If we are not in one of these cases - don't need to restart the poller.
172	 */
173	if (!((sm->p_subn->sm_state == IB_SMINFO_STATE_MASTER &&
174	       sm->p_polling_sm != NULL) ||
175	      (sm->p_subn->sm_state == IB_SMINFO_STATE_STANDBY)))
176		goto Exit;
177
178	/*
179	 * If we are a STANDBY sm and the osm_exit_flag is set, then let's
180	 * signal the subnet_up. This is relevant for the case of running only
181	 * once. In that case - the program is stuck until this signal is
182	 * received. In other cases - it is not relevant whether or not the
183	 * signal is on - since we are currently in exit flow
184	 */
185	if (sm->p_subn->sm_state == IB_SMINFO_STATE_STANDBY && osm_exit_flag) {
186		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
187			"Signalling subnet_up_event\n");
188		cl_event_signal(&sm->subnet_up_event);
189		goto Exit;
190	}
191
192	/*
193	 * Incr the retry number.
194	 * If it reached the max_retry_number in the subnet opt - call
195	 * osm_sm_state_mgr_process with signal OSM_SM_SIGNAL_POLLING_TIMEOUT
196	 */
197	sm->retry_number++;
198	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
199		"Retry number:%d\n", sm->retry_number);
200
201	if (sm->retry_number >= sm->p_subn->opt.polling_retry_number) {
202		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
203			"Reached polling_retry_number value in retry_number. "
204			"Go to DISCOVERY state\n");
205		osm_sm_state_mgr_process(sm, OSM_SM_SIGNAL_POLLING_TIMEOUT);
206		goto Exit;
207	}
208
209	/* Send a SubnGet(SMInfo) request to the remote sm (depends on our state) */
210	__osm_sm_state_mgr_send_master_sm_info_req(sm);
211
212	/* restart the timer */
213	cl_status = cl_timer_start(&sm->polling_timer, timeout);
214	if (cl_status != CL_SUCCESS)
215		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3211: "
216			"Failed to restart timer\n");
217
218Exit:
219	OSM_LOG_EXIT(sm->p_log);
220	return;
221}
222
223/**********************************************************************
224 **********************************************************************/
225static void __osm_sm_state_mgr_signal_error(osm_sm_t * sm,
226					    IN const osm_sm_signal_t signal)
227{
228	OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3207: "
229		"Invalid signal %s in state %s\n",
230		osm_get_sm_mgr_signal_str(signal),
231		osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
232}
233
234/**********************************************************************
235 **********************************************************************/
236void osm_sm_state_mgr_signal_master_is_alive(osm_sm_t * sm)
237{
238	OSM_LOG_ENTER(sm->p_log);
239	sm->retry_number = 0;
240	OSM_LOG_EXIT(sm->p_log);
241}
242
243/**********************************************************************
244 **********************************************************************/
245ib_api_status_t osm_sm_state_mgr_process(osm_sm_t * sm,
246					 IN osm_sm_signal_t signal)
247{
248	ib_api_status_t status = IB_SUCCESS;
249
250	CL_ASSERT(sm);
251
252	OSM_LOG_ENTER(sm->p_log);
253
254	/*
255	 * The state lock prevents many race conditions from screwing
256	 * up the state transition process.
257	 */
258	cl_spinlock_acquire(&sm->state_lock);
259
260	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
261		"Received signal %s in state %s\n",
262		osm_get_sm_mgr_signal_str(signal),
263		osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
264
265	switch (sm->p_subn->sm_state) {
266	case IB_SMINFO_STATE_DISCOVERING:
267		switch (signal) {
268		case OSM_SM_SIGNAL_DISCOVERY_COMPLETED:
269			/*
270			 * Update the state of the SM to MASTER
271			 */
272			/* Turn on the first_time_master_sweep flag */
273			sm->p_subn->first_time_master_sweep = TRUE;
274			sm->p_subn->sm_state = IB_SMINFO_STATE_MASTER;
275			osm_report_sm_state(sm);
276			/*
277			 * Make sure to set the subnet master_sm_base_lid
278			 * to the sm_base_lid value
279			 */
280			sm->p_subn->master_sm_base_lid =
281			    sm->p_subn->sm_base_lid;
282			break;
283		case OSM_SM_SIGNAL_MASTER_OR_HIGHER_SM_DETECTED:
284			/*
285			 * Finished all discovery actions - move to STANDBY
286			 * start the polling
287			 */
288			sm->p_subn->sm_state = IB_SMINFO_STATE_STANDBY;
289			osm_report_sm_state(sm);
290			/*
291			 * Since another SM is doing the LFT config - we should not
292			 * ignore the results of it
293			 */
294			sm->p_subn->ignore_existing_lfts = FALSE;
295
296			__osm_sm_state_mgr_start_polling(sm);
297			break;
298		case OSM_SM_SIGNAL_HANDOVER:
299			/*
300			 * Do nothing. We will discover it later on. If we already discovered
301			 * this SM, and got the HANDOVER - this means the remote SM is of
302			 * lower priority. In this case we will stop polling it (since it is
303			 * a lower priority SM in STANDBY state).
304			 */
305			break;
306		default:
307			__osm_sm_state_mgr_signal_error(sm, signal);
308			status = IB_INVALID_PARAMETER;
309			break;
310		}
311		break;
312
313	case IB_SMINFO_STATE_STANDBY:
314		switch (signal) {
315		case OSM_SM_SIGNAL_POLLING_TIMEOUT:
316		case OSM_SM_SIGNAL_DISCOVER:
317			/*
318			 * case 1: Polling timeout occured - this means that the Master SM
319			 * is no longer alive.
320			 * case 2: Got a signal to move to DISCOVERING
321			 * Move to DISCOVERING state and start sweeping
322			 */
323			sm->p_subn->sm_state = IB_SMINFO_STATE_DISCOVERING;
324			osm_report_sm_state(sm);
325			sm->p_subn->coming_out_of_standby = TRUE;
326			osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
327			break;
328		case OSM_SM_SIGNAL_DISABLE:
329			/*
330			 * Update the state to NOT_ACTIVE
331			 */
332			sm->p_subn->sm_state = IB_SMINFO_STATE_NOTACTIVE;
333			osm_report_sm_state(sm);
334			osm_vendor_set_sm(sm->mad_ctrl.h_bind, FALSE);
335			break;
336		case OSM_SM_SIGNAL_HANDOVER:
337			/*
338			 * Update the state to MASTER, and start sweeping
339			 * OPTIONAL: send ACKNOWLEDGE
340			 */
341			/* Turn on the first_time_master_sweep flag */
342			sm->p_subn->first_time_master_sweep = TRUE;
343			/* Turn on the force_heavy_sweep - we want a
344			 * heavy sweep to occur on the first sweep of this SM. */
345			sm->p_subn->force_heavy_sweep = TRUE;
346
347			sm->p_subn->sm_state = IB_SMINFO_STATE_MASTER;
348			osm_report_sm_state(sm);
349			/*
350			 * Make sure to set the subnet master_sm_base_lid
351			 * to the sm_base_lid value
352			 */
353			sm->p_subn->master_sm_base_lid =
354			    sm->p_subn->sm_base_lid;
355			sm->p_subn->coming_out_of_standby = TRUE;
356			osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
357			break;
358		case OSM_SM_SIGNAL_ACKNOWLEDGE:
359			/*
360			 * Do nothing - already moved to STANDBY
361			 */
362			break;
363		default:
364			__osm_sm_state_mgr_signal_error(sm, signal);
365			status = IB_INVALID_PARAMETER;
366			break;
367		}
368		break;
369
370	case IB_SMINFO_STATE_NOTACTIVE:
371		switch (signal) {
372		case OSM_SM_SIGNAL_STANDBY:
373			/*
374			 * Update the state to STANDBY
375			 * start the polling
376			 */
377			sm->p_subn->sm_state = IB_SMINFO_STATE_STANDBY;
378			osm_report_sm_state(sm);
379			__osm_sm_state_mgr_start_polling(sm);
380			break;
381		default:
382			__osm_sm_state_mgr_signal_error(sm, signal);
383			status = IB_INVALID_PARAMETER;
384			break;
385		}
386		break;
387
388	case IB_SMINFO_STATE_MASTER:
389		switch (signal) {
390		case OSM_SM_SIGNAL_POLLING_TIMEOUT:
391			/*
392			 * we received a polling timeout - this means that we waited for
393			 * a remote master sm to send us a handover, but didn't get it, and
394			 * didn't get a response from that remote sm.
395			 * We want to force a heavy sweep - hopefully this occurred because
396			 * the remote sm died, and we'll find this out and configure the
397			 * subnet after a heavy sweep.
398			 * We also want to clear the p_polling_sm object - since we are
399			 * done polling on that remote sm - we are sweeping again.
400			 */
401		case OSM_SM_SIGNAL_HANDOVER:
402			/*
403			 * If we received a handover in a master state - then we want to
404			 * force a heavy sweep. This means that either we are in a sweep
405			 * currently - in this case - no change, or we are in idle state -
406			 * since we recognized a master SM before - so we want to make a
407			 * heavy sweep and reconfigure the new subnet.
408			 * We also want to clear the p_polling_sm object - since we are
409			 * done polling on that remote sm - we got a handover from it.
410			 */
411			OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
412				"Forcing heavy sweep. "
413				"Received OSM_SM_SIGNAL_HANDOVER or OSM_SM_SIGNAL_POLLING_TIMEOUT\n");
414			sm->p_polling_sm = NULL;
415			sm->p_subn->force_heavy_sweep = TRUE;
416			osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
417			break;
418		case OSM_SM_SIGNAL_HANDOVER_SENT:
419			/*
420			 * Just sent a HANDOVER signal - move to STANDBY
421			 * start the polling
422			 */
423			sm->p_subn->sm_state = IB_SMINFO_STATE_STANDBY;
424			osm_report_sm_state(sm);
425			__osm_sm_state_mgr_start_polling(sm);
426			break;
427		case OSM_SM_SIGNAL_WAIT_FOR_HANDOVER:
428			/*
429			 * We found a remote master SM, and we are waiting for it
430			 * to handover the mastership to us. Need to start polling
431			 * on that SM, to make sure it is alive, if it isn't - then
432			 * we should move back to discovering, since something must
433			 * have happened to it.
434			 */
435			__osm_sm_state_mgr_start_polling(sm);
436			break;
437		case OSM_SM_SIGNAL_DISCOVER:
438			sm->p_subn->sm_state = IB_SMINFO_STATE_DISCOVERING;
439			osm_report_sm_state(sm);
440			break;
441		default:
442			__osm_sm_state_mgr_signal_error(sm, signal);
443			status = IB_INVALID_PARAMETER;
444			break;
445		}
446		break;
447
448	default:
449		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3208: "
450			"Invalid state %s\n",
451			osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
452
453	}
454
455	cl_spinlock_release(&sm->state_lock);
456
457	OSM_LOG_EXIT(sm->p_log);
458	return (status);
459}
460
461/**********************************************************************
462 **********************************************************************/
463ib_api_status_t osm_sm_state_mgr_check_legality(osm_sm_t * sm,
464						IN osm_sm_signal_t signal)
465{
466	ib_api_status_t status = IB_SUCCESS;
467
468	CL_ASSERT(sm);
469
470	OSM_LOG_ENTER(sm->p_log);
471
472	/*
473	 * The state lock prevents many race conditions from screwing
474	 * up the state transition process.
475	 */
476	cl_spinlock_acquire(&sm->state_lock);
477
478	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
479		"Received signal %s in state %s\n",
480		osm_get_sm_mgr_signal_str(signal),
481		osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
482
483	switch (sm->p_subn->sm_state) {
484	case IB_SMINFO_STATE_DISCOVERING:
485		switch (signal) {
486		case OSM_SM_SIGNAL_DISCOVERY_COMPLETED:
487		case OSM_SM_SIGNAL_MASTER_OR_HIGHER_SM_DETECTED:
488		case OSM_SM_SIGNAL_HANDOVER:
489			status = IB_SUCCESS;
490			break;
491		default:
492			__osm_sm_state_mgr_signal_error(sm, signal);
493			status = IB_INVALID_PARAMETER;
494			break;
495		}
496		break;
497
498	case IB_SMINFO_STATE_STANDBY:
499		switch (signal) {
500		case OSM_SM_SIGNAL_POLLING_TIMEOUT:
501		case OSM_SM_SIGNAL_DISCOVER:
502		case OSM_SM_SIGNAL_DISABLE:
503		case OSM_SM_SIGNAL_HANDOVER:
504		case OSM_SM_SIGNAL_ACKNOWLEDGE:
505			status = IB_SUCCESS;
506			break;
507		default:
508			__osm_sm_state_mgr_signal_error(sm, signal);
509			status = IB_INVALID_PARAMETER;
510			break;
511		}
512		break;
513
514	case IB_SMINFO_STATE_NOTACTIVE:
515		switch (signal) {
516		case OSM_SM_SIGNAL_STANDBY:
517			status = IB_SUCCESS;
518			break;
519		default:
520			__osm_sm_state_mgr_signal_error(sm, signal);
521			status = IB_INVALID_PARAMETER;
522			break;
523		}
524		break;
525
526	case IB_SMINFO_STATE_MASTER:
527		switch (signal) {
528		case OSM_SM_SIGNAL_HANDOVER:
529		case OSM_SM_SIGNAL_HANDOVER_SENT:
530			status = IB_SUCCESS;
531			break;
532		default:
533			__osm_sm_state_mgr_signal_error(sm, signal);
534			status = IB_INVALID_PARAMETER;
535			break;
536		}
537		break;
538
539	default:
540		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3209: "
541			"Invalid state %s\n",
542			osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
543		status = IB_INVALID_PARAMETER;
544
545	}
546
547	cl_spinlock_release(&sm->state_lock);
548
549	OSM_LOG_EXIT(sm->p_log);
550	return (status);
551}
552