disk_monitor.c revision 9120:fe1f7d8cd967
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Disk Monitor
29 */
30#include <sys/types.h>
31#include <sys/stat.h>
32#include <fcntl.h>
33#include <time.h>
34#include <stdio.h>
35#include <stdlib.h>
36#include <strings.h>
37#include <stdarg.h>
38#include <errno.h>
39#include <signal.h>
40#include <unistd.h>
41#include <pthread.h>
42#include <libnvpair.h>
43#include <fm/fmd_api.h>
44#include <fm/fmd_fmri.h>
45#include <sys/fm/protocol.h>
46#include <sys/fm/io/disk.h>
47#include <fm/libtopo.h>
48
49#include "disk_monitor.h"
50#include "hotplug_mgr.h"
51#include "schg_mgr.h"
52#include "topo_gather.h"
53#include "dm_platform.h"
54
55#define	THIS_FMD_MODULE_NAME "disk-monitor"
56
/*
 * Bit flags recording which manager threads diskmon_init() has brought up.
 * The values are OR-ed into g_init_state as each manager starts and are
 * tested individually on the failure path to decide what must be undone.
 */
enum disk_init_state {
	INIT_STATE_NONE = 0,
	STATE_CHANGE_MGR_INITTED = 2,
	HOTPLUG_MGR_INITTED = 4
};

static enum disk_init_state g_init_state = INIT_STATE_NONE;
62
/*
 * Kind of fault-management list event: a suspect list or a repaired list.
 * NOTE(review): no use of this type is visible in this file -- confirm
 * whether it is still needed.
 */
typedef enum {
	LT_SUSPECT = 0,
	LT_REPAIRED = 1
} fm_list_type_t;
67
/*
 * Global verbosity flag -- controls chattiness of debug messages and
 * warnings.  Its value is determined by the fmd property "log-level"
 * settable in the DE's .conf file.
 *
 * It is treated as a bit mask: diskmon_recv() tests it against MM_MAIN
 * before dumping a received event to stderr.
 */
log_class_t			g_verbose = 0;

/*
 * Module configuration.  _fmd_init() reads config_data->disk_list after
 * config_get() succeeds, and the pointer is handed to the state change
 * manager on init and cleanup.  It is not assigned anywhere in this file;
 * presumably config_init()/config_get() populate it -- TODO confirm.
 */
cfgdata_t			*config_data = NULL;

/*
 * The fmd handle for this module: set on entry to _fmd_init() and reset to
 * NULL by _fmd_fini().
 */
fmd_hdl_t			*g_fm_hdl = NULL;
76
/*
 * No forward declaration of fmd_props[] is needed here: the table is fully
 * defined further down, ahead of every reference to it.  (A tentative
 * definition of a static array with no size is also not valid ISO C.)
 */
78
/*
 * Shut the module down: stop the hotplug manager, then the state change
 * manager, then release the configuration.  Called from _fmd_fini().
 *
 * The hotplug manager is cleaned up before the state change manager, the
 * same order used on the failure path of diskmon_init(), where the order
 * is noted to matter because of dependencies between the managers.
 */
static void
diskmon_teardown_all(void)
{
	cleanup_hotplug_manager();
	cleanup_state_change_manager(config_data);
	config_fini();
}
86
87static int
88count_disks(diskmon_t *disklistp)
89{
90	int i = 0;
91
92	while (disklistp != NULL) {
93		i++;
94		disklistp = disklistp->next;
95	}
96
97	return (i);
98}
99
100static int
101diskmon_init(void)
102{
103	/*
104	 * Block the generation of state change events (generated by the
105	 * hotplug manager thread) here; they will be unblocked after the
106	 * state change manager thread is ready to accept state changes
107	 * (shortly after it starts).
108	 */
109	block_state_change_events();
110
111	if (dm_platform_init() != 0)
112		goto cleanup;
113
114	if (init_hotplug_manager() != 0)
115		goto cleanup;
116	else
117		g_init_state |= HOTPLUG_MGR_INITTED;
118
119	if (init_state_change_manager(config_data) != 0)
120		goto cleanup;
121	else
122		g_init_state |= STATE_CHANGE_MGR_INITTED;
123
124	return (E_SUCCESS);
125
126cleanup:
127
128	unblock_state_change_events();
129
130	/*
131	 * The cleanup order here does matter, due to dependencies between the
132	 * managers.
133	 */
134	if (g_init_state & HOTPLUG_MGR_INITTED)
135		cleanup_hotplug_manager();
136	if (g_init_state & STATE_CHANGE_MGR_INITTED)
137		cleanup_state_change_manager(config_data);
138	dm_platform_fini();
139
140	return (E_ERROR);
141}
142
143static void
144dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl)
145{
146	const char		*action_prop = NULL;
147	const char		*action_string;
148
149	/*
150	 * The predictive failure action is the activation of the fault
151	 * indicator.
152	 */
153	if (fmd_nvl_class_match(hdl, nvl,
154	    DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP))
155		action_prop = DISK_PROP_OTEMPACTION;
156
157	if (fmd_nvl_class_match(hdl, nvl,
158	    DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL))
159		action_prop = DISK_PROP_STFAILACTION;
160
161	dm_fault_indicator_set(diskp, INDICATOR_ON);
162
163	if (action_prop != NULL &&
164	    (action_string = dm_prop_lookup(diskp->props, action_prop))
165	    != NULL) {
166
167		if (dm_platform_indicator_execute(action_string) != 0) {
168			log_warn("Fault action `%s' did not successfully "
169			    "complete.\n", action_string);
170		}
171	}
172}
173
174static void
175diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair)
176{
177	char		*uuid = NULL;
178	nvlist_t	**nva;
179	uint_t		nvc;
180	diskmon_t	*diskp;
181	nvlist_t	*fmri;
182	nvlist_t	*fltnvl;
183	int		err = 0;
184
185	err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
186	err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
187	    &nva, &nvc);
188	if (err != 0)
189		return;
190
191	while (nvc-- != 0) {
192
193		fltnvl = *nva++;
194
195		if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri)
196		    != 0)
197			continue;
198
199		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
200			continue;
201
202		log_msg(MM_MAIN, "Disk %s repaired!\n",
203		    diskp->location);
204
205		dm_fault_indicator_set(diskp, INDICATOR_OFF);
206
207		dm_state_change(diskp, HPS_REPAIRED);
208	}
209
210	if (repair)
211		fmd_case_uuresolved(hdl, uuid);
212
213}
214
215static void
216diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl)
217{
218	char		*uuid = NULL;
219	nvlist_t	**nva;
220	uint_t		nvc;
221	diskmon_t	*diskp;
222	nvlist_t	*fmri;
223	nvlist_t	*fltnvl;
224	int		err = 0;
225
226	err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
227	err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
228	    &nva, &nvc);
229	if (err != 0)
230		return;
231
232	while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) {
233
234		fltnvl = *nva++;
235
236		if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0)
237			continue;
238
239		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
240			continue;
241
242		/* Execute the actions associated with this fault */
243		dm_fault_execute_actions(hdl, diskp,  fltnvl);
244
245		/*
246		 * Send a state change event to the state change manager
247		 */
248		dm_state_change(diskp, HPS_FAULTED);
249	}
250
251	if (!fmd_case_uuclosed(hdl, uuid)) {
252		/* Case is closed */
253		fmd_case_uuclose(hdl, uuid);
254	}
255}
256
257/*ARGSUSED*/
258static void
259diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
260{
261	diskmon_t	*diskp;
262	nvlist_t	*fmri;
263
264	if (g_verbose & MM_MAIN)
265		nvlist_print(stderr, nvl);
266
267	/*
268	 * Act on the fault suspect list or repaired list (embedded agent
269	 * action).
270	 */
271	if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) {
272
273		diskmon_agent_repair(hdl, nvl, 1);
274		return;
275
276	} else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) {
277
278		diskmon_agent_repair(hdl, nvl, 0);
279		return;
280
281	} else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) {
282
283		diskmon_agent_suspect(hdl, nvl);
284		return;
285	} else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS)) {
286		return;
287	}
288
289	/*
290	 * If we get any replayed faults, set the diskmon's faulted
291	 * flag for the appropriate fault, then change the diskmon's state
292	 * to faulted.
293	 */
294	if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) {
295
296		if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE,
297		    &fmri) != 0)
298			return;
299
300		if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
301			return;
302
303		/* Execute the actions associated with this fault */
304		dm_fault_execute_actions(hdl, diskp, nvl);
305
306		/*
307		 * If the fault wasn't generated by this module, send a
308		 * state change event to the state change manager
309		 */
310		dm_state_change(diskp, HPS_FAULTED);
311		return;
312	}
313}
314
/*
 * Entry points this module exposes to fmd; referenced from fmd_info.
 * Only the event-receive callback is implemented -- every other slot is
 * left NULL.
 */
static const fmd_hdl_ops_t fmd_ops = {
	diskmon_recv,	/* fmdo_recv */
	NULL,		/* fmdo_timeout */
	NULL,		/* fmdo_close */
	NULL,		/* fmdo_stats */
	NULL,		/* fmdo_gc */
};
322
/*
 * Module properties, referenced from fmd_info and also passed to
 * config_get() in _fmd_init().  The only property is the log level, an
 * unsigned 32-bit value whose default is "0".  The all-NULL entry ends
 * the table.
 */
static const fmd_prop_t fmd_props[] = {
	{ GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" },
	{ NULL, 0, NULL }
};
327
/*
 * Module description handed to fmd_hdl_register() in _fmd_init(): a
 * human-readable name, the module version, and the ops and property
 * tables defined above.
 */
static const fmd_hdl_info_t fmd_info = {
	"Disk Monitor",
	DISK_MONITOR_MODULE_VERSION,
	&fmd_ops,
	fmd_props
};
334
335void
336_fmd_init(fmd_hdl_t *hdl)
337{
338	fmd_case_t	*cp;
339	int		disk_count;
340
341	g_fm_hdl = hdl;
342
343	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
344		return;
345	}
346
347	if (config_init()) {
348		log_err("Could not initialize configuration!\n");
349		fmd_hdl_unregister(hdl);
350		return;
351	}
352
353	if (config_get(hdl, fmd_props)) {
354		config_fini();
355		log_err("Could not retrieve configuration from libtopo!\n");
356		fmd_hdl_unregister(hdl);
357		return;
358	}
359
360	/*
361	 * If there are no disks to monitor, bail out
362	 */
363	if ((disk_count = count_disks(config_data->disk_list)) == 0) {
364		config_fini();
365		fmd_hdl_unregister(hdl);
366		return;
367	}
368
369	if (diskmon_init() == E_ERROR) {
370		config_fini();
371		fmd_hdl_unregister(hdl);
372		return;
373	}
374
375	log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count);
376
377	/*
378	 * Iterate over all active cases.
379	 * Since we automatically solve all cases, these cases must have
380	 * had the fault added, but the DE must have been interrupted
381	 * before they were solved.
382	 */
383	for (cp = fmd_case_next(hdl, NULL);
384	    cp != NULL; cp = fmd_case_next(hdl, cp)) {
385
386		if (!fmd_case_solved(hdl, cp))
387			fmd_case_solve(hdl, cp);
388	}
389}
390
/*
 * Module unload entry point called by fmd.  Tears down both manager
 * threads and the configuration via diskmon_teardown_all(), then drops
 * the cached fmd handle.  The hdl argument itself is not used.
 */
/*ARGSUSED*/
void
_fmd_fini(fmd_hdl_t *hdl)
{
	diskmon_teardown_all();
	g_fm_hdl = NULL;
}
398