disk_monitor.c revision 9120:fe1f7d8cd967
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27/* 28 * Disk Monitor 29 */ 30#include <sys/types.h> 31#include <sys/stat.h> 32#include <fcntl.h> 33#include <time.h> 34#include <stdio.h> 35#include <stdlib.h> 36#include <strings.h> 37#include <stdarg.h> 38#include <errno.h> 39#include <signal.h> 40#include <unistd.h> 41#include <pthread.h> 42#include <libnvpair.h> 43#include <fm/fmd_api.h> 44#include <fm/fmd_fmri.h> 45#include <sys/fm/protocol.h> 46#include <sys/fm/io/disk.h> 47#include <fm/libtopo.h> 48 49#include "disk_monitor.h" 50#include "hotplug_mgr.h" 51#include "schg_mgr.h" 52#include "topo_gather.h" 53#include "dm_platform.h" 54 55#define THIS_FMD_MODULE_NAME "disk-monitor" 56 57static enum disk_init_state { 58 INIT_STATE_NONE = 0, 59 STATE_CHANGE_MGR_INITTED = 2, 60 HOTPLUG_MGR_INITTED = 4 61} g_init_state = INIT_STATE_NONE; 62 63typedef enum { 64 LT_SUSPECT, 65 LT_REPAIRED 66} fm_list_type_t; 67 68/* 69 * Global verbosity flag -- controls chattiness of debug messages and 70 * warnings. Its value is determined by the fmd property "log-level" 71 * settable in the DE's .conf file. 72 */ 73log_class_t g_verbose = 0; 74cfgdata_t *config_data = NULL; 75fmd_hdl_t *g_fm_hdl = NULL; 76 77static const fmd_prop_t fmd_props[]; 78 79static void 80diskmon_teardown_all(void) 81{ 82 cleanup_hotplug_manager(); 83 cleanup_state_change_manager(config_data); 84 config_fini(); 85} 86 87static int 88count_disks(diskmon_t *disklistp) 89{ 90 int i = 0; 91 92 while (disklistp != NULL) { 93 i++; 94 disklistp = disklistp->next; 95 } 96 97 return (i); 98} 99 100static int 101diskmon_init(void) 102{ 103 /* 104 * Block the generation of state change events (generated by the 105 * hotplug manager thread) here; they will be unblocked after the 106 * state change manager thread is ready to accept state changes 107 * (shortly after it starts). 108 */ 109 block_state_change_events(); 110 111 if (dm_platform_init() != 0) 112 goto cleanup; 113 114 if (init_hotplug_manager() != 0) 115 goto cleanup; 116 else 117 g_init_state |= HOTPLUG_MGR_INITTED; 118 119 if (init_state_change_manager(config_data) != 0) 120 goto cleanup; 121 else 122 g_init_state |= STATE_CHANGE_MGR_INITTED; 123 124 return (E_SUCCESS); 125 126cleanup: 127 128 unblock_state_change_events(); 129 130 /* 131 * The cleanup order here does matter, due to dependencies between the 132 * managers. 133 */ 134 if (g_init_state & HOTPLUG_MGR_INITTED) 135 cleanup_hotplug_manager(); 136 if (g_init_state & STATE_CHANGE_MGR_INITTED) 137 cleanup_state_change_manager(config_data); 138 dm_platform_fini(); 139 140 return (E_ERROR); 141} 142 143static void 144dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl) 145{ 146 const char *action_prop = NULL; 147 const char *action_string; 148 149 /* 150 * The predictive failure action is the activation of the fault 151 * indicator. 152 */ 153 if (fmd_nvl_class_match(hdl, nvl, 154 DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP)) 155 action_prop = DISK_PROP_OTEMPACTION; 156 157 if (fmd_nvl_class_match(hdl, nvl, 158 DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL)) 159 action_prop = DISK_PROP_STFAILACTION; 160 161 dm_fault_indicator_set(diskp, INDICATOR_ON); 162 163 if (action_prop != NULL && 164 (action_string = dm_prop_lookup(diskp->props, action_prop)) 165 != NULL) { 166 167 if (dm_platform_indicator_execute(action_string) != 0) { 168 log_warn("Fault action `%s' did not successfully " 169 "complete.\n", action_string); 170 } 171 } 172} 173 174static void 175diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair) 176{ 177 char *uuid = NULL; 178 nvlist_t **nva; 179 uint_t nvc; 180 diskmon_t *diskp; 181 nvlist_t *fmri; 182 nvlist_t *fltnvl; 183 int err = 0; 184 185 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 186 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 187 &nva, &nvc); 188 if (err != 0) 189 return; 190 191 while (nvc-- != 0) { 192 193 fltnvl = *nva++; 194 195 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) 196 != 0) 197 continue; 198 199 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL) 200 continue; 201 202 log_msg(MM_MAIN, "Disk %s repaired!\n", 203 diskp->location); 204 205 dm_fault_indicator_set(diskp, INDICATOR_OFF); 206 207 dm_state_change(diskp, HPS_REPAIRED); 208 } 209 210 if (repair) 211 fmd_case_uuresolved(hdl, uuid); 212 213} 214 215static void 216diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl) 217{ 218 char *uuid = NULL; 219 nvlist_t **nva; 220 uint_t nvc; 221 diskmon_t *diskp; 222 nvlist_t *fmri; 223 nvlist_t *fltnvl; 224 int err = 0; 225 226 err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid); 227 err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, 228 &nva, &nvc); 229 if (err != 0) 230 return; 231 232 while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) { 233 234 fltnvl = *nva++; 235 236 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0) 237 continue; 238 239 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL) 240 continue; 241 242 /* Execute the actions associated with this fault */ 243 dm_fault_execute_actions(hdl, diskp, fltnvl); 244 245 /* 246 * Send a state change event to the state change manager 247 */ 248 dm_state_change(diskp, HPS_FAULTED); 249 } 250 251 if (!fmd_case_uuclosed(hdl, uuid)) { 252 /* Case is closed */ 253 fmd_case_uuclose(hdl, uuid); 254 } 255} 256 257/*ARGSUSED*/ 258static void 259diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) 260{ 261 diskmon_t *diskp; 262 nvlist_t *fmri; 263 264 if (g_verbose & MM_MAIN) 265 nvlist_print(stderr, nvl); 266 267 /* 268 * Act on the fault suspect list or repaired list (embedded agent 269 * action). 270 */ 271 if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) { 272 273 diskmon_agent_repair(hdl, nvl, 1); 274 return; 275 276 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) { 277 278 diskmon_agent_repair(hdl, nvl, 0); 279 return; 280 281 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) { 282 283 diskmon_agent_suspect(hdl, nvl); 284 return; 285 } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS)) { 286 return; 287 } 288 289 /* 290 * If we get any replayed faults, set the diskmon's faulted 291 * flag for the appropriate fault, then change the diskmon's state 292 * to faulted. 293 */ 294 if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) { 295 296 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, 297 &fmri) != 0) 298 return; 299 300 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL) 301 return; 302 303 /* Execute the actions associated with this fault */ 304 dm_fault_execute_actions(hdl, diskp, nvl); 305 306 /* 307 * If the fault wasn't generated by this module, send a 308 * state change event to the state change manager 309 */ 310 dm_state_change(diskp, HPS_FAULTED); 311 return; 312 } 313} 314 315static const fmd_hdl_ops_t fmd_ops = { 316 diskmon_recv, /* fmdo_recv */ 317 NULL, /* fmdo_timeout */ 318 NULL, /* fmdo_close */ 319 NULL, /* fmdo_stats */ 320 NULL, /* fmdo_gc */ 321}; 322 323static const fmd_prop_t fmd_props[] = { 324 { GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" }, 325 { NULL, 0, NULL } 326}; 327 328static const fmd_hdl_info_t fmd_info = { 329 "Disk Monitor", 330 DISK_MONITOR_MODULE_VERSION, 331 &fmd_ops, 332 fmd_props 333}; 334 335void 336_fmd_init(fmd_hdl_t *hdl) 337{ 338 fmd_case_t *cp; 339 int disk_count; 340 341 g_fm_hdl = hdl; 342 343 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { 344 return; 345 } 346 347 if (config_init()) { 348 log_err("Could not initialize configuration!\n"); 349 fmd_hdl_unregister(hdl); 350 return; 351 } 352 353 if (config_get(hdl, fmd_props)) { 354 config_fini(); 355 log_err("Could not retrieve configuration from libtopo!\n"); 356 fmd_hdl_unregister(hdl); 357 return; 358 } 359 360 /* 361 * If there are no disks to monitor, bail out 362 */ 363 if ((disk_count = count_disks(config_data->disk_list)) == 0) { 364 config_fini(); 365 fmd_hdl_unregister(hdl); 366 return; 367 } 368 369 if (diskmon_init() == E_ERROR) { 370 config_fini(); 371 fmd_hdl_unregister(hdl); 372 return; 373 } 374 375 log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count); 376 377 /* 378 * Iterate over all active cases. 379 * Since we automatically solve all cases, these cases must have 380 * had the fault added, but the DE must have been interrupted 381 * before they were solved. 382 */ 383 for (cp = fmd_case_next(hdl, NULL); 384 cp != NULL; cp = fmd_case_next(hdl, cp)) { 385 386 if (!fmd_case_solved(hdl, cp)) 387 fmd_case_solve(hdl, cp); 388 } 389} 390 391/*ARGSUSED*/ 392void 393_fmd_fini(fmd_hdl_t *hdl) 394{ 395 diskmon_teardown_all(); 396 g_fm_hdl = NULL; 397} 398