1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26/*
27 * FMD Case Subsystem
28 *
29 * Diagnosis engines are expected to group telemetry events related to the
30 * diagnosis of a particular problem on the system into a set of cases.  The
31 * diagnosis engine may have any number of cases open at a given point in time.
32 * Some cases may eventually be *solved* by associating a suspect list of one
33 * or more problems with the case, at which point fmd publishes a list.suspect
34 * event for the case and it becomes visible to administrators and agents.
35 *
36 * Every case is named using a UUID, and is globally visible in the case hash.
37 * Cases are reference-counted, except for the reference from the case hash
38 * itself.  Consumers of case references include modules, which store active
39 * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
40 *
41 * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
42 * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
43 * or transport) and the case is referenced by the mod_cases list.  Once the
44 * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
45 * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
46 *
47 *			+------------+
48 *	     +----------|  UNSOLVED  |
49 *	     |		+------------+
50 *	     |		      1 |
51 *	     |			|
52 *	     |		+-------v----+
53 *	   2 |		|    SOLVED  |
54 *	     |		+------------+
55 *	     |		    3 |  5 |
56 *	     +------------+   |    |
57 *			  |   |    |
58 *			+-v---v----v-+
59 *			| CLOSE_WAIT |
60 *			+------------+
61 *			  |   |    |
62 *	      +-----------+   |    +------------+
63 *	      |		    4 |			|
64 *	      v		+-----v------+		|
65 *	   discard      |   CLOSED   |	      6	|
66 *			+------------+		|
67 *			      |			|
68 *			      |	   +------------+
69 *			    7 |	   |
70 *			+-----v----v-+
71 *			|  REPAIRED  |
72 *			+------------+
73 *			      |
74 *			    8 |
75 *			+-----v------+
76 *			|  RESOLVED  |
77 *			+------------+
78 *			      |
79 *			      v
80 *			   discard
81 *
82 * The state machine changes are triggered by calls to fmd_case_transition()
83 * from various locations inside of fmd, as described below:
84 *
85 * [1] Called by: fmd_case_solve()
86 *       Actions: FMD_CF_SOLVED flag is set in ci_flags
87 *                conviction policy is applied to suspect list
88 *                suspects convicted are marked faulty (F) in R$
89 *                list.suspect event logged and dispatched
90 *
91 * [2] Called by: fmd_case_close(), fmd_case_uuclose()
92 *       Actions: diagnosis engine fmdo_close() entry point scheduled
93 *                case discarded upon exit from CLOSE_WAIT
94 *
95 * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
96 *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
97 *                suspects convicted (F) are marked unusable (U) in R$
98 *                diagnosis engine fmdo_close() entry point scheduled
99 *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
100 *
101 * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
102 *       Actions: list.isolated event dispatched
103 *                case deleted from module's list of open cases
104 *
105 * [5] Called by: fmd_case_repair(), fmd_case_update()
106 *       Actions: FMD_CF_REPAIR flag is set in ci_flags
107 *                diagnosis engine fmdo_close() entry point scheduled
108 *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
109 *
110 * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
111 *       Actions: suspects convicted are marked non faulty (!F) in R$
112 *                list.repaired or list.updated event dispatched
113 *
114 * [7] Called by: fmd_case_repair(), fmd_case_update()
115 *       Actions: FMD_CF_REPAIR flag is set in ci_flags
116 *                suspects convicted are marked non faulty (!F) in R$
117 *                list.repaired or list.updated event dispatched
118 *
119 * [8] Called by: fmd_case_uuresolve()
120 *       Actions: list.resolved event dispatched
121 *		  case is discarded
122 */
123
124#include <sys/fm/protocol.h>
125#include <uuid/uuid.h>
126#include <alloca.h>
127
128#include <fmd_alloc.h>
129#include <fmd_module.h>
130#include <fmd_error.h>
131#include <fmd_conf.h>
132#include <fmd_case.h>
133#include <fmd_string.h>
134#include <fmd_subr.h>
135#include <fmd_protocol.h>
136#include <fmd_event.h>
137#include <fmd_eventq.h>
138#include <fmd_dispq.h>
139#include <fmd_buf.h>
140#include <fmd_log.h>
141#include <fmd_asru.h>
142#include <fmd_fmri.h>
143#include <fmd_xprt.h>
144
145#include <fmd.h>
146
/*
 * Printable names for the case states.  The trailing comment on each entry
 * names the FMD_CASE_* state that the string describes.
 * NOTE(review): presumably indexed by a case's ci_state, in which case the
 * order here must track the FMD_CASE_* definitions -- confirm against the
 * header that defines them.
 */
static const char *const _fmd_case_snames[] = {
	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
	"SOLVED",	/* FMD_CASE_SOLVED */
	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
	"CLOSED",	/* FMD_CASE_CLOSED */
	"REPAIRED",	/* FMD_CASE_REPAIRED */
	"RESOLVED"	/* FMD_CASE_RESOLVED */
};

/*
 * Forward declaration: fmd_case_tryhold() is called by the case hash snapshot
 * loops below.  Callers check its result for NULL before using the case.
 */
static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *);
157
158fmd_case_hash_t *
159fmd_case_hash_create(void)
160{
161	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
162
163	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
164	chp->ch_hashlen = fmd.d_str_buckets;
165	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
166	chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen,
167	    FMD_SLEEP);
168	chp->ch_count = 0;
169
170	return (chp);
171}
172
173/*
174 * Destroy the case hash.  Unlike most of our hash tables, no active references
175 * are kept by the case hash itself; all references come from other subsystems.
176 * The hash must be destroyed after all modules are unloaded; if anything was
177 * present in the hash it would be by definition a reference count leak.
178 */
179void
180fmd_case_hash_destroy(fmd_case_hash_t *chp)
181{
182	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
183	fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen);
184	fmd_free(chp, sizeof (fmd_case_hash_t));
185}
186
187/*
188 * Take a snapshot of the case hash by placing an additional hold on each
189 * member in an auxiliary array, and then call 'func' for each case.
190 */
191void
192fmd_case_hash_apply(fmd_case_hash_t *chp,
193    void (*func)(fmd_case_t *, void *), void *arg)
194{
195	fmd_case_impl_t *cp, **cps, **cpp;
196	uint_t cpc, i;
197
198	(void) pthread_rwlock_rdlock(&chp->ch_lock);
199
200	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
201	cpc = chp->ch_count;
202
203	for (i = 0; i < chp->ch_hashlen; i++) {
204		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next)
205			*cpp++ = fmd_case_tryhold(cp);
206	}
207
208	ASSERT(cpp == cps + cpc);
209	(void) pthread_rwlock_unlock(&chp->ch_lock);
210
211	for (i = 0; i < cpc; i++) {
212		if (cps[i] != NULL) {
213			func((fmd_case_t *)cps[i], arg);
214			fmd_case_rele((fmd_case_t *)cps[i]);
215		}
216	}
217
218	fmd_free(cps, cpc * sizeof (fmd_case_t *));
219}
220
221static void
222fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
223{
224	uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
225
226	cip->ci_code_next = chp->ch_code_hash[h];
227	chp->ch_code_hash[h] = cip;
228}
229
230static void
231fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
232{
233	fmd_case_impl_t **pp, *cp;
234
235	if (cip->ci_code) {
236		uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
237
238		pp = &chp->ch_code_hash[h];
239		for (cp = *pp; cp != NULL; cp = cp->ci_code_next) {
240			if (cp != cip)
241				pp = &cp->ci_code_next;
242			else
243				break;
244		}
245		if (cp != NULL) {
246			*pp = cp->ci_code_next;
247			cp->ci_code_next = NULL;
248		}
249	}
250}
251
252/*
253 * Look up the diagcode for this case and cache it in ci_code.  If no suspects
254 * were defined for this case or if the lookup fails, the event dictionary or
255 * module code is broken, and we set the event code to a precomputed default.
256 */
257static const char *
258fmd_case_mkcode(fmd_case_t *cp)
259{
260	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
261	fmd_case_susp_t *cis;
262	fmd_case_hash_t *chp = fmd.d_cases;
263
264	char **keys, **keyp;
265	const char *s;
266
267	ASSERT(MUTEX_HELD(&cip->ci_lock));
268	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
269
270	/*
271	 * delete any existing entry from code hash if it is on it
272	 */
273	fmd_case_code_hash_delete(chp, cip);
274
275	fmd_free(cip->ci_code, cip->ci_codelen);
276	cip->ci_codelen = cip->ci_mod->mod_codelen;
277	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
278	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
279
280	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
281		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
282			keyp++;
283	}
284
285	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
286
287	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
288	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
289		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
290		fmd_free(cip->ci_code, cip->ci_codelen);
291		cip->ci_codelen = strlen(s) + 1;
292		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
293		(void) strcpy(cip->ci_code, s);
294	}
295
296	/*
297	 * add into hash of solved cases
298	 */
299	fmd_case_code_hash_insert(chp, cip);
300
301	return (cip->ci_code);
302}
303
/*
 * Argument block handed to fmd_case_set_lst() while the per-suspect arrays
 * for a list.* event are being filled in by fmd_case_mkevent().
 */
typedef struct {
	int	*fcl_countp;	/* entries filled so far; bumped per suspect */
	int	fcl_maxcount;	/* capacity of fcl_ba[] and fcl_nva[] */
	uint8_t *fcl_ba;	/* per-suspect FM_SUSPECT_* status bits */
	nvlist_t **fcl_nva;	/* per-suspect fault event (al_event) */
	int	*fcl_msgp;	/* set to B_FALSE if a suspect opts out of msgs */
} fmd_case_lst_t;
311
312static void
313fmd_case_set_lst(fmd_asru_link_t *alp, void *arg)
314{
315	fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg;
316	boolean_t b;
317	int state;
318
319	if (*entryp->fcl_countp >= entryp->fcl_maxcount)
320		return;
321	if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE,
322	    &b) == 0 && b == B_FALSE)
323		*entryp->fcl_msgp = B_FALSE;
324	entryp->fcl_ba[*entryp->fcl_countp] = 0;
325	state = fmd_asru_al_getstate(alp);
326	if (state & FMD_ASRU_DEGRADED)
327		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED;
328	if (state & FMD_ASRU_UNUSABLE)
329		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE;
330	if (state & FMD_ASRU_FAULTY)
331		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY;
332	if (!(state & FMD_ASRU_PRESENT))
333		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT;
334	if (alp->al_reason == FMD_ASRU_REPAIRED)
335		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED;
336	else if (alp->al_reason == FMD_ASRU_REPLACED)
337		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED;
338	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
339		entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED;
340	entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event;
341	(*entryp->fcl_countp)++;
342}
343
344static void
345fmd_case_faulty(fmd_asru_link_t *alp, void *arg)
346{
347	int *faultyp = (int *)arg;
348
349	*faultyp |= (alp->al_flags & FMD_ASRU_FAULTY);
350}
351
352static void
353fmd_case_usable(fmd_asru_link_t *alp, void *arg)
354{
355	int *usablep = (int *)arg;
356
357	*usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE);
358}
359
360static void
361fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg)
362{
363	int *not_faultyp = (int *)arg;
364
365	*not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY);
366}
367
368/*
369 * Have we got any suspects with an asru that are still unusable and present?
370 */
371static void
372fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg)
373{
374	int *rvalp = (int *)arg;
375	int state;
376	nvlist_t *asru;
377
378	/*
379	 * if this a proxy case and this suspect doesn't have an local asru
380	 * then state is unknown so we must assume it may still be unusable.
381	 */
382	if ((alp->al_flags & FMD_ASRU_PROXY) &&
383	    !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) {
384		*rvalp |= B_TRUE;
385		return;
386	}
387
388	state = fmd_asru_al_getstate(alp);
389	if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0)
390		return;
391	*rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT));
392}
393
394nvlist_t *
395fmd_case_mkevent(fmd_case_t *cp, const char *class)
396{
397	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
398	nvlist_t **nva, *nvl;
399	uint8_t *ba;
400	int msg = B_TRUE;
401	const char *code;
402	fmd_case_lst_t fcl;
403	int count = 0;
404
405	(void) pthread_mutex_lock(&cip->ci_lock);
406	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
407
408	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
409	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
410
411	/*
412	 * For each suspect associated with the case, store its fault event
413	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
414	 * have asked not to be messaged.  If any of them have made such a
415	 * request, propagate that attribute to the composite list.* event.
416	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
417	 */
418	fcl.fcl_countp = &count;
419	fcl.fcl_maxcount = cip->ci_nsuspects;
420	fcl.fcl_msgp = &msg;
421	fcl.fcl_ba = ba;
422	fcl.fcl_nva = nva;
423	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
424
425	if (cip->ci_code == NULL)
426		(void) fmd_case_mkcode(cp);
427	/*
428	 * For repair and updated event, we lookup diagcode from dict using key
429	 * "list.repaired" or "list.updated" or "list.resolved".
430	 */
431	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
432		(void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code);
433	else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
434		(void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code);
435	else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)
436		(void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code);
437	else
438		code = cip->ci_code;
439
440	if (msg == B_FALSE)
441		cip->ci_flags |= FMD_CF_INVISIBLE;
442
443	/*
444	 * Use the ci_diag_de if one has been saved (eg for an injected fault).
445	 * Otherwise use the authority for the current module.
446	 */
447	nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ?
448	    cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count,
449	    nva, ba, msg, &cip->ci_tv, cip->ci_injected);
450
451	(void) pthread_mutex_unlock(&cip->ci_lock);
452	return (nvl);
453}
454
/*
 * Tunables consulted by fmd_case_check_for_dups() when a newly solved case
 * is compared against existing cases.  All are non-zero (enabled) by default.
 */

/* Treat a shared faulty, non-isolated suspect as a duplicate-case match. */
static int fmd_case_match_on_faulty_overlap = 1;
/* Treat a shared acquitted suspect as a match if the old case has no faulty */
static int fmd_case_match_on_acquit_overlap = 1;
/* Allow isolated suspects of the old case to be acquitted on a match. */
static int fmd_case_auto_acquit_isolated = 1;
/* Record per-suspect adjustments for the new case if the old is repaired. */
static int fmd_case_auto_acquit_non_acquitted = 1;
/* New cases this close to the old case's last change skip auto-acquit. */
static int fmd_case_too_recent = 10; /* time in seconds */
460
461static boolean_t
462fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
463{
464	nvlist_t *new_rsrc;
465	nvlist_t *rsrc;
466	char *new_name = NULL;
467	char *name = NULL;
468	ssize_t new_namelen;
469	ssize_t namelen;
470	int fmri_present = 1;
471	int new_fmri_present = 1;
472	int match = B_FALSE;
473	fmd_topo_t *ftp = fmd_topo_hold();
474
475	if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0)
476		fmri_present = 0;
477	else {
478		if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1)
479			goto done;
480		name = fmd_alloc(namelen + 1, FMD_SLEEP);
481		if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1)
482			goto done;
483	}
484	if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0)
485		new_fmri_present = 0;
486	else {
487		if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1)
488			goto done;
489		new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP);
490		if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1)
491			goto done;
492	}
493	match = (fmri_present == new_fmri_present &&
494	    (fmri_present == 0 ||
495	    topo_fmri_strcmp(ftp->ft_hdl, name, new_name)));
496done:
497	if (name != NULL)
498		fmd_free(name, namelen + 1);
499	if (new_name != NULL)
500		fmd_free(new_name, new_namelen + 1);
501	fmd_topo_rele(ftp);
502	return (match);
503}
504
505static int
506fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2)
507{
508	char *class, *new_class;
509
510	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU))
511		return (0);
512	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_RESOURCE))
513		return (0);
514	if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU))
515		return (0);
516	(void) nvlist_lookup_string(nvl2, FM_CLASS, &class);
517	(void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class);
518	return (strcmp(class, new_class) == 0);
519}
520
/*
 * Argument block for fmd_case_match_suspects(), which is applied to each ASRU
 * link of an existing ("old") case while comparing it with a new case.
 */
typedef struct {
	int	*fcms_countp;	/* old-case suspects visited so far */
	int	fcms_maxcount;	/* capacity of the two old-case arrays */
	fmd_case_impl_t *fcms_cip;	/* new case whose suspects are searched */
	uint8_t *fcms_new_susp_state;	/* by new-case suspect list position */
	uint8_t *fcms_old_susp_state;	/* by old-case visit order */
	uint8_t *fcms_old_match_state;	/* by old-case visit order */
} fcms_t;

/*
 * Values stored in the *_susp_state arrays.  SUSPECT_STATE_NO_MATCH has the
 * same numeric value as SUSPECT_STATE_FAULTY, so the two can only be told
 * apart by which array a value was stored in.
 */
#define	SUSPECT_STATE_FAULTY				0x1
#define	SUSPECT_STATE_ISOLATED				0x2
#define	SUSPECT_STATE_REMOVED				0x4
#define	SUSPECT_STATE_ACQUITED				0x8
#define	SUSPECT_STATE_REPAIRED				0x10
#define	SUSPECT_STATE_REPLACED				0x20
#define	SUSPECT_STATE_NO_MATCH				0x1
536
537/*
538 * This is called for each suspect in the old case. Compare it against each
539 * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state
540 * as appropriate. fcms_new_susp_state will left as 0 if the suspect is not
541 * found in the old case.
542 */
543static void
544fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg)
545{
546	fcms_t *fcmsp = (fcms_t *)arg;
547	fmd_case_impl_t *cip = fcmsp->fcms_cip;
548	fmd_case_susp_t *cis;
549	int i = 0;
550	int state = fmd_asru_al_getstate(alp);
551
552	if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount)
553		return;
554
555	if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) &&
556	    alp->al_reason == FMD_ASRU_REMOVED))
557		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
558		    SUSPECT_STATE_REMOVED;
559	else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY))
560		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
561		    SUSPECT_STATE_ISOLATED;
562	else if (state & FMD_ASRU_FAULTY)
563		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
564		    SUSPECT_STATE_FAULTY;
565	else if (alp->al_reason == FMD_ASRU_REPLACED)
566		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
567		    SUSPECT_STATE_REPLACED;
568	else if (alp->al_reason == FMD_ASRU_ACQUITTED)
569		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
570		    SUSPECT_STATE_ACQUITED;
571	else
572		fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
573		    SUSPECT_STATE_REPAIRED;
574
575	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++)
576		if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1)
577			break;
578	if (cis != NULL)
579		fcmsp->fcms_new_susp_state[i] =
580		    fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp];
581	else
582		fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |=
583		    SUSPECT_STATE_NO_MATCH;
584	(*fcmsp->fcms_countp)++;
585}
586
/*
 * Argument block for the re-fault/acquit callbacks below that need to compare
 * an old case's ASRU links against the suspects of another case.
 */
typedef struct {
	int	*fca_do_update;	/* set to 1 when a callback changes a link */
	fmd_case_impl_t *fca_cip;	/* case whose ci_suspects are matched */
} fca_t;
591
592/*
593 * Re-fault all acquitted suspects that are still present in the new list.
594 */
595static void
596fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg)
597{
598	fca_t *fcap = (fca_t *)arg;
599	fmd_case_impl_t *cip = fcap->fca_cip;
600	fmd_case_susp_t *cis;
601	int state = fmd_asru_al_getstate(alp);
602
603	if (!(state & FMD_ASRU_FAULTY) &&
604	    alp->al_reason == FMD_ASRU_ACQUITTED) {
605		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
606			if (fmd_case_match_suspect(cis->cis_nvl,
607			    alp->al_event) == 1)
608				break;
609		if (cis != NULL) {
610			(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
611			*fcap->fca_do_update = 1;
612		}
613	}
614}
615
616/*
617 * Re-fault all suspects that are still present in the new list.
618 */
619static void
620fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg)
621{
622	fca_t *fcap = (fca_t *)arg;
623	fmd_case_impl_t *cip = fcap->fca_cip;
624	fmd_case_susp_t *cis;
625	int state = fmd_asru_al_getstate(alp);
626
627	if (!(state & FMD_ASRU_FAULTY)) {
628		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
629			if (fmd_case_match_suspect(cis->cis_nvl,
630			    alp->al_event) == 1)
631				break;
632		if (cis != NULL) {
633			(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
634			*fcap->fca_do_update = 1;
635		}
636	}
637}
638
639/*
640 * Acquit all suspects that are no longer present in the new list.
641 */
642static void
643fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg)
644{
645	fca_t *fcap = (fca_t *)arg;
646	fmd_case_impl_t *cip = fcap->fca_cip;
647	fmd_case_susp_t *cis;
648	int state = fmd_asru_al_getstate(alp);
649
650	if (state & FMD_ASRU_FAULTY) {
651		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
652			if (fmd_case_match_suspect(cis->cis_nvl,
653			    alp->al_event) == 1)
654				break;
655		if (cis == NULL) {
656			(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
657			    FMD_ASRU_ACQUITTED);
658			*fcap->fca_do_update = 1;
659		}
660	}
661}
662
663/*
664 * Acquit all isolated suspects.
665 */
666static void
667fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg)
668{
669	int *do_update = (int *)arg;
670	int state = fmd_asru_al_getstate(alp);
671
672	if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) &&
673	    (state & FMD_ASRU_FAULTY)) {
674		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
675		    FMD_ASRU_ACQUITTED);
676		*do_update = 1;
677	}
678}
679
680/*
681 * Acquit suspect which matches specified nvlist
682 */
683static void
684fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg)
685{
686	nvlist_t *nvl = (nvlist_t *)arg;
687	int state = fmd_asru_al_getstate(alp);
688
689	if ((state & FMD_ASRU_FAULTY) &&
690	    fmd_case_match_suspect(nvl, alp->al_event) == 1)
691		(void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
692		    FMD_ASRU_ACQUITTED);
693}
694
/*
 * Argument block for fmd_case_check_for_dups(), shared across the calls made
 * for each existing case so that results can accumulate.
 */
typedef struct {
	fmd_case_impl_t *fccd_cip;	/* the new case being checked */
	uint8_t *fccd_new_susp_state;	/* accumulated state per new suspect */
	/* NOTE(review): not written by any code visible in this part of file */
	uint8_t *fccd_new_match_state;
	int *fccd_discard_new;	/* set to 1 if new case duplicates an old one */
	int *fccd_adjust_new;	/* set to 1 if new suspects need adjusting */
} fccd_t;
702
/*
 * See if a matching suspect list already exists in the cache.
 *
 * This is called once for each existing case 'old_cp', with 'arg' pointing to
 * the fccd_t that describes the new case.  The suspects of the two cases are
 * compared and, if the cases are judged to match, one of two things happens:
 *
 * - if the old case is already REPAIRED or later, the new case is kept and
 *   (when fmd_case_auto_acquit_non_acquitted is set) *fccd_adjust_new is set
 *   and per-suspect state is accumulated into fccd_new_susp_state;
 *
 * - otherwise *fccd_discard_new is set, suspects of the old case are
 *   re-faulted and/or acquitted as appropriate, and fmd_case_update() is
 *   called on the old case if any of its suspects were changed.
 *
 * If the cases do not match, nothing is modified.
 */
static void
fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg)
{
	fccd_t *fccdp = (fccd_t *)arg;
	fmd_case_impl_t *new_cip = fccdp->fccd_cip;
	fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp;
	int i, count = 0, do_update = 0, got_isolated_overlap = 0;
	int got_faulty_overlap = 0;
	int got_acquit_overlap = 0;
	boolean_t too_recent;
	uint64_t most_recent = 0;
	fcms_t fcms;
	fca_t fca;
	uint8_t *new_susp_state;
	uint8_t *old_susp_state;
	uint8_t *old_match_state;

	/*
	 * Scratch arrays local to this old/new comparison: one entry per
	 * suspect of the new case and two per suspect of the old case, all
	 * starting out as zero.
	 */
	new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t));
	for (i = 0; i < new_cip->ci_nsuspects; i++)
		new_susp_state[i] = 0;
	old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
	for (i = 0; i < old_cip->ci_nsuspects; i++)
		old_susp_state[i] = 0;
	old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
	for (i = 0; i < old_cip->ci_nsuspects; i++)
		old_match_state[i] = 0;

	/*
	 * Compare with each suspect in the existing case.
	 */
	fcms.fcms_countp = &count;
	fcms.fcms_maxcount = old_cip->ci_nsuspects;
	fcms.fcms_cip = new_cip;
	fcms.fcms_new_susp_state = new_susp_state;
	fcms.fcms_old_susp_state = old_susp_state;
	fcms.fcms_old_match_state = old_match_state;
	fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip,
	    fmd_case_match_suspects, &fcms);

	/*
	 * If we have some faulty, non-isolated suspects that overlap, then most
	 * likely it is the suspects that overlap in the suspect lists that are
	 * to blame. So we can consider this to be a match.
	 */
	for (i = 0; i < new_cip->ci_nsuspects; i++)
		if (new_susp_state[i] == SUSPECT_STATE_FAULTY)
			got_faulty_overlap = 1;
	if (got_faulty_overlap && fmd_case_match_on_faulty_overlap)
		goto got_match;

	/*
	 * If we have no faulty, non-isolated suspects in the old case, but we
	 * do have some acquitted suspects that overlap, then most likely it is
	 * the acquitted suspects that overlap in the suspect lists that are
	 * to blame. So we can consider this to be a match.
	 */
	for (i = 0; i < new_cip->ci_nsuspects; i++)
		if (new_susp_state[i] == SUSPECT_STATE_ACQUITED)
			got_acquit_overlap = 1;
	for (i = 0; i < old_cip->ci_nsuspects; i++)
		if (old_susp_state[i] == SUSPECT_STATE_FAULTY)
			got_acquit_overlap = 0;
	if (got_acquit_overlap && fmd_case_match_on_acquit_overlap)
		goto got_match;

	/*
	 * Check that all suspects in the new list are present in the old list.
	 * Return if we find one that isn't.
	 */
	for (i = 0; i < new_cip->ci_nsuspects; i++)
		if (new_susp_state[i] == 0)
			return;

	/*
	 * Check that all suspects in the old list are present in the new list
	 * *or* they are isolated or removed/replaced (which would explain why
	 * they are not present in the new list). Return if we find one that is
	 * faulty and unisolated or repaired or acquitted, and that is not
	 * present in the new case.
	 */
	for (i = 0; i < old_cip->ci_nsuspects; i++)
		if (old_match_state[i] == SUSPECT_STATE_NO_MATCH &&
		    (old_susp_state[i] == SUSPECT_STATE_FAULTY ||
		    old_susp_state[i] == SUSPECT_STATE_ACQUITED ||
		    old_susp_state[i] == SUSPECT_STATE_REPAIRED))
			return;

got_match:
	/*
	 * If the old case is already in repaired/resolved state, we can't
	 * do anything more with it, so keep the new case, but acquit some
	 * of the suspects if appropriate.
	 */
	if (old_cip->ci_state >= FMD_CASE_REPAIRED) {
		if (fmd_case_auto_acquit_non_acquitted) {
			*fccdp->fccd_adjust_new = 1;
			/*
			 * NOTE(review): the no-match marker below is assigned
			 * (not ORed) into fccd_new_susp_state, where its value
			 * 0x1 is the same as SUSPECT_STATE_FAULTY, while
			 * fccd_new_match_state is never written here.  Confirm
			 * against the consumer of these arrays that this is
			 * the intended array and operator.
			 */
			for (i = 0; i < new_cip->ci_nsuspects; i++) {
				fccdp->fccd_new_susp_state[i] |=
				    new_susp_state[i];
				if (new_susp_state[i] == 0)
					fccdp->fccd_new_susp_state[i] =
					    SUSPECT_STATE_NO_MATCH;
			}
		}
		return;
	}

	/*
	 * Otherwise discard the new case and keep the old, again updating the
	 * state of the suspects as appropriate
	 */
	*fccdp->fccd_discard_new = 1;
	fca.fca_cip = new_cip;
	fca.fca_do_update = &do_update;

	/*
	 * See if new case occurred within fmd_case_too_recent seconds of the
	 * most recent modification to the old case and if so don't do
	 * auto-acquit. This avoids problems if a flood of ereports come in and
	 * they don't all get diagnosed before the first case causes some of
	 * the devices to be isolated making it appear that an isolated device
	 * was in the suspect list.
	 */
	fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
	    fmd_asru_most_recent, &most_recent);
	too_recent = (new_cip->ci_tv.tv_sec - most_recent <
	    fmd_case_too_recent);

	if (got_faulty_overlap) {
		/*
		 * Acquit any suspects not present in the new list, plus
		 * any that are present but are isolated.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
		    fmd_case_acquit_no_match, &fca);
		if (fmd_case_auto_acquit_isolated && !too_recent)
			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
			    fmd_case_acquit_isolated, &do_update);
	} else if (got_acquit_overlap) {
		/*
		 * Re-fault the acquitted matching suspects and acquit all
		 * isolated suspects.
		 */
		if (fmd_case_auto_acquit_isolated && !too_recent) {
			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
			    fmd_case_fault_acquitted_matching, &fca);
			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
			    fmd_case_acquit_isolated, &do_update);
		}
	} else if (fmd_case_auto_acquit_isolated) {
		/*
		 * To get here, there must be no faulty or acquitted suspects,
		 * but there must be at least one isolated suspect. Just acquit
		 * non-matching isolated suspects. If there are no matching
		 * isolated suspects, then re-fault all matching suspects.
		 */
		for (i = 0; i < new_cip->ci_nsuspects; i++)
			if (new_susp_state[i] == SUSPECT_STATE_ISOLATED)
				got_isolated_overlap = 1;
		if (!got_isolated_overlap)
			fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
			    fmd_case_fault_all_matching, &fca);
		fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
		    fmd_case_acquit_no_match, &fca);
	}

	/*
	 * If we've updated anything in the old case, call fmd_case_update()
	 */
	if (do_update)
		fmd_case_update(old_cp);
}
878
879/*
880 * Convict suspects in a case by applying a conviction policy and updating the
881 * resource cache prior to emitting the list.suspect event for the given case.
882 * At present, our policy is very simple: convict every suspect in the case.
883 * In the future, this policy can be extended and made configurable to permit:
884 *
885 * - convicting the suspect with the highest FIT rate
886 * - convicting the suspect with the cheapest FRU
887 * - convicting the suspect with the FRU that is in a depot's inventory
888 * - convicting the suspect with the longest lifetime
889 *
 * and so forth.  A word to the wise: this problem is significantly harder than
891 * it seems at first glance.  Future work should heed the following advice:
892 *
893 * Hacking the policy into C code here is a very bad idea.  The policy needs to
894 * be decided upon very carefully and fundamentally encodes knowledge of what
895 * suspect list combinations can be emitted by what diagnosis engines.  As such
896 * fmd's code is the wrong location, because that would require fmd itself to
897 * be updated for every diagnosis engine change, defeating the entire design.
898 * The FMA Event Registry knows the suspect list combinations: policy inputs
899 * can be derived from it and used to produce per-module policy configuration.
900 *
901 * If the policy needs to be dynamic and not statically fixed at either fmd
902 * startup or module load time, any implementation of dynamic policy retrieval
903 * must employ some kind of caching mechanism or be part of a built-in module.
904 * The fmd_case_convict() function is called with locks held inside of fmd and
905 * is not a place where unbounded blocking on some inter-process or inter-
906 * system communication to another service (e.g. another daemon) can occur.
907 */
908static int
909fmd_case_convict(fmd_case_t *cp)
910{
911	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
912	fmd_asru_hash_t *ahp = fmd.d_asrus;
913	int discard_new = 0, i;
914	fmd_case_susp_t *cis;
915	fmd_asru_link_t *alp;
916	uint8_t *new_susp_state;
917	uint8_t *new_match_state;
918	int adjust_new = 0;
919	fccd_t fccd;
920	fmd_case_impl_t *ncp, **cps, **cpp;
921	uint_t cpc;
922	fmd_case_hash_t *chp;
923
924	/*
925	 * First we must see if any matching cases already exist.
926	 */
927	new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
928	for (i = 0; i < cip->ci_nsuspects; i++)
929		new_susp_state[i] = 0;
930	new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
931	for (i = 0; i < cip->ci_nsuspects; i++)
932		new_match_state[i] = 0;
933	fccd.fccd_cip = cip;
934	fccd.fccd_adjust_new = &adjust_new;
935	fccd.fccd_new_susp_state = new_susp_state;
936	fccd.fccd_new_match_state = new_match_state;
937	fccd.fccd_discard_new = &discard_new;
938
939	/*
940	 * Hold all cases
941	 */
942	chp = fmd.d_cases;
943	(void) pthread_rwlock_rdlock(&chp->ch_lock);
944	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
945	cpc = chp->ch_count;
946	for (i = 0; i < chp->ch_hashlen; i++)
947		for (ncp = chp->ch_hash[i]; ncp != NULL; ncp = ncp->ci_next)
948			*cpp++ = fmd_case_tryhold(ncp);
949	ASSERT(cpp == cps + cpc);
950	(void) pthread_rwlock_unlock(&chp->ch_lock);
951
952	/*
953	 * Run fmd_case_check_for_dups() on all cases except the current one.
954	 */
955	for (i = 0; i < cpc; i++) {
956		if (cps[i] != NULL) {
957			if (cps[i] != (fmd_case_impl_t *)cp)
958				fmd_case_check_for_dups((fmd_case_t *)cps[i],
959				    &fccd);
960			fmd_case_rele((fmd_case_t *)cps[i]);
961		}
962	}
963	fmd_free(cps, cpc * sizeof (fmd_case_t *));
964
965	(void) pthread_mutex_lock(&cip->ci_lock);
966	if (cip->ci_code == NULL)
967		(void) fmd_case_mkcode(cp);
968	else if (cip->ci_precanned)
969		fmd_case_code_hash_insert(fmd.d_cases, cip);
970
971	if (discard_new) {
972		/*
973		 * We've found an existing case that is a match and it is not
974		 * already in repaired or resolved state. So we can close this
975		 * one as a duplicate.
976		 */
977		(void) pthread_mutex_unlock(&cip->ci_lock);
978		return (1);
979	}
980
981	/*
982	 * Allocate new cache entries
983	 */
984	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
985		if ((alp = fmd_asru_hash_create_entry(ahp,
986		    cp, cis->cis_nvl)) == NULL) {
987			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
988			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
989			continue;
990		}
991		alp->al_flags |= FMD_ASRU_PRESENT;
992		alp->al_asru->asru_flags |= FMD_ASRU_PRESENT;
993		(void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0);
994		(void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
995	}
996
997	if (adjust_new) {
998		int some_suspect = 0, some_not_suspect = 0;
999
1000		/*
1001		 * There is one or more matching case but they are already in
1002		 * repaired or resolved state. So we need to keep the new
1003		 * case, but we can adjust it. Repaired/removed/replaced
1004		 * suspects are unlikely to be to blame (unless there are
1005		 * actually two separate faults). So if we have a combination of
1006		 * repaired/replaced/removed suspects and acquitted suspects in
1007		 * the old lists, then we should acquit in the new list those
1008		 * that were repaired/replaced/removed in the old.
1009		 */
1010		for (i = 0; i < cip->ci_nsuspects; i++) {
1011			if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) ||
1012			    (new_susp_state[i] & SUSPECT_STATE_REPAIRED) ||
1013			    (new_susp_state[i] & SUSPECT_STATE_REMOVED) ||
1014			    (new_match_state[i] & SUSPECT_STATE_NO_MATCH))
1015				some_not_suspect = 1;
1016			else
1017				some_suspect = 1;
1018		}
1019		if (some_suspect && some_not_suspect) {
1020			for (cis = cip->ci_suspects, i = 0; cis != NULL;
1021			    cis = cis->cis_next, i++)
1022				if ((new_susp_state[i] &
1023				    SUSPECT_STATE_REPLACED) ||
1024				    (new_susp_state[i] &
1025				    SUSPECT_STATE_REPAIRED) ||
1026				    (new_susp_state[i] &
1027				    SUSPECT_STATE_REMOVED) ||
1028				    (new_match_state[i] &
1029				    SUSPECT_STATE_NO_MATCH))
1030					fmd_asru_hash_apply_by_case(fmd.d_asrus,
1031					    cp, fmd_case_acquit_suspect,
1032					    cis->cis_nvl);
1033		}
1034	}
1035
1036	(void) pthread_mutex_unlock(&cip->ci_lock);
1037	return (0);
1038}
1039
1040void
1041fmd_case_publish(fmd_case_t *cp, uint_t state)
1042{
1043	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1044	fmd_event_t *e;
1045	nvlist_t *nvl;
1046	char *class;
1047
1048	if (state == FMD_CASE_CURRENT)
1049		state = cip->ci_state; /* use current state */
1050
1051	switch (state) {
1052	case FMD_CASE_SOLVED:
1053		(void) pthread_mutex_lock(&cip->ci_lock);
1054
1055		/*
1056		 * If we already have a code, then case is already solved.
1057		 */
1058		if (cip->ci_precanned == 0 && cip->ci_xprt == NULL &&
1059		    cip->ci_code != NULL) {
1060			(void) pthread_mutex_unlock(&cip->ci_lock);
1061			break;
1062		}
1063
1064		if (cip->ci_tv_valid == 0) {
1065			fmd_time_gettimeofday(&cip->ci_tv);
1066			cip->ci_tv_valid = 1;
1067		}
1068		(void) pthread_mutex_unlock(&cip->ci_lock);
1069
1070		if (fmd_case_convict(cp) == 1) { /* dupclose */
1071			cip->ci_flags &= ~FMD_CF_SOLVED;
1072			fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
1073			break;
1074		}
1075		if (cip->ci_xprt != NULL) {
1076			/*
1077			 * For proxy, save some information about the transport
1078			 * in the resource cache.
1079			 */
1080			int count = 0;
1081			fmd_asru_set_on_proxy_t fasp;
1082			fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt;
1083
1084			fasp.fasp_countp = &count;
1085			fasp.fasp_maxcount = cip->ci_nsuspects;
1086			fasp.fasp_proxy_asru = cip->ci_proxy_asru;
1087			fasp.fasp_proxy_external = xip->xi_flags &
1088			    FMD_XPRT_EXTERNAL;
1089			fasp.fasp_proxy_rdonly = ((xip->xi_flags &
1090			    FMD_XPRT_RDWR) == FMD_XPRT_RDONLY);
1091			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1092			    fmd_asru_set_on_proxy, &fasp);
1093		}
1094		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
1095		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1096
1097		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1098		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1099		fmd_log_append(fmd.d_fltlog, e, cp);
1100		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1101		fmd_dispq_dispatch(fmd.d_disp, e, class);
1102
1103		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1104		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
1105		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1106
1107		break;
1108
1109	case FMD_CASE_CLOSE_WAIT:
1110		fmd_case_hold(cp);
1111		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
1112		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
1113
1114		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1115		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
1116		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1117
1118		break;
1119
1120	case FMD_CASE_CLOSED:
1121		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
1122		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1123		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1124		fmd_dispq_dispatch(fmd.d_disp, e, class);
1125		break;
1126
1127	case FMD_CASE_REPAIRED:
1128		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
1129		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1130		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1131		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1132		fmd_log_append(fmd.d_fltlog, e, cp);
1133		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1134		fmd_dispq_dispatch(fmd.d_disp, e, class);
1135		break;
1136
1137	case FMD_CASE_RESOLVED:
1138		nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS);
1139		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1140		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1141		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1142		fmd_log_append(fmd.d_fltlog, e, cp);
1143		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
1144		fmd_dispq_dispatch(fmd.d_disp, e, class);
1145		break;
1146	}
1147}
1148
1149fmd_case_t *
1150fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
1151{
1152	fmd_case_impl_t *cip;
1153	uint_t h;
1154
1155	(void) pthread_rwlock_rdlock(&chp->ch_lock);
1156	h = fmd_strhash(uuid) % chp->ch_hashlen;
1157
1158	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
1159		if (strcmp(cip->ci_uuid, uuid) == 0)
1160			break;
1161	}
1162
1163	/*
1164	 * If deleting bit is set, treat the case as if it doesn't exist.
1165	 */
1166	if (cip != NULL)
1167		cip = fmd_case_tryhold(cip);
1168
1169	if (cip == NULL)
1170		(void) fmd_set_errno(EFMD_CASE_INVAL);
1171
1172	(void) pthread_rwlock_unlock(&chp->ch_lock);
1173	return ((fmd_case_t *)cip);
1174}
1175
1176static fmd_case_impl_t *
1177fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
1178{
1179	fmd_case_impl_t *eip;
1180	uint_t h;
1181
1182	(void) pthread_rwlock_wrlock(&chp->ch_lock);
1183	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
1184
1185	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
1186		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
1187		    fmd_case_tryhold(eip) != NULL) {
1188			(void) pthread_rwlock_unlock(&chp->ch_lock);
1189			return (eip); /* uuid already present */
1190		}
1191	}
1192
1193	cip->ci_next = chp->ch_hash[h];
1194	chp->ch_hash[h] = cip;
1195
1196	chp->ch_count++;
1197	ASSERT(chp->ch_count != 0);
1198
1199	(void) pthread_rwlock_unlock(&chp->ch_lock);
1200	return (cip);
1201}
1202
1203static void
1204fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
1205{
1206	fmd_case_impl_t *cp, **pp;
1207	uint_t h;
1208
1209	ASSERT(MUTEX_HELD(&cip->ci_lock));
1210
1211	cip->ci_flags |= FMD_CF_DELETING;
1212	(void) pthread_mutex_unlock(&cip->ci_lock);
1213
1214	(void) pthread_rwlock_wrlock(&chp->ch_lock);
1215
1216	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
1217	pp = &chp->ch_hash[h];
1218
1219	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
1220		if (cp != cip)
1221			pp = &cp->ci_next;
1222		else
1223			break;
1224	}
1225
1226	if (cp == NULL) {
1227		fmd_panic("case %p (%s) not found on hash chain %u\n",
1228		    (void *)cip, cip->ci_uuid, h);
1229	}
1230
1231	*pp = cp->ci_next;
1232	cp->ci_next = NULL;
1233
1234	/*
1235	 * delete from code hash if it is on it
1236	 */
1237	fmd_case_code_hash_delete(chp, cip);
1238
1239	ASSERT(chp->ch_count != 0);
1240	chp->ch_count--;
1241
1242	(void) pthread_rwlock_unlock(&chp->ch_lock);
1243
1244	(void) pthread_mutex_lock(&cip->ci_lock);
1245	ASSERT(cip->ci_flags & FMD_CF_DELETING);
1246}
1247
1248fmd_case_t *
1249fmd_case_create(fmd_module_t *mp, const char *uuidstr, void *data)
1250{
1251	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
1252	fmd_case_impl_t *eip = NULL;
1253	uuid_t uuid;
1254
1255	(void) pthread_mutex_init(&cip->ci_lock, NULL);
1256	fmd_buf_hash_create(&cip->ci_bufs);
1257
1258	fmd_module_hold(mp);
1259	cip->ci_mod = mp;
1260	cip->ci_refs = 1;
1261	cip->ci_state = FMD_CASE_UNSOLVED;
1262	cip->ci_flags = FMD_CF_DIRTY;
1263	cip->ci_data = data;
1264
1265	/*
1266	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
1267	 * define any constant for the length of an unparse string, and do not
1268	 * permit the caller to specify a buffer length for safety.  The spec
1269	 * says it will be 36 bytes, but we make it tunable just in case.
1270	 */
1271	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
1272	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
1273
1274	if (uuidstr == NULL) {
1275		/*
1276		 * We expect this loop to execute only once, but code it
1277		 * defensively against the possibility of libuuid bugs.
1278		 * Keep generating uuids and attempting to do a hash insert
1279		 * until we get a unique one.
1280		 */
1281		do {
1282			if (eip != NULL)
1283				fmd_case_rele((fmd_case_t *)eip);
1284			uuid_generate(uuid);
1285			uuid_unparse(uuid, cip->ci_uuid);
1286		} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
1287	} else {
1288		/*
1289		 * If a uuid was specified we must succeed with that uuid,
1290		 * or return NULL indicating a case with that uuid already
1291		 * exists.
1292		 */
1293		(void) strncpy(cip->ci_uuid, uuidstr, cip->ci_uuidlen + 1);
1294		if (fmd_case_hash_insert(fmd.d_cases, cip) != cip) {
1295			fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
1296			(void) fmd_buf_hash_destroy(&cip->ci_bufs);
1297			fmd_module_rele(mp);
1298			pthread_mutex_destroy(&cip->ci_lock);
1299			fmd_free(cip, sizeof (*cip));
1300			return (NULL);
1301		}
1302	}
1303
1304	ASSERT(fmd_module_locked(mp));
1305	fmd_list_append(&mp->mod_cases, cip);
1306	fmd_module_setcdirty(mp);
1307
1308	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1309	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
1310	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1311
1312	return ((fmd_case_t *)cip);
1313}
1314
1315static void
1316fmd_case_destroy_suspects(fmd_case_impl_t *cip)
1317{
1318	fmd_case_susp_t *cis, *ncis;
1319
1320	ASSERT(MUTEX_HELD(&cip->ci_lock));
1321
1322	if (cip->ci_proxy_asru)
1323		fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) *
1324		    cip->ci_nsuspects);
1325	if (cip->ci_diag_de)
1326		nvlist_free(cip->ci_diag_de);
1327	if (cip->ci_diag_asru)
1328		fmd_free(cip->ci_diag_asru, sizeof (uint8_t) *
1329		    cip->ci_nsuspects);
1330
1331	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
1332		ncis = cis->cis_next;
1333		nvlist_free(cis->cis_nvl);
1334		fmd_free(cis, sizeof (fmd_case_susp_t));
1335	}
1336
1337	cip->ci_suspects = NULL;
1338	cip->ci_nsuspects = 0;
1339}
1340
/*
 * Recreate a case with a known UUID, state and (optionally) diagnosis code on
 * behalf of module 'mp', optionally associated with transport 'xp'.
 *
 * If no case with this UUID exists, a new case is inserted into the case hash
 * (and into the code hash if a code was given), appended to mp->mod_cases,
 * and returned.
 *
 * If a case with this UUID already exists:
 *  - when 'mp' is the root module (fmd.d_rmod), the existing case is returned
 *    after possibly filling in its code and adjusting its state to CLOSED;
 *  - when the existing case is not owned by the root module, or 'xp' is
 *    non-NULL, NULL is returned to indicate a UUID conflict;
 *  - otherwise the existing case is moved from the root module to 'mp',
 *    given the requested state, and returned.
 *
 * The caller must hold the module lock for 'mp'.
 */
fmd_case_t *
fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
    uint_t state, const char *uuid, const char *code)
{
	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
	fmd_case_impl_t *eip;

	(void) pthread_mutex_init(&cip->ci_lock, NULL);
	fmd_buf_hash_create(&cip->ci_bufs);

	fmd_module_hold(mp);
	cip->ci_mod = mp;
	cip->ci_xprt = xp;
	cip->ci_refs = 1;
	cip->ci_state = state;
	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
	cip->ci_uuidlen = strlen(cip->ci_uuid);
	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
	/* ci_codelen counts the terminating NUL; it is the size later freed */
	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;

	if (state > FMD_CASE_CLOSE_WAIT)
		cip->ci_flags |= FMD_CF_SOLVED;

	/*
	 * Insert the case into the global case hash.  If the specified UUID is
	 * already present, check to see if it is an orphan: if so, reclaim it;
	 * otherwise if it is owned by a different module then return NULL.
	 */
	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
		/*
		 * The new case never became visible in the hash: drop its
		 * only reference and destroy it without a hash delete.
		 */
		(void) pthread_mutex_lock(&cip->ci_lock);
		cip->ci_refs--; /* decrement to zero */
		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);

		/*
		 * 'eip' was returned with a hold taken by the hash insert;
		 * each path below releases that hold before continuing.
		 */
		cip = eip; /* switch 'cip' to the existing case */
		(void) pthread_mutex_lock(&cip->ci_lock);

		/*
		 * If the ASRU cache is trying to recreate an orphan, then just
		 * return the existing case that we found without changing it.
		 */
		if (mp == fmd.d_rmod) {
			/*
			 * In case the case has already been created from
			 * a checkpoint file we need to set up code now.
			 */
			if (cip->ci_state < FMD_CASE_CLOSED) {
				if (code != NULL && cip->ci_code == NULL) {
					cip->ci_code = fmd_strdup(code,
					    FMD_SLEEP);
					cip->ci_codelen = cip->ci_code ?
					    strlen(cip->ci_code) + 1 : 0;
					fmd_case_code_hash_insert(fmd.d_cases,
					    cip);
				}
			}

			/*
			 * When recreating an orphan case, state passed in may
			 * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If
			 * any suspects are still CLOSED (faulty) then the
			 * overall state needs to be CLOSED.
			 */
			if ((cip->ci_state == FMD_CASE_REPAIRED ||
			    cip->ci_state == FMD_CASE_RESOLVED) &&
			    state == FMD_CASE_CLOSED)
				cip->ci_state = FMD_CASE_CLOSED;
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return ((fmd_case_t *)cip);
		}

		/*
		 * If the existing case isn't an orphan or is being proxied,
		 * then we have a UUID conflict: return failure to the caller.
		 */
		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele((fmd_case_t *)cip);
			return (NULL);
		}

		/*
		 * If the new module is reclaiming an orphaned case, remove
		 * the case from the root module, switch ci_mod, and then fall
		 * through to adding the case to the new owner module 'mp'.
		 */
		fmd_module_lock(cip->ci_mod);
		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		fmd_module_unlock(cip->ci_mod);

		fmd_module_rele(cip->ci_mod);
		cip->ci_mod = mp;
		fmd_module_hold(mp);

		/*
		 * It's possible that fmd crashed or was restarted during a
		 * previous solve operation between the asru cache being created
		 * and the ckpt file being updated to SOLVED. Thus when the DE
		 * recreates the case here from the checkpoint file, the state
		 * will be UNSOLVED and yet we are having to reclaim because
		 * the case was in the asru cache. If this happens, revert the
		 * case back to the UNSOLVED state and let the DE solve it again
		 */
		if (state == FMD_CASE_UNSOLVED) {
			fmd_asru_hash_delete_case(fmd.d_asrus,
			    (fmd_case_t *)cip);
			fmd_case_destroy_suspects(cip);
			fmd_case_code_hash_delete(fmd.d_cases, cip);
			fmd_free(cip->ci_code, cip->ci_codelen);
			cip->ci_code = NULL;
			cip->ci_codelen = 0;
			cip->ci_tv_valid = 0;
		}

		cip->ci_state = state;

		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_rele((fmd_case_t *)cip);
	} else {
		/*
		 * add into hash of solved cases
		 */
		if (cip->ci_code)
			fmd_case_code_hash_insert(fmd.d_cases, cip);
	}

	/*
	 * Reached for a brand new case and for a reclaimed orphan: link the
	 * case onto the new owner's list and count it as open.
	 */
	ASSERT(fmd_module_locked(mp));
	fmd_list_append(&mp->mod_cases, cip);

	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);

	return ((fmd_case_t *)cip);
}
1476
1477void
1478fmd_case_destroy(fmd_case_t *cp, int visible)
1479{
1480	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1481	fmd_case_item_t *cit, *ncit;
1482
1483	ASSERT(MUTEX_HELD(&cip->ci_lock));
1484	ASSERT(cip->ci_refs == 0);
1485
1486	if (visible) {
1487		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
1488		fmd_case_hash_delete(fmd.d_cases, cip);
1489	}
1490
1491	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
1492		ncit = cit->cit_next;
1493		fmd_event_rele(cit->cit_event);
1494		fmd_free(cit, sizeof (fmd_case_item_t));
1495	}
1496
1497	fmd_case_destroy_suspects(cip);
1498
1499	if (cip->ci_principal != NULL)
1500		fmd_event_rele(cip->ci_principal);
1501
1502	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
1503	fmd_free(cip->ci_code, cip->ci_codelen);
1504	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
1505
1506	fmd_module_rele(cip->ci_mod);
1507	fmd_free(cip, sizeof (fmd_case_impl_t));
1508}
1509
1510void
1511fmd_case_hold(fmd_case_t *cp)
1512{
1513	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1514
1515	(void) pthread_mutex_lock(&cip->ci_lock);
1516	fmd_case_hold_locked(cp);
1517	(void) pthread_mutex_unlock(&cip->ci_lock);
1518}
1519
1520void
1521fmd_case_hold_locked(fmd_case_t *cp)
1522{
1523	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1524
1525	ASSERT(MUTEX_HELD(&cip->ci_lock));
1526	if (cip->ci_flags & FMD_CF_DELETING)
1527		fmd_panic("attempt to hold a deleting case %p (%s)\n",
1528		    (void *)cip, cip->ci_uuid);
1529	cip->ci_refs++;
1530	ASSERT(cip->ci_refs != 0);
1531}
1532
1533static fmd_case_impl_t *
1534fmd_case_tryhold(fmd_case_impl_t *cip)
1535{
1536	/*
1537	 * If the case's "deleting" bit is unset, hold and return case,
1538	 * otherwise, return NULL.
1539	 */
1540	(void) pthread_mutex_lock(&cip->ci_lock);
1541	if (cip->ci_flags & FMD_CF_DELETING) {
1542		(void) pthread_mutex_unlock(&cip->ci_lock);
1543		cip = NULL;
1544	} else {
1545		fmd_case_hold_locked((fmd_case_t *)cip);
1546		(void) pthread_mutex_unlock(&cip->ci_lock);
1547	}
1548	return (cip);
1549}
1550
1551void
1552fmd_case_rele(fmd_case_t *cp)
1553{
1554	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1555
1556	(void) pthread_mutex_lock(&cip->ci_lock);
1557	ASSERT(cip->ci_refs != 0);
1558
1559	if (--cip->ci_refs == 0)
1560		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
1561	else
1562		(void) pthread_mutex_unlock(&cip->ci_lock);
1563}
1564
1565void
1566fmd_case_rele_locked(fmd_case_t *cp)
1567{
1568	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1569
1570	ASSERT(MUTEX_HELD(&cip->ci_lock));
1571	--cip->ci_refs;
1572	ASSERT(cip->ci_refs != 0);
1573}
1574
1575int
1576fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
1577{
1578	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1579	fmd_case_item_t *cit;
1580	fmd_event_t *oep;
1581	uint_t state;
1582	int new;
1583
1584	fmd_event_hold(ep);
1585	(void) pthread_mutex_lock(&cip->ci_lock);
1586
1587	if (cip->ci_flags & FMD_CF_SOLVED)
1588		state = FMD_EVS_DIAGNOSED;
1589	else
1590		state = FMD_EVS_ACCEPTED;
1591
1592	oep = cip->ci_principal;
1593	cip->ci_principal = ep;
1594
1595	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1596		if (cit->cit_event == ep)
1597			break;
1598	}
1599
1600	cip->ci_flags |= FMD_CF_DIRTY;
1601	new = cit == NULL && ep != oep;
1602
1603	(void) pthread_mutex_unlock(&cip->ci_lock);
1604
1605	fmd_module_setcdirty(cip->ci_mod);
1606	fmd_event_transition(ep, state);
1607
1608	if (oep != NULL)
1609		fmd_event_rele(oep);
1610
1611	return (new);
1612}
1613
1614int
1615fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
1616{
1617	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1618	fmd_case_item_t *cit;
1619	uint_t state;
1620	int new;
1621	boolean_t injected;
1622
1623	(void) pthread_mutex_lock(&cip->ci_lock);
1624
1625	if (cip->ci_flags & FMD_CF_SOLVED)
1626		state = FMD_EVS_DIAGNOSED;
1627	else
1628		state = FMD_EVS_ACCEPTED;
1629
1630	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1631		if (cit->cit_event == ep)
1632			break;
1633	}
1634
1635	new = cit == NULL && ep != cip->ci_principal;
1636
1637	/*
1638	 * If the event is already in the case or the case is already solved,
1639	 * there is no reason to save it: just transition it appropriately.
1640	 */
1641	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
1642		(void) pthread_mutex_unlock(&cip->ci_lock);
1643		fmd_event_transition(ep, state);
1644		return (new);
1645	}
1646
1647	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
1648	fmd_event_hold(ep);
1649
1650	if (nvlist_lookup_boolean_value(((fmd_event_impl_t *)ep)->ev_nvl,
1651	    "__injected", &injected) == 0 && injected)
1652		fmd_case_set_injected(cp);
1653
1654	cit->cit_next = cip->ci_items;
1655	cit->cit_event = ep;
1656
1657	cip->ci_items = cit;
1658	cip->ci_nitems++;
1659
1660	cip->ci_flags |= FMD_CF_DIRTY;
1661	(void) pthread_mutex_unlock(&cip->ci_lock);
1662
1663	fmd_module_setcdirty(cip->ci_mod);
1664	fmd_event_transition(ep, state);
1665
1666	return (new);
1667}
1668
1669void
1670fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
1671{
1672	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1673	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1674
1675	(void) pthread_mutex_lock(&cip->ci_lock);
1676	ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT);
1677	cip->ci_flags |= FMD_CF_DIRTY;
1678
1679	cis->cis_next = cip->ci_suspects;
1680	cis->cis_nvl = nvl;
1681
1682	cip->ci_suspects = cis;
1683	cip->ci_nsuspects++;
1684
1685	(void) pthread_mutex_unlock(&cip->ci_lock);
1686	if (cip->ci_xprt == NULL)
1687		fmd_module_setcdirty(cip->ci_mod);
1688}
1689
1690void
1691fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
1692{
1693	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1694	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1695	boolean_t b;
1696
1697	(void) pthread_mutex_lock(&cip->ci_lock);
1698
1699	cis->cis_next = cip->ci_suspects;
1700	cis->cis_nvl = nvl;
1701
1702	if (nvlist_lookup_boolean_value(nvl,
1703	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
1704		cip->ci_flags |= FMD_CF_INVISIBLE;
1705
1706	cip->ci_suspects = cis;
1707	cip->ci_nsuspects++;
1708
1709	(void) pthread_mutex_unlock(&cip->ci_lock);
1710}
1711
1712void
1713fmd_case_reset_suspects(fmd_case_t *cp)
1714{
1715	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1716
1717	(void) pthread_mutex_lock(&cip->ci_lock);
1718	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
1719
1720	fmd_case_destroy_suspects(cip);
1721	cip->ci_flags |= FMD_CF_DIRTY;
1722
1723	(void) pthread_mutex_unlock(&cip->ci_lock);
1724	fmd_module_setcdirty(cip->ci_mod);
1725}
1726
1727/*ARGSUSED*/
1728static void
1729fmd_case_unusable(fmd_asru_link_t *alp, void *arg)
1730{
1731	(void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE);
1732}
1733
/*
 * Grab ci_lock and update the case state and set the dirty bit.  Then perform
 * whatever actions and emit whatever events are appropriate for the state.
 * Refer to the topmost block comment explaining the state machine for details.
 *
 * 'state' is the requested new state and must not exceed FMD_CASE_RESOLVED.
 * 'flags' are FMD_CF_* bits to merge into ci_flags; the ISOLATED, REPAIRED
 * and RESOLVED bits are dropped unless the case is already, or is now being,
 * flagged as SOLVED.  If the case is already in the requested state or a
 * later one, only the flags are merged and nothing else happens.
 *
 * The function may advance the case further than requested: an orphaned case
 * moves from CLOSE_WAIT straight on to CLOSED or REPAIRED, and a REPAIRED
 * case moves on to RESOLVED when that was requested via FMD_CF_RESOLVED or
 * when no suspect is both unusable and present.
 */
void
fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
{
	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
	fmd_case_item_t *cit;
	fmd_event_t *e;
	int resolved = 0;
	int any_unusable_and_present = 0;

	ASSERT(state <= FMD_CASE_RESOLVED);
	(void) pthread_mutex_lock(&cip->ci_lock);

	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED);

	cip->ci_flags |= flags;

	if (cip->ci_state >= state) {
		(void) pthread_mutex_unlock(&cip->ci_lock);
		return; /* already in specified state */
	}

	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));

	cip->ci_state = state;
	cip->ci_flags |= FMD_CF_DIRTY;

	/* proxied cases and cases owned by the root module are skipped here */
	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
		fmd_module_setcdirty(cip->ci_mod);

	switch (state) {
	case FMD_CASE_SOLVED:
		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);

		if (cip->ci_principal != NULL) {
			fmd_event_transition(cip->ci_principal,
			    FMD_EVS_DIAGNOSED);
		}
		break;

	case FMD_CASE_CLOSE_WAIT:
		/*
		 * If the case was never solved, do not change ASRUs.
		 * If the case was never fmd_case_closed, do not change ASRUs.
		 * If the case was repaired, do not change ASRUs.
		 */
		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
		    FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable, NULL);

		/*
		 * If an orphaned case transitions to CLOSE_WAIT, the owning
		 * module is no longer loaded: continue on to CASE_CLOSED or
		 * CASE_REPAIRED as appropriate.
		 */
		if (fmd_case_orphaned(cp)) {
			if (cip->ci_flags & FMD_CF_REPAIRED) {
				state = cip->ci_state = FMD_CASE_REPAIRED;
				TRACE((FMD_DBG_CASE, "case %s %s->%s",
				    cip->ci_uuid,
				    _fmd_case_snames[FMD_CASE_CLOSE_WAIT],
				    _fmd_case_snames[FMD_CASE_REPAIRED]));
				goto do_repair;
			} else {
				state = cip->ci_state = FMD_CASE_CLOSED;
				TRACE((FMD_DBG_CASE, "case %s %s->%s",
				    cip->ci_uuid,
				    _fmd_case_snames[FMD_CASE_CLOSE_WAIT],
				    _fmd_case_snames[FMD_CASE_CLOSED]));
			}
		}
		break;

	case FMD_CASE_REPAIRED:
do_repair:
		ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp));

		/*
		 * If we've been requested to transition straight on to the
		 * RESOLVED state (which can happen with fault proxying where a
		 * list.resolved or a uuresolved is received from the other
		 * side), or if all suspects are already either usable or not
		 * present then transition straight to RESOLVED state,
		 * publishing both the list.repaired and list.resolved. For a
		 * proxy, if we discover here that all suspects are already
		 * either usable or not present, notify the diag side instead
		 * using fmd_xprt_uuresolved().
		 */
		if (flags & FMD_CF_RESOLVED) {
			if (cip->ci_xprt != NULL)
				fmd_list_delete(&cip->ci_mod->mod_cases, cip);
		} else {
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_case_unusable_and_present,
			    &any_unusable_and_present);
			if (any_unusable_and_present)
				break;
			if (cip->ci_xprt != NULL) {
				fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid);
				break;
			}
		}

		/*
		 * Move on to RESOLVED.  'state' still holds REPAIRED at this
		 * point, so the publish call below emits the event for the
		 * REPAIRED state; ci_lock is dropped around that call.  The
		 * event for RESOLVED is published after the switch, once
		 * 'state' has been updated.
		 */
		cip->ci_state = FMD_CASE_RESOLVED;
		(void) pthread_mutex_unlock(&cip->ci_lock);
		fmd_case_publish(cp, state);
		TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
		    _fmd_case_snames[FMD_CASE_REPAIRED],
		    _fmd_case_snames[FMD_CASE_RESOLVED]));
		state = FMD_CASE_RESOLVED;
		resolved = 1;
		(void) pthread_mutex_lock(&cip->ci_lock);
		break;

	case FMD_CASE_RESOLVED:
		/*
		 * For a proxy, no need to check that all suspects are already
		 * either usable or not present - this request has come from
		 * the diagnosing side which makes the final decision on this.
		 */
		if (cip->ci_xprt != NULL) {
			fmd_list_delete(&cip->ci_mod->mod_cases, cip);
			resolved = 1;
			break;
		}

		ASSERT(fmd_case_orphaned(cp));

		/*
		 * If all suspects are already either usable or not present then
		 * carry on, publish list.resolved and discard the case.
		 */
		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
		    fmd_case_unusable_and_present, &any_unusable_and_present);
		if (any_unusable_and_present) {
			(void) pthread_mutex_unlock(&cip->ci_lock);
			return;
		}

		resolved = 1;
		break;
	}

	(void) pthread_mutex_unlock(&cip->ci_lock);

	/*
	 * If the module has initialized, then publish the appropriate event
	 * for the new case state.  If not, we are being called from the
	 * checkpoint code during module load, in which case the module's
	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
	 * may not be open yet, which will prevent us from computing the event
	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
	 * event in our queue: this won't be processed until _fmd_init is done.
	 */
	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
		fmd_case_publish(cp, state);
	else {
		fmd_case_hold(cp);
		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
	}

	if (resolved) {
		if (cip->ci_xprt != NULL) {
			/*
			 * If we transitioned to RESOLVED, adjust the reference
			 * count to reflect our removal from
			 * fmd.d_rmod->mod_cases above.  If the caller has not
			 * placed an additional hold on the case, it will now
			 * be freed.
			 */
			(void) pthread_mutex_lock(&cip->ci_lock);
			fmd_asru_hash_delete_case(fmd.d_asrus, cp);
			(void) pthread_mutex_unlock(&cip->ci_lock);
			fmd_case_rele(cp);
		} else {
			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
			    fmd_asru_log_resolved, NULL);
			(void) pthread_mutex_lock(&cip->ci_lock);
			/* mark as "ready to be discarded" */
			cip->ci_flags |= FMD_CF_RES_CMPL;
			(void) pthread_mutex_unlock(&cip->ci_lock);
		}
	}
}
1927
1928/*
1929 * Discard any case if it is in RESOLVED state (and if check_if_aged argument
1930 * is set if all suspects have passed the rsrc.aged time).
1931 */
1932void
1933fmd_case_discard_resolved(fmd_case_t *cp, void *arg)
1934{
1935	int check_if_aged = *(int *)arg;
1936	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1937
1938	/*
1939	 * First check if case has completed transition to resolved.
1940	 */
1941	(void) pthread_mutex_lock(&cip->ci_lock);
1942	if (!(cip->ci_flags & FMD_CF_RES_CMPL)) {
1943		(void) pthread_mutex_unlock(&cip->ci_lock);
1944		return;
1945	}
1946
1947	/*
1948	 * Now if check_is_aged is set, see if all suspects have aged.
1949	 */
1950	if (check_if_aged) {
1951		int aged = 1;
1952
1953		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1954		    fmd_asru_check_if_aged, &aged);
1955		if (!aged) {
1956			(void) pthread_mutex_unlock(&cip->ci_lock);
1957			return;
1958		}
1959	}
1960
1961	/*
1962	 * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't
1963	 * do it twice.
1964	 */
1965	fmd_module_lock(cip->ci_mod);
1966	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1967	fmd_module_unlock(cip->ci_mod);
1968	fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1969	cip->ci_flags &= ~FMD_CF_RES_CMPL;
1970	(void) pthread_mutex_unlock(&cip->ci_lock);
1971	fmd_case_rele(cp);
1972}
1973
1974/*
1975 * Transition the specified case to *at least* the specified state by first
1976 * re-validating the suspect list using the resource cache.  This function is
1977 * employed by the checkpoint code when restoring a saved, solved case to see
1978 * if the state of the case has effectively changed while fmd was not running
1979 * or the module was not loaded.
1980 */
1981void
1982fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
1983{
1984	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1985
1986	int usable = 0;		/* are any suspects usable? */
1987
1988	ASSERT(state >= FMD_CASE_SOLVED);
1989	(void) pthread_mutex_lock(&cip->ci_lock);
1990
1991	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);
1992
1993	(void) pthread_mutex_unlock(&cip->ci_lock);
1994
1995	if (!usable) {
1996		state = MAX(state, FMD_CASE_CLOSE_WAIT);
1997		flags |= FMD_CF_ISOLATED;
1998	}
1999
2000	fmd_case_transition(cp, state, flags);
2001}
2002
2003void
2004fmd_case_setdirty(fmd_case_t *cp)
2005{
2006	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2007
2008	(void) pthread_mutex_lock(&cip->ci_lock);
2009	cip->ci_flags |= FMD_CF_DIRTY;
2010	(void) pthread_mutex_unlock(&cip->ci_lock);
2011
2012	fmd_module_setcdirty(cip->ci_mod);
2013}
2014
2015void
2016fmd_case_clrdirty(fmd_case_t *cp)
2017{
2018	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2019
2020	(void) pthread_mutex_lock(&cip->ci_lock);
2021	cip->ci_flags &= ~FMD_CF_DIRTY;
2022	(void) pthread_mutex_unlock(&cip->ci_lock);
2023}
2024
2025void
2026fmd_case_commit(fmd_case_t *cp)
2027{
2028	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2029	fmd_case_item_t *cit;
2030
2031	(void) pthread_mutex_lock(&cip->ci_lock);
2032
2033	if (cip->ci_flags & FMD_CF_DIRTY) {
2034		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
2035			fmd_event_commit(cit->cit_event);
2036
2037		if (cip->ci_principal != NULL)
2038			fmd_event_commit(cip->ci_principal);
2039
2040		fmd_buf_hash_commit(&cip->ci_bufs);
2041		cip->ci_flags &= ~FMD_CF_DIRTY;
2042	}
2043
2044	(void) pthread_mutex_unlock(&cip->ci_lock);
2045}
2046
2047/*
2048 * On proxy side, send back repair/acquit/etc request to diagnosing side
2049 */
2050void
2051fmd_case_xprt_updated(fmd_case_t *cp)
2052{
2053	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2054	nvlist_t **nva;
2055	uint8_t *ba;
2056	int msg = B_TRUE;
2057	int count = 0;
2058	fmd_case_lst_t fcl;
2059
2060	ASSERT(cip->ci_xprt != NULL);
2061	(void) pthread_mutex_lock(&cip->ci_lock);
2062	ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
2063	nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
2064	fcl.fcl_countp = &count;
2065	fcl.fcl_maxcount = cip->ci_nsuspects;
2066	fcl.fcl_msgp = &msg;
2067	fcl.fcl_ba = ba;
2068	fcl.fcl_nva = nva;
2069	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
2070	(void) pthread_mutex_unlock(&cip->ci_lock);
2071	fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru,
2072	    count);
2073}
2074
2075/*
2076 * fmd_case_update_status() can be called on either the proxy side when a
2077 * list.suspect is received, or on the diagnosing side when an update request
2078 * is received from the proxy. It updates the status in the resource cache.
2079 */
2080void
2081fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup,
2082    uint8_t *diag_asrup)
2083{
2084	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2085	int count = 0;
2086	fmd_asru_update_status_t faus;
2087
2088	/*
2089	 * update status of resource cache entries
2090	 */
2091	faus.faus_countp = &count;
2092	faus.faus_maxcount = cip->ci_nsuspects;
2093	faus.faus_ba = statusp;
2094	faus.faus_proxy_asru = proxy_asrup;
2095	faus.faus_diag_asru = diag_asrup;
2096	faus.faus_is_proxy = (cip->ci_xprt != NULL);
2097	(void) pthread_mutex_lock(&cip->ci_lock);
2098	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status,
2099	    &faus);
2100	(void) pthread_mutex_unlock(&cip->ci_lock);
2101}
2102
2103/*
2104 * Called on either the proxy side or the diag side when a repair has taken
2105 * place on the other side but this side may know the asru "contains"
2106 * relationships.
2107 */
2108void
2109fmd_case_update_containees(fmd_case_t *cp)
2110{
2111	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2112
2113	(void) pthread_mutex_lock(&cip->ci_lock);
2114	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2115	    fmd_asru_update_containees, NULL);
2116	(void) pthread_mutex_unlock(&cip->ci_lock);
2117}
2118
2119/*
2120 * fmd_case_close_status() is called on diagnosing side when proxy side
2121 * has had a uuclose. It updates the status in the resource cache.
2122 */
2123void
2124fmd_case_close_status(fmd_case_t *cp)
2125{
2126	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2127	int count = 0;
2128	fmd_asru_close_status_t facs;
2129
2130	/*
2131	 * update status of resource cache entries
2132	 */
2133	facs.facs_countp = &count;
2134	facs.facs_maxcount = cip->ci_nsuspects;
2135	(void) pthread_mutex_lock(&cip->ci_lock);
2136	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status,
2137	    &facs);
2138	(void) pthread_mutex_unlock(&cip->ci_lock);
2139}
2140
2141/*
2142 * Indicate that the case may need to change state because one or more of the
2143 * ASRUs named as a suspect has changed state.  We examine all the suspects
2144 * and if none are still faulty, we initiate a case close transition.
2145 */
2146void
2147fmd_case_update(fmd_case_t *cp)
2148{
2149	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2150	uint_t cstate;
2151	int faulty = 0;
2152
2153	(void) pthread_mutex_lock(&cip->ci_lock);
2154	cstate = cip->ci_state;
2155
2156	if (cip->ci_state < FMD_CASE_SOLVED) {
2157		(void) pthread_mutex_unlock(&cip->ci_lock);
2158		return; /* update is not appropriate */
2159	}
2160
2161	if (cip->ci_flags & FMD_CF_REPAIRED) {
2162		(void) pthread_mutex_unlock(&cip->ci_lock);
2163		return; /* already repaired */
2164	}
2165
2166	TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid));
2167	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
2168	(void) pthread_mutex_unlock(&cip->ci_lock);
2169
2170	if (faulty) {
2171		nvlist_t *nvl;
2172		fmd_event_t *e;
2173		char *class;
2174
2175		TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid));
2176		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2177		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2178		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2179		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
2180		fmd_log_append(fmd.d_fltlog, e, cp);
2181		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
2182		fmd_dispq_dispatch(fmd.d_disp, e, class);
2183		return; /* one or more suspects are still marked faulty */
2184	}
2185
2186	if (cstate == FMD_CASE_CLOSED)
2187		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2188	else
2189		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2190}
2191
2192/*
2193 * Delete a closed case from the module's case list once the fmdo_close() entry
2194 * point has run to completion.  If the case is owned by a transport module,
2195 * tell the transport to proxy a case close on the other end of the transport.
2196 * Transition to the appropriate next state based on ci_flags.  This
2197 * function represents the end of CLOSE_WAIT and transitions the case to either
2198 * CLOSED or REPAIRED or discards it entirely because it was never solved;
2199 * refer to the topmost block comment explaining the state machine for details.
2200 */
2201void
2202fmd_case_delete(fmd_case_t *cp)
2203{
2204	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2205	fmd_modstat_t *msp;
2206	size_t buftotal;
2207
2208	TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid));
2209	ASSERT(fmd_module_locked(cip->ci_mod));
2210	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
2211	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
2212
2213	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
2214	msp = cip->ci_mod->mod_stats;
2215
2216	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
2217	msp->ms_caseopen.fmds_value.ui64--;
2218
2219	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
2220	msp->ms_buftotal.fmds_value.ui64 -= buftotal;
2221
2222	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
2223
2224	if (cip->ci_xprt == NULL)
2225		fmd_module_setcdirty(cip->ci_mod);
2226
2227	fmd_module_rele(cip->ci_mod);
2228	cip->ci_mod = fmd.d_rmod;
2229	fmd_module_hold(cip->ci_mod);
2230
2231	/*
2232	 * If the case has been solved, then retain it
2233	 * on the root module's case list at least until we're transitioned.
2234	 * Otherwise free the case with our final fmd_case_rele() below.
2235	 */
2236	if (cip->ci_flags & FMD_CF_SOLVED) {
2237		fmd_module_lock(cip->ci_mod);
2238		fmd_list_append(&cip->ci_mod->mod_cases, cip);
2239		fmd_module_unlock(cip->ci_mod);
2240		fmd_case_hold(cp);
2241	}
2242
2243	/*
2244	 * Transition onwards to REPAIRED or CLOSED as originally requested.
2245	 * Note that for proxy case if we're transitioning to CLOSED it means
2246	 * the case was isolated locally, so call fmd_xprt_uuclose() to notify
2247	 * the diagnosing side. No need to notify the diagnosing side if we are
2248	 * transitioning to REPAIRED as we only do this when requested to do
2249	 * so by the diagnosing side anyway.
2250	 */
2251	if (cip->ci_flags & FMD_CF_REPAIRED)
2252		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
2253	else if (cip->ci_flags & FMD_CF_ISOLATED) {
2254		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
2255		if (cip->ci_xprt != NULL)
2256			fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
2257	}
2258
2259	fmd_case_rele(cp);
2260}
2261
2262void
2263fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache)
2264{
2265	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2266
2267	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
2268	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
2269	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
2270
2271	ASSERT(fmd_module_locked(cip->ci_mod));
2272	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
2273	if (delete_from_asru_cache) {
2274		(void) pthread_mutex_lock(&cip->ci_lock);
2275		fmd_asru_hash_delete_case(fmd.d_asrus, cp);
2276		(void) pthread_mutex_unlock(&cip->ci_lock);
2277	}
2278	fmd_case_rele(cp);
2279}
2280
2281/*
2282 * Indicate that the problem corresponding to a case has been repaired by
2283 * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
2284 * already been closed, this function initiates the transition to CLOSE_WAIT.
2285 * The caller must have the case held from fmd_case_hash_lookup(), so we can
2286 * grab and drop ci_lock without the case being able to be freed in between.
2287 */
2288int
2289fmd_case_repair(fmd_case_t *cp)
2290{
2291	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2292	uint_t cstate;
2293	fmd_asru_rep_arg_t fara;
2294
2295	(void) pthread_mutex_lock(&cip->ci_lock);
2296	cstate = cip->ci_state;
2297
2298	if (cstate < FMD_CASE_SOLVED) {
2299		(void) pthread_mutex_unlock(&cip->ci_lock);
2300		return (fmd_set_errno(EFMD_CASE_STATE));
2301	}
2302
2303	if (cip->ci_flags & FMD_CF_REPAIRED) {
2304		(void) pthread_mutex_unlock(&cip->ci_lock);
2305		return (0); /* already repaired */
2306	}
2307
2308	TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid));
2309	fara.fara_reason = FMD_ASRU_REPAIRED;
2310	fara.fara_bywhat = FARA_BY_CASE;
2311	fara.fara_rval = NULL;
2312	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
2313	(void) pthread_mutex_unlock(&cip->ci_lock);
2314
2315	/*
2316	 * if this is a proxied case, send the repair across the transport.
2317	 * The remote side will then do the repair and send a list.repaired back
2318	 * again such that we can finally repair the case on this side.
2319	 */
2320	if (cip->ci_xprt != NULL) {
2321		fmd_case_xprt_updated(cp);
2322		return (0);
2323	}
2324
2325	if (cstate == FMD_CASE_CLOSED)
2326		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2327	else
2328		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2329
2330	return (0);
2331}
2332
2333int
2334fmd_case_acquit(fmd_case_t *cp)
2335{
2336	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2337	uint_t cstate;
2338	fmd_asru_rep_arg_t fara;
2339
2340	(void) pthread_mutex_lock(&cip->ci_lock);
2341	cstate = cip->ci_state;
2342
2343	if (cstate < FMD_CASE_SOLVED) {
2344		(void) pthread_mutex_unlock(&cip->ci_lock);
2345		return (fmd_set_errno(EFMD_CASE_STATE));
2346	}
2347
2348	if (cip->ci_flags & FMD_CF_REPAIRED) {
2349		(void) pthread_mutex_unlock(&cip->ci_lock);
2350		return (0); /* already repaired */
2351	}
2352
2353	TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid));
2354	fara.fara_reason = FMD_ASRU_ACQUITTED;
2355	fara.fara_bywhat = FARA_BY_CASE;
2356	fara.fara_rval = NULL;
2357	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
2358	(void) pthread_mutex_unlock(&cip->ci_lock);
2359
2360	/*
2361	 * if this is a proxied case, send the repair across the transport.
2362	 * The remote side will then do the repair and send a list.repaired back
2363	 * again such that we can finally repair the case on this side.
2364	 */
2365	if (cip->ci_xprt != NULL) {
2366		fmd_case_xprt_updated(cp);
2367		return (0);
2368	}
2369
2370	if (cstate == FMD_CASE_CLOSED)
2371		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2372	else
2373		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2374
2375	return (0);
2376}
2377
2378int
2379fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
2380{
2381	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2382	fmd_case_item_t *cit;
2383	uint_t state;
2384	int rv = 0;
2385
2386	(void) pthread_mutex_lock(&cip->ci_lock);
2387
2388	if (cip->ci_state >= FMD_CASE_SOLVED)
2389		state = FMD_EVS_DIAGNOSED;
2390	else
2391		state = FMD_EVS_ACCEPTED;
2392
2393	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
2394		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
2395			break;
2396	}
2397
2398	if (rv == 0 && cip->ci_principal != NULL)
2399		rv = fmd_event_equal(ep, cip->ci_principal);
2400
2401	(void) pthread_mutex_unlock(&cip->ci_lock);
2402
2403	if (rv != 0)
2404		fmd_event_transition(ep, state);
2405
2406	return (rv);
2407}
2408
2409int
2410fmd_case_orphaned(fmd_case_t *cp)
2411{
2412	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
2413}
2414
2415void
2416fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
2417{
2418	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
2419	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
2420	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
2421}
2422
2423void
2424fmd_case_set_injected(fmd_case_t *cp)
2425{
2426	((fmd_case_impl_t *)cp)->ci_injected = 1;
2427}
2428
2429void
2430fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl)
2431{
2432	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2433
2434	if (cip->ci_diag_de)
2435		nvlist_free(cip->ci_diag_de);
2436	cip->ci_diag_de = nvl;
2437}
2438
2439void
2440fmd_case_setcode(fmd_case_t *cp, char *code)
2441{
2442	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2443
2444	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
2445	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
2446}
2447
2448/*ARGSUSED*/
2449static void
2450fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
2451{
2452	int not_faulty = 0;
2453	int faulty = 0;
2454	nvlist_t *nvl;
2455	fmd_event_t *e;
2456	char *class;
2457	int any_unusable_and_present = 0;
2458	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2459
2460	if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL)
2461		return;
2462
2463	if (cip->ci_state == FMD_CASE_RESOLVED) {
2464		cip->ci_flags |= FMD_CF_RES_CMPL;
2465		return;
2466	}
2467
2468	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
2469	fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty,
2470	    &not_faulty);
2471
2472	if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) {
2473		/*
2474		 * If none of the suspects is faulty, replay the list.repaired.
2475		 * If all suspects are already either usable or not present then
2476		 * also transition straight to RESOLVED state.
2477		 */
2478		fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2479		    fmd_case_unusable_and_present, &any_unusable_and_present);
2480		if (!any_unusable_and_present) {
2481			cip->ci_state = FMD_CASE_RESOLVED;
2482
2483			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2484			    cip->ci_uuid));
2485			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2486			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2487			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2488			    class);
2489			fmd_dispq_dispatch(fmd.d_disp, e, class);
2490
2491			TRACE((FMD_DBG_CASE, "replay sending list.resolved %s",
2492			    cip->ci_uuid));
2493			fmd_case_publish(cp, FMD_CASE_RESOLVED);
2494			fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2495			    fmd_asru_log_resolved, NULL);
2496			cip->ci_flags |= FMD_CF_RES_CMPL;
2497		} else {
2498			TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2499			    cip->ci_uuid));
2500			nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2501			(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2502			e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2503			    class);
2504			fmd_dispq_dispatch(fmd.d_disp, e, class);
2505		}
2506	} else if (faulty && not_faulty) {
2507		/*
2508		 * if some but not all of the suspects are not faulty, replay
2509		 * the list.updated.
2510		 */
2511		TRACE((FMD_DBG_CASE, "replay sending list.updated %s",
2512		    cip->ci_uuid));
2513		nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2514		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2515		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2516		fmd_dispq_dispatch(fmd.d_disp, e, class);
2517	}
2518}
2519
2520void
2521fmd_case_repair_replay()
2522{
2523	fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL);
2524}
2525