1/*-
2 * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions, and the following disclaimer,
10 *    without modification.
11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12 *    substantially similar to the "NO WARRANTY" disclaimer below
13 *    ("Disclaimer") and any redistribution must be conditioned upon
14 *    including a substantially similar Disclaimer requirement for further
15 *    binary redistribution.
16 *
17 * NO WARRANTY
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGES.
29 *
30 * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31 */
32
33/**
34 * \file case_file.cc
35 *
36 * We keep case files for any leaf vdev that is not in the optimal state.
37 * However, we only serialize to disk those events that need to be preserved
38 * across reboots.  For now, this is just a log of soft errors which we
39 * accumulate in order to mark a device as degraded.
40 */
41#include <sys/cdefs.h>
42#include <sys/byteorder.h>
43#include <sys/time.h>
44
45#include <sys/fs/zfs.h>
46
47#include <dirent.h>
48#include <fcntl.h>
49#include <iomanip>
50#include <fstream>
51#include <functional>
52#include <sstream>
53#include <syslog.h>
54#include <unistd.h>
55
56#include <libzfs.h>
57
58#include <list>
59#include <map>
60#include <string>
61
62#include <devdctl/guid.h>
63#include <devdctl/event.h>
64#include <devdctl/event_factory.h>
65#include <devdctl/exception.h>
66#include <devdctl/consumer.h>
67
68#include "callout.h"
69#include "vdev_iterator.h"
70#include "zfsd_event.h"
71#include "case_file.h"
72#include "vdev.h"
73#include "zfsd.h"
74#include "zfsd_exception.h"
75#include "zpool_list.h"
76
77__FBSDID("$FreeBSD$");
78
79/*============================ Namespace Control =============================*/
80using std::hex;
81using std::ifstream;
82using std::stringstream;
83using std::setfill;
84using std::setw;
85
86using DevdCtl::Event;
87using DevdCtl::EventFactory;
88using DevdCtl::EventList;
89using DevdCtl::Guid;
90using DevdCtl::ParseException;
91
92/*--------------------------------- CaseFile ---------------------------------*/
93//- CaseFile Static Data -------------------------------------------------------
94
/*
 * Every live CaseFile.  The CaseFile constructor appends the new object to
 * this list and the destructor removes it again.
 */
CaseFileList  CaseFile::s_activeCases;
/* Directory in which serialized case files are stored. */
const string  CaseFile::s_caseFilePath = "/var/db/zfsd/cases";
/*
 * Grace period used by RegisterCallout(): the tentative-event timer is
 * armed to fire this long after an event's timestamp.
 */
const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/};
98
99//- CaseFile Static Public Methods ---------------------------------------------
100CaseFile *
101CaseFile::Find(Guid poolGUID, Guid vdevGUID)
102{
103	for (CaseFileList::iterator curCase = s_activeCases.begin();
104	     curCase != s_activeCases.end(); curCase++) {
105
106		if (((*curCase)->PoolGUID() != poolGUID
107		  && Guid::InvalidGuid() != poolGUID)
108		 || (*curCase)->VdevGUID() != vdevGUID)
109			continue;
110
111		/*
112		 * We only carry one active case per-vdev.
113		 */
114		return (*curCase);
115	}
116	return (NULL);
117}
118
119CaseFile *
120CaseFile::Find(const string &physPath)
121{
122	CaseFile *result = NULL;
123
124	for (CaseFileList::iterator curCase = s_activeCases.begin();
125	     curCase != s_activeCases.end(); curCase++) {
126
127		if ((*curCase)->PhysicalPath() != physPath)
128			continue;
129
130		if (result != NULL) {
131			syslog(LOG_WARNING, "Multiple casefiles found for "
132			    "physical path %s.  "
133			    "This is most likely a bug in zfsd",
134			    physPath.c_str());
135		}
136		result = *curCase;
137	}
138	return (result);
139}
140
141
142void
143CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event)
144{
145	CaseFileList::iterator casefile;
146	for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){
147		CaseFileList::iterator next = casefile;
148		next++;
149		if (poolGUID == (*casefile)->PoolGUID())
150			(*casefile)->ReEvaluate(event);
151		casefile = next;
152	}
153}
154
155CaseFile &
156CaseFile::Create(Vdev &vdev)
157{
158	CaseFile *activeCase;
159
160	activeCase = Find(vdev.PoolGUID(), vdev.GUID());
161	if (activeCase == NULL)
162		activeCase = new CaseFile(vdev);
163
164	return (*activeCase);
165}
166
167void
168CaseFile::DeSerialize()
169{
170	struct dirent **caseFiles;
171
172	int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles,
173			 DeSerializeSelector, /*compar*/NULL));
174
175	if (numCaseFiles == -1)
176		return;
177	if (numCaseFiles == 0) {
178		free(caseFiles);
179		return;
180	}
181
182	for (int i = 0; i < numCaseFiles; i++) {
183
184		DeSerializeFile(caseFiles[i]->d_name);
185		free(caseFiles[i]);
186	}
187	free(caseFiles);
188}
189
190bool
191CaseFile::Empty()
192{
193	return (s_activeCases.empty());
194}
195
196void
197CaseFile::LogAll()
198{
199	for (CaseFileList::iterator curCase = s_activeCases.begin();
200	     curCase != s_activeCases.end(); curCase++)
201		(*curCase)->Log();
202}
203
204void
205CaseFile::PurgeAll()
206{
207	/*
208	 * Serialize casefiles before deleting them so that they can be reread
209	 * and revalidated during BuildCaseFiles.
210	 * CaseFiles remove themselves from this list on destruction.
211	 */
212	while (s_activeCases.size() != 0) {
213		CaseFile *casefile = s_activeCases.front();
214		casefile->Serialize();
215		delete casefile;
216	}
217
218}
219
220//- CaseFile Public Methods ----------------------------------------------------
221bool
222CaseFile::RefreshVdevState()
223{
224	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
225	zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front());
226	if (casePool == NULL)
227		return (false);
228
229	Vdev vd(casePool, CaseVdev(casePool));
230	if (vd.DoesNotExist())
231		return (false);
232
233	m_vdevState    = vd.State();
234	m_vdevPhysPath = vd.PhysicalPath();
235	return (true);
236}
237
238bool
239CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
240{
241	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
242	zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front());
243
244	if (pool == NULL || !RefreshVdevState()) {
245		/*
246		 * The pool or vdev for this case file is no longer
247		 * part of the configuration.  This can happen
248		 * if we process a device arrival notification
249		 * before seeing the ZFS configuration change
250		 * event.
251		 */
252		syslog(LOG_INFO,
253		       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured.  "
254		       "Closing\n",
255		       PoolGUIDString().c_str(),
256		       VdevGUIDString().c_str());
257		Close();
258
259		/*
260		 * Since this event was not used to close this
261		 * case, do not report it as consumed.
262		 */
263		return (/*consumed*/false);
264	}
265
266	if (VdevState() > VDEV_STATE_CANT_OPEN) {
267		/*
268		 * For now, newly discovered devices only help for
269		 * devices that are missing.  In the future, we might
270		 * use a newly inserted spare to replace a degraded
271		 * or faulted device.
272		 */
273		syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
274		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
275		return (/*consumed*/false);
276	}
277
278	if (vdev != NULL
279	 && ( vdev->PoolGUID() == m_poolGUID
280	   || vdev->PoolGUID() == Guid::InvalidGuid())
281	 && vdev->GUID() == m_vdevGUID) {
282
283		zpool_vdev_online(pool, vdev->GUIDString().c_str(),
284				  ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE,
285				  &m_vdevState);
286		syslog(LOG_INFO, "Onlined vdev(%s/%s:%s).  State now %s.\n",
287		       zpool_get_name(pool), vdev->GUIDString().c_str(),
288		       devPath.c_str(),
289		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
290
291		/*
292		 * Check the vdev state post the online action to see
293		 * if we can retire this case.
294		 */
295		CloseIfSolved();
296
297		return (/*consumed*/true);
298	}
299
300	/*
301	 * If the auto-replace policy is enabled, and we have physical
302	 * path information, try a physical path replacement.
303	 */
304	if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) {
305		syslog(LOG_INFO,
306		       "CaseFile(%s:%s:%s): AutoReplace not set.  "
307		       "Ignoring device insertion.\n",
308		       PoolGUIDString().c_str(),
309		       VdevGUIDString().c_str(),
310		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
311		return (/*consumed*/false);
312	}
313
314	if (PhysicalPath().empty()) {
315		syslog(LOG_INFO,
316		       "CaseFile(%s:%s:%s): No physical path information.  "
317		       "Ignoring device insertion.\n",
318		       PoolGUIDString().c_str(),
319		       VdevGUIDString().c_str(),
320		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
321		return (/*consumed*/false);
322	}
323
324	if (physPath != PhysicalPath()) {
325		syslog(LOG_INFO,
326		       "CaseFile(%s:%s:%s): Physical path mismatch.  "
327		       "Ignoring device insertion.\n",
328		       PoolGUIDString().c_str(),
329		       VdevGUIDString().c_str(),
330		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
331		return (/*consumed*/false);
332	}
333
334	/* Write a label on the newly inserted disk. */
335	if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) {
336		syslog(LOG_ERR,
337		       "Replace vdev(%s/%s) by physical path (label): %s: %s\n",
338		       zpool_get_name(pool), VdevGUIDString().c_str(),
339		       libzfs_error_action(g_zfsHandle),
340		       libzfs_error_description(g_zfsHandle));
341		return (/*consumed*/false);
342	}
343
344	syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s",
345	    PoolGUIDString().c_str(), VdevGUIDString().c_str(),
346	    devPath.c_str());
347	return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false));
348}
349
350bool
351CaseFile::ReEvaluate(const ZfsEvent &event)
352{
353	bool consumed(false);
354
355	if (event.Value("type") == "misc.fs.zfs.vdev_remove") {
356		/*
357		 * The Vdev we represent has been removed from the
358		 * configuration.  This case is no longer of value.
359		 */
360		Close();
361
362		return (/*consumed*/true);
363	} else if (event.Value("type") == "misc.fs.zfs.pool_destroy") {
364		/* This Pool has been destroyed.  Discard the case */
365		Close();
366
367		return (/*consumed*/true);
368	} else if (event.Value("type") == "misc.fs.zfs.config_sync") {
369		RefreshVdevState();
370		if (VdevState() < VDEV_STATE_HEALTHY)
371			consumed = ActivateSpare();
372	}
373
374
375	if (event.Value("class") == "resource.fs.zfs.removed") {
376		bool spare_activated;
377
378		if (!RefreshVdevState()) {
379			/*
380			 * The pool or vdev for this case file is no longer
381			 * part of the configuration.  This can happen
382			 * if we process a device arrival notification
383			 * before seeing the ZFS configuration change
384			 * event.
385			 */
386			syslog(LOG_INFO,
387			       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev "
388			       "unconfigured.  Closing\n",
389			       PoolGUIDString().c_str(),
390			       VdevGUIDString().c_str());
391			/*
392			 * Close the case now so we won't waste cycles in the
393			 * system rescan
394			 */
395			Close();
396
397			/*
398			 * Since this event was not used to close this
399			 * case, do not report it as consumed.
400			 */
401			return (/*consumed*/false);
402		}
403
404		/*
405		 * Discard any tentative I/O error events for
406		 * this case.  They were most likely caused by the
407		 * hot-unplug of this device.
408		 */
409		PurgeTentativeEvents();
410
411		/* Try to activate spares if they are available */
412		spare_activated = ActivateSpare();
413
414		/*
415		 * Rescan the drives in the system to see if a recent
416		 * drive arrival can be used to solve this case.
417		 */
418		ZfsDaemon::RequestSystemRescan();
419
420		/*
421		 * Consume the event if we successfully activated a spare.
422		 * Otherwise, leave it in the unconsumed events list so that the
423		 * future addition of a spare to this pool might be able to
424		 * close the case
425		 */
426		consumed = spare_activated;
427	} else if (event.Value("class") == "resource.fs.zfs.statechange") {
428		RefreshVdevState();
429		/*
430		 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to
431		 * activate a hotspare.  Otherwise, ignore the event
432		 */
433		if (VdevState() == VDEV_STATE_FAULTED ||
434		    VdevState() == VDEV_STATE_DEGRADED ||
435		    VdevState() == VDEV_STATE_CANT_OPEN)
436			(void) ActivateSpare();
437		consumed = true;
438	}
439	else if (event.Value("class") == "ereport.fs.zfs.io" ||
440	         event.Value("class") == "ereport.fs.zfs.checksum") {
441
442		m_tentativeEvents.push_front(event.DeepCopy());
443		RegisterCallout(event);
444		consumed = true;
445	}
446
447	bool closed(CloseIfSolved());
448
449	return (consumed || closed);
450}
451
452/* Find a Vdev containing the vdev with the given GUID */
453static nvlist_t*
454find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid)
455{
456	nvlist_t **vdevChildren;
457	int        error;
458	unsigned   ch, numChildren;
459
460	error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
461					   &vdevChildren, &numChildren);
462
463	if (error != 0 || numChildren == 0)
464		return (NULL);
465
466	for (ch = 0; ch < numChildren; ch++) {
467		nvlist *result;
468		Vdev vdev(pool_config, vdevChildren[ch]);
469
470		if (vdev.GUID() == child_guid)
471			return (config);
472
473		result = find_parent(pool_config, vdevChildren[ch], child_guid);
474		if (result != NULL)
475			return (result);
476	}
477
478	return (NULL);
479}
480
481bool
482CaseFile::ActivateSpare() {
483	nvlist_t	*config, *nvroot, *parent_config;
484	nvlist_t       **spares;
485	char		*devPath, *vdev_type;
486	const char	*poolname;
487	u_int		 nspares, i;
488	int		 error;
489
490	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
491	zpool_handle_t	*zhp(zpl.empty() ? NULL : zpl.front());
492	if (zhp == NULL) {
493		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
494		       "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID);
495		return (false);
496	}
497	poolname = zpool_get_name(zhp);
498	config = zpool_get_config(zhp, NULL);
499	if (config == NULL) {
500		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
501		       "config for pool %s", poolname);
502		return (false);
503	}
504	error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
505	if (error != 0){
506		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev "
507		       "tree for pool %s", poolname);
508		return (false);
509	}
510
511	parent_config = find_parent(config, nvroot, m_vdevGUID);
512	if (parent_config != NULL) {
513		char *parent_type;
514
515		/*
516		 * Don't activate spares for members of a "replacing" vdev.
517		 * They're already dealt with.  Sparing them will just drag out
518		 * the resilver process.
519		 */
520		error = nvlist_lookup_string(parent_config,
521		    ZPOOL_CONFIG_TYPE, &parent_type);
522		if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0)
523			return (false);
524	}
525
526	nspares = 0;
527	nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
528				   &nspares);
529	if (nspares == 0) {
530		/* The pool has no spares configured */
531		syslog(LOG_INFO, "CaseFile::ActivateSpare: "
532		       "No spares available for pool %s", poolname);
533		return (false);
534	}
535	for (i = 0; i < nspares; i++) {
536		uint64_t    *nvlist_array;
537		vdev_stat_t *vs;
538		uint_t	     nstats;
539
540		if (nvlist_lookup_uint64_array(spares[i],
541		    ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) {
542			syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not "
543			       "find vdev stats for pool %s, spare %d",
544			       poolname, i);
545			return (false);
546		}
547		vs = reinterpret_cast<vdev_stat_t *>(nvlist_array);
548
549		if ((vs->vs_aux != VDEV_AUX_SPARED)
550		 && (vs->vs_state == VDEV_STATE_HEALTHY)) {
551			/* We found a usable spare */
552			break;
553		}
554	}
555
556	if (i == nspares) {
557		/* No available spares were found */
558		return (false);
559	}
560
561	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath);
562	if (error != 0) {
563		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
564		       "the path of pool %s, spare %d. Error %d",
565		       poolname, i, error);
566		return (false);
567	}
568
569	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type);
570	if (error != 0) {
571		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
572		       "the vdev type of pool %s, spare %d. Error %d",
573		       poolname, i, error);
574		return (false);
575	}
576
577	return (Replace(vdev_type, devPath, /*isspare*/true));
578}
579
580void
581CaseFile::RegisterCallout(const Event &event)
582{
583	timeval now, countdown, elapsed, timestamp, zero, remaining;
584
585	gettimeofday(&now, 0);
586	timestamp = event.GetTimestamp();
587	timersub(&now, &timestamp, &elapsed);
588	timersub(&s_removeGracePeriod, &elapsed, &countdown);
589	/*
590	 * If countdown is <= zero, Reset the timer to the
591	 * smallest positive time value instead
592	 */
593	timerclear(&zero);
594	if (timercmp(&countdown, &zero, <=)) {
595		timerclear(&countdown);
596		countdown.tv_usec = 1;
597	}
598
599	remaining = m_tentativeTimer.TimeRemaining();
600
601	if (!m_tentativeTimer.IsPending()
602	 || timercmp(&countdown, &remaining, <))
603		m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this);
604}
605
606
607bool
608CaseFile::CloseIfSolved()
609{
610	if (m_events.empty()
611	 && m_tentativeEvents.empty()) {
612
613		/*
614		 * We currently do not track or take actions on
615		 * devices in the degraded or faulted state.
616		 * Once we have support for spare pools, we'll
617		 * retain these cases so that any spares added in
618		 * the future can be applied to them.
619		 */
620		switch (VdevState()) {
621		case VDEV_STATE_HEALTHY:
622			/* No need to keep cases for healthy vdevs */
623			Close();
624			return (true);
625		case VDEV_STATE_REMOVED:
626		case VDEV_STATE_CANT_OPEN:
627			/*
628			 * Keep open.  We may solve it with a newly inserted
629			 * device.
630			 */
631		case VDEV_STATE_FAULTED:
632		case VDEV_STATE_DEGRADED:
633			/*
634			 * Keep open.  We may solve it with the future
635			 * addition of a spare to the pool
636			 */
637		case VDEV_STATE_UNKNOWN:
638		case VDEV_STATE_CLOSED:
639		case VDEV_STATE_OFFLINE:
640			/*
641			 * Keep open?  This may not be the correct behavior,
642			 * but it's what we've always done
643			 */
644			;
645		}
646
647		/*
648		 * Re-serialize the case in order to remove any
649		 * previous event data.
650		 */
651		Serialize();
652	}
653
654	return (false);
655}
656
657void
658CaseFile::Log()
659{
660	syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(),
661	       VdevGUIDString().c_str(), PhysicalPath().c_str());
662	syslog(LOG_INFO, "\tVdev State = %s\n",
663	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
664	if (m_tentativeEvents.size() != 0) {
665		syslog(LOG_INFO, "\t=== Tentative Events ===\n");
666		for (EventList::iterator event(m_tentativeEvents.begin());
667		     event != m_tentativeEvents.end(); event++)
668			(*event)->Log(LOG_INFO);
669	}
670	if (m_events.size() != 0) {
671		syslog(LOG_INFO, "\t=== Events ===\n");
672		for (EventList::iterator event(m_events.begin());
673		     event != m_events.end(); event++)
674			(*event)->Log(LOG_INFO);
675	}
676}
677
678//- CaseFile Static Protected Methods ------------------------------------------
679void
680CaseFile::OnGracePeriodEnded(void *arg)
681{
682	CaseFile &casefile(*static_cast<CaseFile *>(arg));
683
684	casefile.OnGracePeriodEnded();
685}
686
687int
688CaseFile::DeSerializeSelector(const struct dirent *dirEntry)
689{
690	uint64_t poolGUID;
691	uint64_t vdevGUID;
692
693	if (dirEntry->d_type == DT_REG
694	 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
695		   &poolGUID, &vdevGUID) == 2)
696		return (1);
697	return (0);
698}
699
700void
701CaseFile::DeSerializeFile(const char *fileName)
702{
703	string	  fullName(s_caseFilePath + '/' + fileName);
704	CaseFile *existingCaseFile(NULL);
705	CaseFile *caseFile(NULL);
706
707	try {
708		uint64_t poolGUID;
709		uint64_t vdevGUID;
710		nvlist_t *vdevConf;
711
712		if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
713		       &poolGUID, &vdevGUID) != 2) {
714			throw ZfsdException("CaseFile::DeSerialize: "
715			    "Unintelligible CaseFile filename %s.\n", fileName);
716		}
717		existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID));
718		if (existingCaseFile != NULL) {
719			/*
720			 * If the vdev is already degraded or faulted,
721			 * there's no point in keeping the state around
722			 * that we use to put a drive into the degraded
723			 * state.  However, if the vdev is simply missing,
724			 * preserve the case data in the hopes that it will
725			 * return.
726			 */
727			caseFile = existingCaseFile;
728			vdev_state curState(caseFile->VdevState());
729			if (curState > VDEV_STATE_CANT_OPEN
730			 && curState < VDEV_STATE_HEALTHY) {
731				unlink(fileName);
732				return;
733			}
734		} else {
735			ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
736			if (zpl.empty()
737			 || (vdevConf = VdevIterator(zpl.front())
738						    .Find(vdevGUID)) == NULL) {
739				/*
740				 * Either the pool no longer exists
741				 * or this vdev is no longer a member of
742				 * the pool.
743				 */
744				unlink(fullName.c_str());
745				return;
746			}
747
748			/*
749			 * Any vdev we find that does not have a case file
750			 * must be in the healthy state and thus worthy of
751			 * continued SERD data tracking.
752			 */
753			caseFile = new CaseFile(Vdev(zpl.front(), vdevConf));
754		}
755
756		ifstream caseStream(fullName.c_str());
757		if (!caseStream)
758			throw ZfsdException("CaseFile::DeSerialize: Unable to "
759					    "read %s.\n", fileName);
760
761		caseFile->DeSerialize(caseStream);
762	} catch (const ParseException &exp) {
763
764		exp.Log();
765		if (caseFile != existingCaseFile)
766			delete caseFile;
767
768		/*
769		 * Since we can't parse the file, unlink it so we don't
770		 * trip over it again.
771		 */
772		unlink(fileName);
773	} catch (const ZfsdException &zfsException) {
774
775		zfsException.Log();
776		if (caseFile != existingCaseFile)
777			delete caseFile;
778	}
779}
780
781//- CaseFile Protected Methods -------------------------------------------------
782CaseFile::CaseFile(const Vdev &vdev)
783 : m_poolGUID(vdev.PoolGUID()),
784   m_vdevGUID(vdev.GUID()),
785   m_vdevState(vdev.State()),
786   m_vdevPhysPath(vdev.PhysicalPath())
787{
788	stringstream guidString;
789
790	guidString << m_vdevGUID;
791	m_vdevGUIDString = guidString.str();
792	guidString.str("");
793	guidString << m_poolGUID;
794	m_poolGUIDString = guidString.str();
795
796	s_activeCases.push_back(this);
797
798	syslog(LOG_INFO, "Creating new CaseFile:\n");
799	Log();
800}
801
802CaseFile::~CaseFile()
803{
804	PurgeEvents();
805	PurgeTentativeEvents();
806	m_tentativeTimer.Stop();
807	s_activeCases.remove(this);
808}
809
810void
811CaseFile::PurgeEvents()
812{
813	for (EventList::iterator event(m_events.begin());
814	     event != m_events.end(); event++)
815		delete *event;
816
817	m_events.clear();
818}
819
820void
821CaseFile::PurgeTentativeEvents()
822{
823	for (EventList::iterator event(m_tentativeEvents.begin());
824	     event != m_tentativeEvents.end(); event++)
825		delete *event;
826
827	m_tentativeEvents.clear();
828}
829
830void
831CaseFile::SerializeEvList(const EventList events, int fd,
832		const char* prefix) const
833{
834	if (events.empty())
835		return;
836	for (EventList::const_iterator curEvent = events.begin();
837	     curEvent != events.end(); curEvent++) {
838		const string &eventString((*curEvent)->GetEventString());
839
840		// TODO: replace many write(2) calls with a single writev(2)
841		if (prefix)
842			write(fd, prefix, strlen(prefix));
843		write(fd, eventString.c_str(), eventString.length());
844	}
845}
846
847void
848CaseFile::Serialize()
849{
850	stringstream saveFile;
851
852	saveFile << setfill('0')
853		 << s_caseFilePath << "/"
854		 << "pool_" << PoolGUIDString()
855		 << "_vdev_" << VdevGUIDString()
856		 << ".case";
857
858	if (m_events.empty() && m_tentativeEvents.empty()) {
859		unlink(saveFile.str().c_str());
860		return;
861	}
862
863	int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644));
864	if (fd == -1) {
865		syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n",
866		       saveFile.str().c_str());
867		return;
868	}
869	SerializeEvList(m_events, fd);
870	SerializeEvList(m_tentativeEvents, fd, "tentative ");
871	close(fd);
872}
873
874/*
875 * XXX: This method assumes that events may not contain embedded newlines.  If
876 * ever events can contain embedded newlines, then CaseFile must switch
877 * serialization formats
878 */
879void
880CaseFile::DeSerialize(ifstream &caseStream)
881{
882	string	      evString;
883	const EventFactory &factory(ZfsDaemon::Get().GetFactory());
884
885	caseStream >> std::noskipws >> std::ws;
886	while (caseStream.good()) {
887		/*
888		 * Outline:
889		 * read the beginning of a line and check it for
890		 * "tentative".  If found, discard "tentative".
891		 * Create a new event
892		 * continue
893		 */
894		EventList* destEvents;
895		const string tentFlag("tentative ");
896		string line;
897		std::stringbuf lineBuf;
898
899		caseStream.get(lineBuf);
900		caseStream.ignore();  /*discard the newline character*/
901		line = lineBuf.str();
902		if (line.compare(0, tentFlag.size(), tentFlag) == 0) {
903			/* Discard "tentative" */
904			line.erase(0, tentFlag.size());
905			destEvents = &m_tentativeEvents;
906		} else {
907			destEvents = &m_events;
908		}
909		Event *event(Event::CreateEvent(factory, line));
910		if (event != NULL) {
911			destEvents->push_back(event);
912			RegisterCallout(*event);
913		}
914	}
915}
916
917void
918CaseFile::Close()
919{
920	/*
921	 * This case is no longer relevant.  Clean up our
922	 * serialization file, and delete the case.
923	 */
924	syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n",
925	       PoolGUIDString().c_str(), VdevGUIDString().c_str(),
926	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
927
928	/*
929	 * Serialization of a Case with no event data, clears the
930	 * Serialization data for that event.
931	 */
932	PurgeEvents();
933	Serialize();
934
935	delete this;
936}
937
938void
939CaseFile::OnGracePeriodEnded()
940{
941	bool should_fault, should_degrade;
942	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
943	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
944
945	m_events.splice(m_events.begin(), m_tentativeEvents);
946	should_fault = ShouldFault();
947	should_degrade = ShouldDegrade();
948
949	if (should_fault || should_degrade) {
950		if (zhp == NULL
951		 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) {
952			/*
953			 * Either the pool no longer exists
954			 * or this vdev is no longer a member of
955			 * the pool.
956			 */
957			Close();
958			return;
959		}
960
961	}
962
963	/* A fault condition has priority over a degrade condition */
964	if (ShouldFault()) {
965		/* Fault the vdev and close the case. */
966		if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID,
967				       VDEV_AUX_ERR_EXCEEDED) == 0) {
968			syslog(LOG_INFO, "Faulting vdev(%s/%s)",
969			       PoolGUIDString().c_str(),
970			       VdevGUIDString().c_str());
971			Close();
972			return;
973		}
974		else {
975			syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n",
976			       PoolGUIDString().c_str(),
977			       VdevGUIDString().c_str(),
978			       libzfs_error_action(g_zfsHandle),
979			       libzfs_error_description(g_zfsHandle));
980		}
981	}
982	else if (ShouldDegrade()) {
983		/* Degrade the vdev and close the case. */
984		if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID,
985				       VDEV_AUX_ERR_EXCEEDED) == 0) {
986			syslog(LOG_INFO, "Degrading vdev(%s/%s)",
987			       PoolGUIDString().c_str(),
988			       VdevGUIDString().c_str());
989			Close();
990			return;
991		}
992		else {
993			syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n",
994			       PoolGUIDString().c_str(),
995			       VdevGUIDString().c_str(),
996			       libzfs_error_action(g_zfsHandle),
997			       libzfs_error_description(g_zfsHandle));
998		}
999	}
1000	Serialize();
1001}
1002
1003Vdev
1004CaseFile::BeingReplacedBy(zpool_handle_t *zhp) {
1005	Vdev vd(zhp, CaseVdev(zhp));
1006	std::list<Vdev> children;
1007	std::list<Vdev>::iterator children_it;
1008
1009	Vdev parent(vd.Parent());
1010	Vdev replacing(NonexistentVdev);
1011
1012	/*
1013	 * To determine whether we are being replaced by another spare that
1014	 * is still working, then make sure that it is currently spared and
1015	 * that the spare is either resilvering or healthy.  If any of these
1016	 * conditions fail, then we are not being replaced by a spare.
1017	 *
1018	 * If the spare is healthy, then the case file should be closed very
1019	 * soon after this check.
1020	 */
1021	if (parent.DoesNotExist()
1022	 || parent.Name(zhp, /*verbose*/false) != "spare")
1023		return (NonexistentVdev);
1024
1025	children = parent.Children();
1026	children_it = children.begin();
1027	for (;children_it != children.end(); children_it++) {
1028		Vdev child = *children_it;
1029
1030		/* Skip our vdev. */
1031		if (child.GUID() == VdevGUID())
1032			continue;
1033		/*
1034		 * Accept the first child that doesn't match our GUID, or
1035		 * any resilvering/healthy device if one exists.
1036		 */
1037		if (replacing.DoesNotExist() || child.IsResilvering()
1038		 || child.State() == VDEV_STATE_HEALTHY)
1039			replacing = child;
1040	}
1041
1042	return (replacing);
1043}
1044
1045bool
1046CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) {
1047	nvlist_t *nvroot, *newvd;
1048	const char *poolname;
1049	string oldstr(VdevGUIDString());
1050	bool retval = true;
1051
1052	/* Figure out what pool we're working on */
1053	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
1054	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
1055	if (zhp == NULL) {
1056		syslog(LOG_ERR, "CaseFile::Replace: could not find pool for "
1057		       "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID);
1058		return (false);
1059	}
1060	poolname = zpool_get_name(zhp);
1061	Vdev vd(zhp, CaseVdev(zhp));
1062	Vdev replaced(BeingReplacedBy(zhp));
1063
1064	if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) {
1065		/* If we are already being replaced by a working spare, pass. */
1066		if (replaced.IsResilvering()
1067		 || replaced.State() == VDEV_STATE_HEALTHY) {
1068			syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already "
1069			    "replaced", VdevGUIDString().c_str(), path);
1070			return (/*consumed*/false);
1071		}
1072		/*
1073		 * If we have already been replaced by a spare, but that spare
1074		 * is broken, we must spare the spare, not the original device.
1075		 */
1076		oldstr = replaced.GUIDString();
1077		syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing "
1078		    "broken spare %s instead", VdevGUIDString().c_str(),
1079		    path, oldstr.c_str());
1080	}
1081
1082	/*
1083	 * Build a root vdev/leaf vdev configuration suitable for
1084	 * zpool_vdev_attach. Only enough data for the kernel to find
1085	 * the device (i.e. type and disk device node path) are needed.
1086	 */
1087	nvroot = NULL;
1088	newvd = NULL;
1089
1090	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0
1091	 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
1092		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate "
1093		    "configuration data.", poolname, oldstr.c_str());
1094		if (nvroot != NULL)
1095			nvlist_free(nvroot);
1096		return (false);
1097	}
1098	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0
1099	 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0
1100	 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0
1101	 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1102				    &newvd, 1) != 0) {
1103		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize "
1104		    "configuration data.", poolname, oldstr.c_str());
1105		nvlist_free(newvd);
1106		nvlist_free(nvroot);
1107		return (true);
1108	}
1109
1110	/* Data was copied when added to the root vdev. */
1111	nvlist_free(newvd);
1112
1113	retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot,
1114       /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0);
1115	if (retval)
1116		syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n",
1117		    poolname, oldstr.c_str(), path);
1118	else
1119		syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n",
1120		    poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle),
1121		    libzfs_error_description(g_zfsHandle));
1122	nvlist_free(nvroot);
1123
1124	return (retval);
1125}
1126
1127/* Does the argument event refer to a checksum error? */
1128static bool
1129IsChecksumEvent(const Event* const event)
1130{
1131	return ("ereport.fs.zfs.checksum" == event->Value("type"));
1132}
1133
1134/* Does the argument event refer to an IO error? */
1135static bool
1136IsIOEvent(const Event* const event)
1137{
1138	return ("ereport.fs.zfs.io" == event->Value("type"));
1139}
1140
1141bool
1142CaseFile::ShouldDegrade() const
1143{
1144	return (std::count_if(m_events.begin(), m_events.end(),
1145			      IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT);
1146}
1147
1148bool
1149CaseFile::ShouldFault() const
1150{
1151	return (std::count_if(m_events.begin(), m_events.end(),
1152			      IsIOEvent) > ZFS_DEGRADE_IO_COUNT);
1153}
1154
1155nvlist_t *
1156CaseFile::CaseVdev(zpool_handle_t *zhp) const
1157{
1158	return (VdevIterator(zhp).Find(VdevGUID()));
1159}
1160