/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
 * You can obtain a copy of the license from the top-level file
 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
 * You may not use this file except in compliance with the license.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
 * Copyright (c) 2021 Hewlett Packard Enterprise Development LP
 */

#include <libnvpair.h>
#include <libzfs.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/list.h>
#include <sys/time.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>
#include <pthread.h>
#include <unistd.h>

#include "zfs_agents.h"
#include "fmd_api.h"
#include "../zed_log.h"

/*
 * agent dispatch code
 */

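/*
 * Events posted by zfs_agent_post_event() are queued on agent_events
 * and consumed by a single dispatch thread.  agent_lock protects the
 * list and the exit flag; agent_cond is signaled when an event is
 * queued or when shutdown begins.
 */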
static pthread_mutex_t	agent_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	agent_cond = PTHREAD_COND_INITIALIZER;
static list_t		agent_events;	/* list of pending events */
static int		agent_exiting;

typedef struct agent_event {
	char		ae_class[64];
	char		ae_subclass[32];
	nvlist_t	*ae_nvl;
	list_node_t	ae_node;
} agent_event_t;

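/* event consumer thread, created by zfs_agent_init() */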
pthread_t g_agents_tid;

libzfs_handle_t *g_zfs_hdl;

/* guid search data */
typedef enum device_type {
	DEVICE_TYPE_L2ARC,	/* l2arc device */
	DEVICE_TYPE_SPARE,	/* spare device */
	DEVICE_TYPE_PRIMARY	/* any primary pool storage device */
} device_type_t;

typedef struct guid_search {
	uint64_t	gs_pool_guid;
	uint64_t	gs_vdev_guid;
	char		*gs_devid;
	device_type_t	gs_vdev_type;
	uint64_t	gs_vdev_expandtime;	/* vdev expansion time */
} guid_search_t;

/*
 * Walks the vdev tree recursively looking for a matching devid.
 * Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise.
 */
static boolean_t
zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
{
	guid_search_t *gsp = arg;
	char *path = NULL;
	uint_t c, children;
	nvlist_t **child;

	/*
	 * First iterate over any children.
	 */
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++) {
			if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
				gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY;
				return (B_TRUE);
			}
		}
	}
	/*
	 * Iterate over any spares and cache devices
	 */
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++) {
			if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
				gsp->gs_vdev_type = DEVICE_TYPE_SPARE;
				return (B_TRUE);
			}
		}
	}
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++) {
			if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
				gsp->gs_vdev_type = DEVICE_TYPE_L2ARC;
				return (B_TRUE);
			}
		}
	}
	/*
	 * On a devid match, grab the vdev guid and expansion time, if any.
	 */
	if (gsp->gs_devid != NULL &&
	    (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) &&
	    (strcmp(gsp->gs_devid, path) == 0)) {
		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
		    &gsp->gs_vdev_guid);
		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
		    &gsp->gs_vdev_expandtime);
		return (B_TRUE);
	}

	return (B_FALSE);
}

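/*
 * zpool_iter() callback: searches one pool's vdev tree (plus spares
 * and cache devices) for the devid in the guid_search_t.  Returns
 * nonzero to stop the iteration once a match has been found.
 */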
static int
zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
{
	guid_search_t *gsp = arg;
	nvlist_t *config, *nvl;

	/*
	 * For each vdev in this pool, look for a match by devid
	 */
	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvl) == 0) {
			(void) zfs_agent_iter_vdev(zhp, nvl, gsp);
		}
	}
	/*
	 * If a match was found, grab the pool guid.
	 */
	if (gsp->gs_vdev_guid) {
		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &gsp->gs_pool_guid);
	}

	zpool_close(zhp);
	return (gsp->gs_vdev_guid != 0);
}

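/*
 * Duplicate the event payload and queue it for the consumer thread.
 * Linux disk-removal sysevents are remapped to resource.fs.zfs.removed
 * here so the diagnosis engine sees the event class it expects.
 */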
void
zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
{
	agent_event_t *event;

	if (subclass == NULL)
		subclass = "";

	event = malloc(sizeof (agent_event_t));
	if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) {
		if (event)
			free(event);
		return;
	}

	if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) {
		class = EC_ZFS;
		subclass = ESC_ZFS_VDEV_CHECK;
	}

	/*
	 * On Linux, we don't get the expected FM_RESOURCE_REMOVED ereport
	 * from the vdev_disk layer after a hot unplug. Fortunately we do
	 * get an EC_DEV_REMOVE from our disk monitor and it is a suitable
	 * proxy so we remap it here for the benefit of the diagnosis engine.
	 * Starting in OpenZFS 2.0, we do get FM_RESOURCE_REMOVED from the spa
	 * layer. Processing multiple FM_RESOURCE_REMOVED events is not harmful.
	 */
	if ((strcmp(class, EC_DEV_REMOVE) == 0) &&
	    (strcmp(subclass, ESC_DISK) == 0) &&
	    (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) ||
	    nvlist_exists(nvl, DEV_IDENTIFIER))) {
		nvlist_t *payload = event->ae_nvl;
		struct timeval tv;
		int64_t tod[2];
		uint64_t pool_guid = 0, vdev_guid = 0;
		guid_search_t search = { 0 };
		device_type_t devtype = DEVICE_TYPE_PRIMARY;

		class = "resource.fs.zfs.removed";
		subclass = "";

		(void) nvlist_add_string(payload, FM_CLASS, class);
		(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
		(void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);

		(void) gettimeofday(&tv, NULL);
		tod[0] = tv.tv_sec;
		tod[1] = tv.tv_usec;
		(void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2);

		/*
		 * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or
		 * ZFS_EV_POOL_GUID may be missing so find them.
		 */
		if (pool_guid == 0 || vdev_guid == 0) {
			if ((nvlist_lookup_string(nvl, DEV_IDENTIFIER,
			    &search.gs_devid) == 0) &&
			    (zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search)
			    == 1)) {
				if (pool_guid == 0)
					pool_guid = search.gs_pool_guid;
				if (vdev_guid == 0)
					vdev_guid = search.gs_vdev_guid;
				devtype = search.gs_vdev_type;
			}
		}

		/*
		 * We want to avoid reporting "remove" events coming from
		 * libudev for VDEVs which were expanded recently (10s) and
		 * avoid activating spares in response to partitions being
		 * deleted and created in rapid succession.
		 */
		if (search.gs_vdev_expandtime != 0 &&
		    search.gs_vdev_expandtime + 10 > tv.tv_sec) {
			zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' "
			    "for recently expanded device '%s'", EC_DEV_REMOVE,
			    search.gs_devid);
			/* the event is never queued, so free it here */
			nvlist_free(event->ae_nvl);
			free(event);
			goto out;
		}

		(void) nvlist_add_uint64(payload,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid);
		(void) nvlist_add_uint64(payload,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid);
		switch (devtype) {
		case DEVICE_TYPE_L2ARC:
			(void) nvlist_add_string(payload,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
			    VDEV_TYPE_L2CACHE);
			break;
		case DEVICE_TYPE_SPARE:
			(void) nvlist_add_string(payload,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE);
			break;
		case DEVICE_TYPE_PRIMARY:
			(void) nvlist_add_string(payload,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK);
			break;
		}

		zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'",
		    EC_DEV_REMOVE, class);
	}

	(void) strlcpy(event->ae_class, class, sizeof (event->ae_class));
	(void) strlcpy(event->ae_subclass, subclass,
	    sizeof (event->ae_subclass));

	(void) pthread_mutex_lock(&agent_lock);
	list_insert_tail(&agent_events, event);
	(void) pthread_mutex_unlock(&agent_lock);

out:
	(void) pthread_cond_signal(&agent_cond);
}

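/*
 * Fan an event out to every interested subscriber: the zfs-diagnosis
 * engine, the zfs-retire agent, and the SLM module.
 */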
static void
zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl)
{
	/*
	 * The diagnosis engine subscribes to the following events.
	 * On illumos these subscriptions reside in:
	 *	/usr/lib/fm/fmd/plugins/zfs-diagnosis.conf
	 */
	if (strstr(class, "ereport.fs.zfs.") != NULL ||
	    strstr(class, "resource.fs.zfs.") != NULL ||
	    strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 ||
	    strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 ||
	    strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) {
		fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class);
	}

	/*
	 * The retire agent subscribes to the following events.
	 * On illumos these subscriptions reside in:
	 *	/usr/lib/fm/fmd/plugins/zfs-retire.conf
	 *
	 * NOTE: fault events come directly from our diagnosis engine
	 * and will not pass through the zfs kernel module.
	 */
	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
	    strcmp(class, "resource.fs.zfs.removed") == 0 ||
	    strcmp(class, "resource.fs.zfs.statechange") == 0 ||
	    strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
		fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class);
	}

	/*
	 * The SLM module only consumes disk events and vdev check events.
	 *
	 * NOTE: disk events come directly from the disk monitor and will
	 * not pass through the zfs kernel module.
	 */
	if (strstr(class, "EC_dev_") != NULL ||
	    strcmp(class, EC_ZFS) == 0) {
		(void) zfs_slm_event(class, subclass, nvl);
	}
}

/*
 * Events are consumed and dispatched from this thread.  An agent can
 * also post an event, so the event list lock is not held when calling
 * an agent.  One event is consumed at a time.
 */
static void *
zfs_agent_consumer_thread(void *arg)
{
	for (;;) {
		agent_event_t *event;

		(void) pthread_mutex_lock(&agent_lock);

		/* wait for an event to show up */
		while (!agent_exiting && list_is_empty(&agent_events))
			(void) pthread_cond_wait(&agent_cond, &agent_lock);

		if (agent_exiting) {
			(void) pthread_mutex_unlock(&agent_lock);
			zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: "
			    "exiting");
			return (NULL);
		}

		if ((event = list_head(&agent_events)) != NULL) {
			list_remove(&agent_events, event);

			(void) pthread_mutex_unlock(&agent_lock);

			/* dispatch to all event subscribers */
			zfs_agent_dispatch(event->ae_class, event->ae_subclass,
			    event->ae_nvl);

			nvlist_free(event->ae_nvl);
			free(event);
			continue;
		}

		(void) pthread_mutex_unlock(&agent_lock);
	}

	return (NULL);
}

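/*
 * Initialize the SLM, diagnosis, and retire modules, then create the
 * event list and start the consumer thread.
 */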
void
zfs_agent_init(libzfs_handle_t *zfs_hdl)
{
	fmd_hdl_t *hdl;

	g_zfs_hdl = zfs_hdl;

	if (zfs_slm_init() != 0)
		zed_log_die("Failed to initialize zfs slm");
	zed_log_msg(LOG_INFO, "Add Agent: init");

	hdl = fmd_module_hdl("zfs-diagnosis");
	_zfs_diagnosis_init(hdl);
	if (!fmd_module_initialized(hdl))
		zed_log_die("Failed to initialize zfs diagnosis");

	hdl = fmd_module_hdl("zfs-retire");
	_zfs_retire_init(hdl);
	if (!fmd_module_initialized(hdl))
		zed_log_die("Failed to initialize zfs retire");

	list_create(&agent_events, sizeof (agent_event_t),
	    offsetof(struct agent_event, ae_node));

	if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread,
	    NULL) != 0) {
		list_destroy(&agent_events);
		zed_log_die("Failed to initialize agents");
	}
	pthread_setname_np(g_agents_tid, "agents");
}

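/*
 * Tear down in the reverse order of zfs_agent_init(): stop the
 * consumer thread, drain and destroy the event list, then shut down
 * the retire, diagnosis, and SLM modules.
 */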
void
zfs_agent_fini(void)
{
	fmd_hdl_t *hdl;
	agent_event_t *event;

	/* take the lock so the consumer cannot miss the exit wakeup */
	(void) pthread_mutex_lock(&agent_lock);
	agent_exiting = 1;
	(void) pthread_cond_signal(&agent_cond);
	(void) pthread_mutex_unlock(&agent_lock);

	/* wait for the event consumer thread to exit */
	(void) pthread_join(g_agents_tid, NULL);

	/* drain any pending events */
	while ((event = list_head(&agent_events)) != NULL) {
		list_remove(&agent_events, event);
		nvlist_free(event->ae_nvl);
		free(event);
	}

	list_destroy(&agent_events);

	if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) {
		_zfs_retire_fini(hdl);
		fmd_hdl_unregister(hdl);
	}
	if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) {
		_zfs_diagnosis_fini(hdl);
		fmd_hdl_unregister(hdl);
	}

	zed_log_msg(LOG_INFO, "Add Agent: fini");
	zfs_slm_fini();

	g_zfs_hdl = NULL;
}
