1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
6 * You can obtain a copy of the license from the top-level file
7 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
8 * You may not use this file except in compliance with the license.
9 *
10 * CDDL HEADER END
11 */
12
13/*
14 * Copyright (c) 2016, 2017, Intel Corporation.
15 */
16
17#ifdef HAVE_LIBUDEV
18
19#include <errno.h>
20#include <fcntl.h>
21#include <libnvpair.h>
22#include <libudev.h>
23#include <libzfs.h>
24#include <libzutil.h>
25#include <pthread.h>
26#include <stdlib.h>
27#include <string.h>
28
29#include <sys/sysevent/eventdefs.h>
30#include <sys/sysevent/dev.h>
31
32#include "zed_log.h"
33#include "zed_disk_event.h"
34#include "agents/zfs_agents.h"
35
36/*
37 * Portions of ZED need to see disk events for disks belonging to ZFS pools.
38 * A libudev monitor is established to monitor block device actions and pass
39 * them on to internal ZED logic modules.  Initially, zfs_mod.c is the only
40 * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM
41 * module responsible for handling disk events for ZFS.
42 */
43
/* Thread running zed_udev_monitor(); created in init, cancelled in fini */
pthread_t g_mon_tid;
/* libudev context obtained from udev_new() in zed_disk_event_init() */
struct udev *g_udev;
/* netlink monitor filtered to block "disk" and "partition" devices */
struct udev_monitor *g_mon;


/* NOTE(review): not referenced in the visible part of this file */
#define	DEV_BYID_PATH	"/dev/disk/by-id/"

/*
 * 64MB is minimum usable disk for ZFS
 *
 * zed_udev_monitor() skips any device whose reported sector count is
 * below this value (131072 sectors of 512 bytes = 64MB).
 */
#define	MINIMUM_SECTORS		131072ULL
53
54
55/*
56 * Post disk event to SLM module
57 *
58 * occurs in the context of monitor thread
59 */
60static void
61zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)
62{
63	const char *strval;
64	uint64_t numval;
65
66	zed_log_msg(LOG_INFO, "zed_disk_event:");
67	zed_log_msg(LOG_INFO, "\tclass: %s", class);
68	zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass);
69	if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0)
70		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval);
71	if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0)
72		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval);
73	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0)
74		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval);
75	if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE)
76		zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART);
77	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0)
78		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval);
79	if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0)
80		zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval);
81	if (nvlist_lookup_uint64(nvl, DEV_PARENT_SIZE, &numval) == 0)
82		zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_PARENT_SIZE, numval);
83	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0)
84		zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval);
85	if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0)
86		zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval);
87
88	(void) zfs_agent_post_event(class, subclass, nvl);
89}
90
91/*
92 * dev_event_nvlist: place event schema into an nv pair list
93 *
94 * NAME			VALUE (example)
95 * --------------	--------------------------------------------------------
96 * DEV_NAME		/dev/sdl
97 * DEV_PATH		/devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...
98 * DEV_IDENTIFIER	ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC
99 * DEV_PHYS_PATH	pci-0000:04:00.0-sas-0x4433221101000000-lun-0
100 * DEV_IS_PART		---
101 * DEV_SIZE		500107862016
102 * ZFS_EV_POOL_GUID	17523635698032189180
103 * ZFS_EV_VDEV_GUID	14663607734290803088
104 */
105static nvlist_t *
106dev_event_nvlist(struct udev_device *dev)
107{
108	nvlist_t *nvl;
109	char strval[128];
110	const char *value, *path;
111	uint64_t guid;
112
113	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
114		return (NULL);
115
116	if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0)
117		(void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval);
118	if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0)
119		(void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval);
120	if ((path = udev_device_get_devnode(dev)) != NULL)
121		(void) nvlist_add_string(nvl, DEV_NAME, path);
122	if ((value = udev_device_get_devpath(dev)) != NULL)
123		(void) nvlist_add_string(nvl, DEV_PATH, value);
124	value = udev_device_get_devtype(dev);
125	if ((value != NULL && strcmp("partition", value) == 0) ||
126	    (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER")
127	    != NULL)) {
128		(void) nvlist_add_boolean(nvl, DEV_IS_PART);
129	}
130	if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) {
131		uint64_t numval = DEV_BSIZE;
132
133		numval *= strtoull(value, NULL, 10);
134		(void) nvlist_add_uint64(nvl, DEV_SIZE, numval);
135
136		/*
137		 * If the device has a parent, then get the parent block
138		 * device's size as well.  For example, /dev/sda1's parent
139		 * is /dev/sda.
140		 */
141		struct udev_device *parent_dev = udev_device_get_parent(dev);
142		if ((value = udev_device_get_sysattr_value(parent_dev, "size"))
143		    != NULL) {
144			uint64_t numval = DEV_BSIZE;
145
146			numval *= strtoull(value, NULL, 10);
147			(void) nvlist_add_uint64(nvl, DEV_PARENT_SIZE, numval);
148		}
149	}
150
151	/*
152	 * Grab the pool and vdev guids from blkid cache
153	 */
154	value = udev_device_get_property_value(dev, "ID_FS_UUID");
155	if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
156		(void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid);
157
158	value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB");
159	if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
160		(void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid);
161
162	/*
163	 * Either a vdev guid or a devid must be present for matching
164	 */
165	if (!nvlist_exists(nvl, DEV_IDENTIFIER) &&
166	    !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) {
167		nvlist_free(nvl);
168		return (NULL);
169	}
170
171	return (nvl);
172}
173
174/*
175 *  Listen for block device uevents
176 */
177static void *
178zed_udev_monitor(void *arg)
179{
180	struct udev_monitor *mon = arg;
181	const char *tmp;
182	char *tmp2;
183
184	zed_log_msg(LOG_INFO, "Waiting for new udev disk events...");
185
186	while (1) {
187		struct udev_device *dev;
188		const char *action, *type, *part, *sectors;
189		const char *bus, *uuid, *devpath;
190		const char *class, *subclass;
191		nvlist_t *nvl;
192		boolean_t is_zfs = B_FALSE;
193
194		/* allow a cancellation while blocked (recvmsg) */
195		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
196
197		/* blocks at recvmsg until an event occurs */
198		if ((dev = udev_monitor_receive_device(mon)) == NULL) {
199			zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive "
200			    "device error %d", errno);
201			continue;
202		}
203
204		/* allow all steps to complete before a cancellation */
205		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
206
207		/*
208		 * Strongly typed device is the preferred filter
209		 */
210		type = udev_device_get_property_value(dev, "ID_FS_TYPE");
211		if (type != NULL && type[0] != '\0') {
212			if (strcmp(type, "zfs_member") == 0) {
213				is_zfs = B_TRUE;
214			} else {
215				/* not ours, so skip */
216				zed_log_msg(LOG_INFO, "zed_udev_monitor: skip "
217				    "%s (in use by %s)",
218				    udev_device_get_devnode(dev), type);
219				udev_device_unref(dev);
220				continue;
221			}
222		}
223
224		/*
225		 * if this is a disk and it is partitioned, then the
226		 * zfs label will reside in a DEVTYPE=partition and
227		 * we can skip passing this event
228		 *
229		 * Special case: Blank disks are sometimes reported with
230		 * an erroneous 'atari' partition, and should not be
231		 * excluded from being used as an autoreplace disk:
232		 *
233		 * https://github.com/openzfs/zfs/issues/13497
234		 */
235		type = udev_device_get_property_value(dev, "DEVTYPE");
236		part = udev_device_get_property_value(dev,
237		    "ID_PART_TABLE_TYPE");
238		if (type != NULL && type[0] != '\0' &&
239		    strcmp(type, "disk") == 0 &&
240		    part != NULL && part[0] != '\0') {
241			const char *devname =
242			    udev_device_get_property_value(dev, "DEVNAME");
243
244			if (strcmp(part, "atari") == 0) {
245				zed_log_msg(LOG_INFO,
246				    "%s: %s is reporting an atari partition, "
247				    "but we're going to assume it's a false "
248				    "positive and still use it (issue #13497)",
249				    __func__, devname);
250			} else {
251				zed_log_msg(LOG_INFO,
252				    "%s: skip %s since it has a %s partition "
253				    "already", __func__, devname, part);
254				/* skip and wait for partition event */
255				udev_device_unref(dev);
256				continue;
257			}
258		}
259
260		/*
261		 * ignore small partitions
262		 */
263		sectors = udev_device_get_property_value(dev,
264		    "ID_PART_ENTRY_SIZE");
265		if (sectors == NULL)
266			sectors = udev_device_get_sysattr_value(dev, "size");
267		if (sectors != NULL &&
268		    strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
269			zed_log_msg(LOG_INFO,
270			    "%s: %s sectors %s < %llu (minimum)",
271			    __func__,
272			    udev_device_get_property_value(dev, "DEVNAME"),
273			    sectors, MINIMUM_SECTORS);
274			udev_device_unref(dev);
275			continue;
276		}
277
278		/*
279		 * If the blkid probe didn't find ZFS, then a persistent
280		 * device id string is required in the message schema
281		 * for matching with vdevs. Preflight here for expected
282		 * udev information.
283		 *
284		 * Special case:
285		 * NVMe devices don't have ID_BUS set (at least on RHEL 7-8),
286		 * but they are valid for autoreplace.  Add a special case for
287		 * them by searching for "/nvme/" in the udev DEVPATH:
288		 *
289		 * DEVPATH=/devices/pci0000:00/0000:00:1e.0/nvme/nvme2/nvme2n1
290		 */
291		bus = udev_device_get_property_value(dev, "ID_BUS");
292		uuid = udev_device_get_property_value(dev, "DM_UUID");
293		devpath = udev_device_get_devpath(dev);
294		if (!is_zfs && (bus == NULL && uuid == NULL &&
295		    strstr(devpath, "/nvme/") == NULL)) {
296			zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
297			    "source", udev_device_get_devnode(dev));
298			udev_device_unref(dev);
299			continue;
300		}
301
302		action = udev_device_get_action(dev);
303		if (strcmp(action, "add") == 0) {
304			class = EC_DEV_ADD;
305			subclass = ESC_DISK;
306		} else if (strcmp(action, "remove") == 0) {
307			class = EC_DEV_REMOVE;
308			subclass = ESC_DISK;
309		} else if (strcmp(action, "change") == 0) {
310			class = EC_DEV_STATUS;
311			subclass = ESC_DEV_DLE;
312		} else {
313			zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown",
314			    action);
315			udev_device_unref(dev);
316			continue;
317		}
318
319		/*
320		 * Special case an EC_DEV_ADD for multipath devices
321		 *
322		 * When a multipath device is created, udev reports the
323		 * following:
324		 *
325		 * 1.	"add" event of the dm device for the multipath device
326		 *	(like /dev/dm-3).
327		 * 2.	"change" event to create the actual multipath device
328		 *	symlink (like /dev/mapper/mpatha).  The event also
329		 *	passes back the relevant DM vars we care about, like
330		 *	DM_UUID.
331		 * 3.	Another "change" event identical to #2 (that we ignore).
332		 *
333		 * To get the behavior we want, we treat the "change" event
334		 * in #2 as a "add" event; as if "/dev/mapper/mpatha" was
335		 * a new disk being added.
336		 */
337		if (strcmp(class, EC_DEV_STATUS) == 0 &&
338		    udev_device_get_property_value(dev, "DM_UUID") &&
339		    udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
340			tmp = udev_device_get_devnode(dev);
341			tmp2 = zfs_get_underlying_path(tmp);
342			if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
343				/*
344				 * We have a real underlying device, which
345				 * means that this multipath "change" event is
346				 * an "add" event.
347				 *
348				 * If the multipath device and the underlying
349				 * dev are the same name (i.e. /dev/dm-5), then
350				 * there is no real underlying disk for this
351				 * multipath device, and so this "change" event
352				 * really is a multipath removal.
353				 */
354				class = EC_DEV_ADD;
355				subclass = ESC_DISK;
356			} else {
357				tmp = udev_device_get_property_value(dev,
358				    "DM_NR_VALID_PATHS");
359				/* treat as a multipath remove */
360				if (tmp != NULL && strcmp(tmp, "0") == 0) {
361					class = EC_DEV_REMOVE;
362					subclass = ESC_DISK;
363				}
364			}
365			free(tmp2);
366		}
367
368		/*
369		 * Special case an EC_DEV_ADD for scsi_debug devices
370		 *
371		 * These devices require a udevadm trigger command after
372		 * creation in order to register the vdev_id scsidebug alias
373		 * rule (adds a persistent path (phys_path) used for fault
374		 * management automated tests in the ZFS test suite.
375		 *
376		 * After udevadm trigger command, event registers as a "change"
377		 * event but needs to instead be handled as another "add" event
378		 * to allow for disk labeling and partitioning to occur.
379		 */
380		if (strcmp(class, EC_DEV_STATUS) == 0 &&
381		    udev_device_get_property_value(dev, "ID_VDEV") &&
382		    udev_device_get_property_value(dev, "ID_MODEL")) {
383			const char *id_model, *id_model_sd = "scsi_debug";
384
385			id_model = udev_device_get_property_value(dev,
386			    "ID_MODEL");
387			if (strcmp(id_model, id_model_sd) == 0) {
388				class = EC_DEV_ADD;
389				subclass = ESC_DISK;
390			}
391		}
392
393		if ((nvl = dev_event_nvlist(dev)) != NULL) {
394			zed_udev_event(class, subclass, nvl);
395			nvlist_free(nvl);
396		}
397
398		udev_device_unref(dev);
399	}
400
401	return (NULL);
402}
403
404int
405zed_disk_event_init(void)
406{
407	int fd, fflags;
408
409	if ((g_udev = udev_new()) == NULL) {
410		zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno);
411		return (-1);
412	}
413
414	/* Set up a udev monitor for block devices */
415	g_mon = udev_monitor_new_from_netlink(g_udev, "udev");
416	udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk");
417	udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block",
418	    "partition");
419	udev_monitor_enable_receiving(g_mon);
420
421	/* Make sure monitoring socket is blocking */
422	fd = udev_monitor_get_fd(g_mon);
423	if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK)
424		(void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK);
425
426	/* spawn a thread to monitor events */
427	if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) {
428		udev_monitor_unref(g_mon);
429		udev_unref(g_udev);
430		zed_log_msg(LOG_WARNING, "pthread_create failed");
431		return (-1);
432	}
433
434	pthread_setname_np(g_mon_tid, "udev monitor");
435	zed_log_msg(LOG_INFO, "zed_disk_event_init");
436
437	return (0);
438}
439
440void
441zed_disk_event_fini(void)
442{
443	/* cancel monitor thread at recvmsg() */
444	(void) pthread_cancel(g_mon_tid);
445	(void) pthread_join(g_mon_tid, NULL);
446
447	/* cleanup udev resources */
448	udev_monitor_unref(g_mon);
449	udev_unref(g_udev);
450
451	zed_log_msg(LOG_INFO, "zed_disk_event_fini");
452}
453
454#else
455
456#include "zed_disk_event.h"
457
/*
 * Built without libudev: there is no disk event monitor to start, so
 * initialization always reports success.
 */
int
zed_disk_event_init(void)
{
	return (0);
}
463
/*
 * Built without libudev: nothing was started, so there is nothing to
 * tear down.
 */
void
zed_disk_event_fini(void)
{
}
468
469#endif /* HAVE_LIBUDEV */
470