1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26#include <ctype.h>
27#include <dirent.h>
28#include <fcntl.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32#include <sys/efi_partition.h>
33
34#ifdef HAVE_LIBUDEV
35#include <libudev.h>
36#endif
37
38#include <libzutil.h>
39
40/*
41 * Append partition suffix to an otherwise fully qualified device path.
42 * This is used to generate the name the full path as its stored in
43 * ZPOOL_CONFIG_PATH for whole disk devices.  On success the new length
44 * of 'path' will be returned on error a negative value is returned.
45 */
46int
47zfs_append_partition(char *path, size_t max_len)
48{
49	int len = strlen(path);
50
51	if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) ||
52	    (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) {
53		if (len + 6 >= max_len)
54			return (-1);
55
56		(void) strcat(path, "-part1");
57		len += 6;
58	} else {
59		if (len + 2 >= max_len)
60			return (-1);
61
62		if (isdigit(path[len-1])) {
63			(void) strcat(path, "p1");
64			len += 2;
65		} else {
66			(void) strcat(path, "1");
67			len += 1;
68		}
69	}
70
71	return (len);
72}
73
/*
 * Remove partition suffix from a vdev path.  Partition suffixes may take three
 * forms: "-partX", "pX", or "X", where X is a string of digits.  The second
 * case only occurs when the suffix is preceded by a digit, i.e. "md0p0".  The
 * third case only occurs when preceded by a string matching the regular
 * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk.
 *
 * 'path' is the bare device name (no directory component).  It is never
 * modified; the work is done on a copy.  If no suffix is recognized the copy
 * is returned unchanged.
 *
 * Returns NULL if the copy cannot be allocated.  The caller must free the
 * returned string.
 */
char *
zfs_strip_partition(const char *path)
{
	char *tmp = strdup(path);
	char *part = NULL;	/* where the suffix begins (cut point) */
	char *d = NULL;		/* where the suffix's digits begin */

	if (tmp == NULL)
		return (NULL);

	/*
	 * The <ctype.h> classifiers are only defined for values representable
	 * as unsigned char (or EOF), so every argument is cast; a plain char
	 * holding a byte >= 0x80 may otherwise be negative.
	 */
	if ((part = strstr(tmp, "-part")) != NULL && part != tmp) {
		d = part + 5;
	} else if ((part = strrchr(tmp, 'p')) != NULL &&
	    part > tmp + 1 && isdigit((unsigned char)*(part - 1))) {
		d = part + 1;
	} else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') &&
	    tmp[1] == 'd') {
		/* Skip the drive letters; 'part' trails along with 'd' */
		for (d = &tmp[2]; isalpha((unsigned char)*d); part = ++d) { }
	} else if (strncmp("xvd", tmp, 3) == 0) {
		for (d = &tmp[3]; isalpha((unsigned char)*d); part = ++d) { }
	}

	/* Only cut when at least one digit follows and nothing but digits */
	if (part != NULL && d != NULL && *d != '\0') {
		for (; isdigit((unsigned char)*d); d++) { }
		if (*d == '\0')
			*part = '\0';
	}

	return (tmp);
}
110
/*
 * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname.
 * Only the component after the last '/' is examined; a path with no '/'
 * is treated as a bare device name.
 *
 * path:	/dev/sda1
 * returns:	/dev/sda
 *
 * Returns NULL on allocation failure.  Returned string must be freed.
 */
static char *
zfs_strip_partition_path(const char *path)
{
	char *newpath = strdup(path);
	char *sd_offset;
	char *new_sd;

	if (newpath == NULL)
		return (NULL);

	/*
	 * Point to "sda1" part of "/dev/sda1".  Without a '/' the whole
	 * string is the device name (strrchr() returns NULL in that case,
	 * which must not be offset).
	 */
	sd_offset = strrchr(newpath, '/');
	sd_offset = (sd_offset != NULL) ? sd_offset + 1 : newpath;

	/* Get our new name "sda" */
	new_sd = zfs_strip_partition(sd_offset);
	if (new_sd == NULL) {
		free(newpath);
		return (NULL);
	}

	/*
	 * Paste the "sda" where "sda1" was.  The stripped name is never
	 * longer than the original, so it always fits in place.
	 */
	(void) strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1);

	/* Free temporary "sda" */
	free(new_sd);

	return (newpath);
}
147
/*
 * Drop a leading default search directory from a device path.
 *
 * 'path' is compared against each directory returned by
 * zpool_default_search_paths().  For the first directory that is a prefix
 * of 'path' and is immediately followed by '/', a pointer to the character
 * after that '/' is returned.  If no directory matches, 'path' itself is
 * returned.  The result always points into 'path'; nothing is allocated.
 */
const char *
zfs_strip_path(const char *path)
{
	size_t ndirs;
	const char *const *dirs = zpool_default_search_paths(&ndirs);
	size_t idx = 0;

	while (idx < ndirs) {
		const char *dir = dirs[idx++];
		size_t dirlen = strlen(dir);

		/* Must be a prefix ... */
		if (strncmp(path, dir, dirlen) != 0)
			continue;

		/* ... that ends exactly on a path separator */
		if (path[dirlen] != '/')
			continue;

		return (path + dirlen + 1);
	}

	return (path);
}
164
/*
 * Read the first line of a sysfs file into an allocated buffer and remove
 * its trailing newline (if any).
 *
 * This is useful for reading sysfs files that return a single string.  At
 * most sizeof (buf) - 1 bytes are read; anything after the first newline is
 * ignored.
 *
 * Returns an allocated string on success.  Returns NULL if the file cannot
 * be opened, nothing can be read from it, or the copy cannot be allocated.
 * The returned buffer must be freed by the caller.
 */
static char *
zfs_read_sysfs_file(const char *filepath)
{
	char buf[4096];	/* all sysfs files report 4k size */
	char *str = NULL;
	FILE *fp;

	fp = fopen(filepath, "r");
	if (fp == NULL)
		return (NULL);

	if (fgets(buf, sizeof (buf), fp) != NULL) {
		size_t len = strlen(buf);

		/*
		 * Remove the last newline (if any).  'len' can be zero when
		 * the data starts with a NUL byte, so check before indexing.
		 */
		if (len > 0 && buf[len - 1] == '\n')
			buf[len - 1] = '\0';

		str = strdup(buf);
	}

	(void) fclose(fp);

	return (str);
}
198
199/*
200 * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to
201 * the drive (in /sys/bus/pci/slots).
202 *
203 * For example:
204 *     dev:            "nvme0n1"
205 *     returns:        "/sys/bus/pci/slots/0"
206 *
207 * 'dev' must be an NVMe device.
208 *
209 * Returned string must be freed.  Returns NULL on error or no sysfs path.
210 */
211static char *
212zfs_get_pci_slots_sys_path(const char *dev_name)
213{
214	DIR *dp = NULL;
215	struct dirent *ep;
216	char *address1 = NULL;
217	char *address2 = NULL;
218	char *path = NULL;
219	char buf[MAXPATHLEN];
220	char *tmp;
221
222	/* If they preface 'dev' with a path (like "/dev") then strip it off */
223	tmp = strrchr(dev_name, '/');
224	if (tmp != NULL)
225		dev_name = tmp + 1;    /* +1 since we want the chr after '/' */
226
227	if (strncmp("nvme", dev_name, 4) != 0)
228		return (NULL);
229
230	(void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address",
231	    dev_name);
232
233	address1 = zfs_read_sysfs_file(buf);
234	if (!address1)
235		return (NULL);
236
237	/*
238	 * /sys/block/nvme0n1/device/address format will
239	 * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be
240	 * "0000:01:00".  Just NULL terminate at the '.' so they match.
241	 */
242	tmp = strrchr(address1, '.');
243	if (tmp != NULL)
244		*tmp = '\0';
245
246	dp = opendir("/sys/bus/pci/slots/");
247	if (dp == NULL) {
248		free(address1);
249		return (NULL);
250	}
251
252	/*
253	 * Look through all the /sys/bus/pci/slots/ subdirs
254	 */
255	while ((ep = readdir(dp))) {
256		/*
257		 * We only care about directory names that are a single number.
258		 * Sometimes there's other directories like
259		 * "/sys/bus/pci/slots/0-3/" in there - skip those.
260		 */
261		if (!zfs_isnumber(ep->d_name))
262			continue;
263
264		(void) snprintf(buf, sizeof (buf),
265		    "/sys/bus/pci/slots/%s/address", ep->d_name);
266
267		address2 = zfs_read_sysfs_file(buf);
268		if (!address2)
269			continue;
270
271		if (strcmp(address1, address2) == 0) {
272			/* Addresses match, we're all done */
273			free(address2);
274			if (asprintf(&path, "/sys/bus/pci/slots/%s",
275			    ep->d_name) == -1) {
276				continue;
277			}
278			break;
279		}
280		free(address2);
281	}
282
283	closedir(dp);
284	free(address1);
285
286	return (path);
287}
288
289/*
290 * Given a dev name like "sda", return the full enclosure sysfs path to
291 * the disk.  You can also pass in the name with "/dev" prepended
292 * to it (like /dev/sda).  This works for both JBODs and NVMe PCI devices.
293 *
294 * For example, disk "sda" in enclosure slot 1:
295 *     dev_name:       "sda"
296 *     returns:        "/sys/class/enclosure/1:0:3:0/Slot 1"
297 *
298 * Or:
299 *
300 *      dev_name:   "nvme0n1"
301 *      returns:    "/sys/bus/pci/slots/0"
302 *
303 * 'dev' must be a non-devicemapper device.
304 *
305 * Returned string must be freed.  Returns NULL on error.
306 */
307char *
308zfs_get_enclosure_sysfs_path(const char *dev_name)
309{
310	DIR *dp = NULL;
311	struct dirent *ep;
312	char buf[MAXPATHLEN];
313	char *tmp1 = NULL;
314	char *tmp2 = NULL;
315	char *tmp3 = NULL;
316	char *path = NULL;
317	size_t size;
318	int tmpsize;
319
320	if (dev_name == NULL)
321		return (NULL);
322
323	/* If they preface 'dev' with a path (like "/dev") then strip it off */
324	tmp1 = strrchr(dev_name, '/');
325	if (tmp1 != NULL)
326		dev_name = tmp1 + 1;    /* +1 since we want the chr after '/' */
327
328	tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name);
329	if (tmpsize == -1 || tmp1 == NULL) {
330		tmp1 = NULL;
331		goto end;
332	}
333
334	dp = opendir(tmp1);
335	if (dp == NULL)
336		goto end;
337
338	/*
339	 * Look though all sysfs entries in /sys/block/<dev>/device for
340	 * the enclosure symlink.
341	 */
342	while ((ep = readdir(dp))) {
343		/* Ignore everything that's not our enclosure_device link */
344		if (strstr(ep->d_name, "enclosure_device") == NULL)
345			continue;
346
347		if (tmp2 != NULL)
348			free(tmp2);
349		if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1) {
350			tmp2 = NULL;
351			break;
352		}
353
354		size = readlink(tmp2, buf, sizeof (buf));
355
356		/* Did readlink fail or crop the link name? */
357		if (size == -1 || size >= sizeof (buf))
358			break;
359
360		/*
361		 * We got a valid link.  readlink() doesn't terminate strings
362		 * so we have to do it.
363		 */
364		buf[size] = '\0';
365
366		/*
367		 * Our link will look like:
368		 *
369		 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
370		 *
371		 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part
372		 */
373		tmp3 = strstr(buf, "enclosure");
374		if (tmp3 == NULL)
375			break;
376
377		if (path != NULL)
378			free(path);
379		if (asprintf(&path, "/sys/class/%s", tmp3) == -1) {
380			/* If asprintf() fails, 'path' is undefined */
381			path = NULL;
382			break;
383		}
384	}
385
386end:
387	free(tmp2);
388	free(tmp1);
389
390	if (dp != NULL)
391		closedir(dp);
392
393	if (!path) {
394		/*
395		 * This particular disk isn't in a JBOD.  It could be an NVMe
396		 * drive. If so, look up the NVMe device's path in
397		 * /sys/bus/pci/slots/. Within that directory is a 'attention'
398		 * file which controls the NVMe fault LED.
399		 */
400		path = zfs_get_pci_slots_sys_path(dev_name);
401	}
402
403	return (path);
404}
405
406/*
407 * Allocate and return the underlying device name for a device mapper device.
408 *
409 * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a
410 * DM device (like /dev/disk/by-vdev/A0) are also allowed.
411 *
412 * If the DM device has multiple underlying devices (like with multipath
413 * DM devices), then favor underlying devices that have a symlink back to their
414 * back to their enclosure device in sysfs.  This will be useful for the
415 * zedlet scripts that toggle the fault LED.
416 *
417 * Returns an underlying device name, or NULL on error or no match.  If dm_name
418 * is not a DM device then return NULL.
419 *
420 * NOTE: The returned name string must be *freed*.
421 */
422static char *
423dm_get_underlying_path(const char *dm_name)
424{
425	DIR *dp = NULL;
426	struct dirent *ep;
427	char *realp;
428	char *tmp = NULL;
429	char *path = NULL;
430	char *dev_str;
431	char *first_path = NULL;
432	char *enclosure_path;
433
434	if (dm_name == NULL)
435		return (NULL);
436
437	/* dm name may be a symlink (like /dev/disk/by-vdev/A0) */
438	realp = realpath(dm_name, NULL);
439	if (realp == NULL)
440		return (NULL);
441
442	/*
443	 * If they preface 'dev' with a path (like "/dev") then strip it off.
444	 * We just want the 'dm-N' part.
445	 */
446	tmp = strrchr(realp, '/');
447	if (tmp != NULL)
448		dev_str = tmp + 1;    /* +1 since we want the chr after '/' */
449	else
450		dev_str = tmp;
451
452	if (asprintf(&tmp, "/sys/block/%s/slaves/", dev_str) == -1) {
453		tmp = NULL;
454		goto end;
455	}
456
457	dp = opendir(tmp);
458	if (dp == NULL)
459		goto end;
460
461	/*
462	 * A device-mapper device can have multiple paths to it (multipath).
463	 * Favor paths that have a symlink back to their enclosure device.
464	 * We have to do this since some enclosures may only provide a symlink
465	 * back for one underlying path to a disk and not the other.
466	 *
467	 * If no paths have links back to their enclosure, then just return the
468	 * first path.
469	 */
470	while ((ep = readdir(dp))) {
471		if (ep->d_type != DT_DIR) {	/* skip "." and ".." dirs */
472			if (!first_path)
473				first_path = strdup(ep->d_name);
474
475			enclosure_path =
476			    zfs_get_enclosure_sysfs_path(ep->d_name);
477
478			if (!enclosure_path)
479				continue;
480
481			if (asprintf(&path, "/dev/%s", ep->d_name) == -1)
482				path = NULL;
483			free(enclosure_path);
484			break;
485		}
486	}
487
488end:
489	if (dp != NULL)
490		closedir(dp);
491	free(tmp);
492	free(realp);
493
494	if (!path && first_path) {
495		/*
496		 * None of the underlying paths had a link back to their
497		 * enclosure devices.  Throw up out hands and return the first
498		 * underlying path.
499		 */
500		if (asprintf(&path, "/dev/%s", first_path) == -1)
501			path = NULL;
502	}
503
504	free(first_path);
505	return (path);
506}
507
508/*
509 * Return B_TRUE if device is a device mapper or multipath device.
510 * Return B_FALSE if not.
511 */
512boolean_t
513zfs_dev_is_dm(const char *dev_name)
514{
515
516	char *tmp;
517	tmp = dm_get_underlying_path(dev_name);
518	if (tmp == NULL)
519		return (B_FALSE);
520
521	free(tmp);
522	return (B_TRUE);
523}
524
525/*
526 * By "whole disk" we mean an entire physical disk (something we can
527 * label, toggle the write cache on, etc.) as opposed to the full
528 * capacity of a pseudo-device such as lofi or did.  We act as if we
529 * are labeling the disk, which should be a pretty good test of whether
530 * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
531 * it isn't.
532 */
533boolean_t
534zfs_dev_is_whole_disk(const char *dev_name)
535{
536	struct dk_gpt *label = NULL;
537	int fd;
538
539	if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0)
540		return (B_FALSE);
541
542	if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
543		(void) close(fd);
544		return (B_FALSE);
545	}
546
547	efi_free(label);
548	(void) close(fd);
549
550	return (B_TRUE);
551}
552
/*
 * Resolve a device name to its underlying whole device.
 *
 * The name handed in is often a symlink to a device, a partition device,
 * or a multipath device.  This function resolves it in two steps:
 *
 *   - If dm_get_underlying_path() finds an underlying device (the name
 *     refers to a DM device), that device is used.  Otherwise the name is
 *     resolved with realpath().
 *   - Any partition suffix is then removed from the last path component.
 *
 * A name that is already the underlying device comes back unchanged (as a
 * fresh copy).
 *
 * For example:
 *
 * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda
 * dev_name:	/dev/disk/by-id/ata-QEMU_HARDDISK_QM00001
 * returns:	/dev/sda
 *
 * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb)
 * dev_name:	/dev/mapper/mpatha
 * returns:	/dev/sda (as picked by dm_get_underlying_path())
 *
 * 3. /dev/sda (already the underlying device)
 * dev_name:	/dev/sda
 * returns:	/dev/sda
 *
 * 4. /dev/dm-3 (mapped to /dev/sda)
 * dev_name:	/dev/dm-3
 * returns:	/dev/sda
 *
 * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9
 * dev_name:	/dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9
 * returns:	/dev/sdb
 *
 * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../../sda2
 * dev_name:	/dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a
 * returns:	/dev/sda
 *
 * Returns the underlying device name, or NULL on error or no match.
 *
 * NOTE: The returned name string must be *freed*.
 */
char *
zfs_get_underlying_path(const char *dev_name)
{
	char *resolved;
	char *stripped;

	if (dev_name == NULL)
		return (NULL);

	resolved = dm_get_underlying_path(dev_name);
	if (resolved == NULL) {
		/* Not a DM device, so just un-symlinkize it */
		resolved = realpath(dev_name, NULL);
		if (resolved == NULL)
			return (NULL);
	}

	stripped = zfs_strip_partition_path(resolved);
	free(resolved);

	return (stripped);
}
615
616
617#ifdef HAVE_LIBUDEV
618
619/*
620 * A disk is considered a multipath whole disk when:
621 *	DEVNAME key value has "dm-"
622 *	DM_UUID key exists and starts with 'mpath-'
623 *	ID_PART_TABLE_TYPE key does not exist or is not gpt
624 *	ID_FS_LABEL key does not exist (disk isn't labeled)
625 */
626static boolean_t
627is_mpath_udev_sane(struct udev_device *dev)
628{
629	const char *devname, *type, *uuid, *label;
630
631	devname = udev_device_get_property_value(dev, "DEVNAME");
632	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
633	uuid = udev_device_get_property_value(dev, "DM_UUID");
634	label = udev_device_get_property_value(dev, "ID_FS_LABEL");
635
636	if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
637	    ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
638	    ((uuid != NULL) && (strncmp(uuid, "mpath-", 6) == 0)) &&
639	    (label == NULL)) {
640		return (B_TRUE);
641	}
642
643	return (B_FALSE);
644}
645
646/*
647 * Check if a disk is a multipath "blank" disk:
648 *
649 * 1. The disk has udev values that suggest it's a multipath disk
650 * 2. The disk is not currently labeled with a filesystem of any type
651 * 3. There are no partitions on the disk
652 */
653boolean_t
654is_mpath_whole_disk(const char *path)
655{
656	struct udev *udev;
657	struct udev_device *dev = NULL;
658	char nodepath[MAXPATHLEN];
659	char *sysname;
660
661	if (realpath(path, nodepath) == NULL)
662		return (B_FALSE);
663	sysname = strrchr(nodepath, '/') + 1;
664	if (strncmp(sysname, "dm-", 3) != 0)
665		return (B_FALSE);
666	if ((udev = udev_new()) == NULL)
667		return (B_FALSE);
668	if ((dev = udev_device_new_from_subsystem_sysname(udev, "block",
669	    sysname)) == NULL) {
670		udev_device_unref(dev);
671		return (B_FALSE);
672	}
673
674	/* Sanity check some udev values */
675	boolean_t is_sane = is_mpath_udev_sane(dev);
676	udev_device_unref(dev);
677
678	return (is_sane);
679}
680
681#else /* HAVE_LIBUDEV */
682
683boolean_t
684is_mpath_whole_disk(const char *path)
685{
686	(void) path;
687	return (B_FALSE);
688}
689
690#endif /* HAVE_LIBUDEV */
691