1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
25 * Copyright (c) 2016, 2017 Intel Corporation.
26 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
27 */
28
29/*
30 * Functions to convert between a list of vdevs and an nvlist representing the
31 * configuration.  Each entry in the list can be one of:
32 *
33 * 	Device vdevs
34 * 		disk=(path=..., devid=...)
35 * 		file=(path=...)
36 *
37 * 	Group vdevs
38 * 		raidz[1|2]=(...)
39 * 		mirror=(...)
40 *
41 * 	Hot spares
42 *
43 * While the underlying implementation supports it, group vdevs cannot contain
44 * other group vdevs.  All userland verification of devices is contained within
45 * this file.  If successful, the nvlist returned can be passed directly to the
46 * kernel; we've done as much verification as possible in userland.
47 *
48 * Hot spares are a special case, and passed down as an array of disk vdevs, at
49 * the same level as the root of the vdev tree.
50 *
51 * The only function exported by this file is 'make_root_vdev'.  The
52 * function performs several passes:
53 *
54 * 	1. Construct the vdev specification.  Performs syntax validation and
55 *         makes sure each device is valid.
 * 	2. Check for devices in use.  Uses libblkid to make sure that none of
 *         the devices are already in use.  Some conflicts can be overridden
 *         using the 'force' flag, others cannot.
 * 	3. Check for replication errors if the 'force' flag is not specified.
 *         Validates that the replication level is consistent across the
 *         entire pool.
62 * 	4. Call libzfs to label any whole disks with an EFI label.
63 */
64
65#include <assert.h>
66#include <ctype.h>
67#include <errno.h>
68#include <fcntl.h>
69#include <libintl.h>
70#include <libnvpair.h>
71#include <libzutil.h>
72#include <limits.h>
73#include <sys/spa.h>
74#include <stdio.h>
75#include <string.h>
76#include <unistd.h>
77#include "zpool_util.h"
78#include <sys/zfs_context.h>
79
80#include <scsi/scsi.h>
81#include <scsi/sg.h>
82#include <sys/efi_partition.h>
83#include <sys/stat.h>
84#include <sys/vtoc.h>
85#include <sys/mntent.h>
86#include <uuid/uuid.h>
87#include <blkid/blkid.h>
88
/*
 * One entry in the table of block devices with a known sector size.
 *
 * 'id' is exactly 24 bytes and is not NUL terminated when initialized from
 * a 24 character literal; it is only ever compared with memcmp().
 * 'sector_size' is the value check_sector_size_database() hands back for a
 * matching device.
 */
typedef struct vdev_disk_db_entry
{
	char id[24];
	int sector_size;
} vdev_disk_db_entry_t;

/*
 * Database of block devices that lie about physical sector sizes.  The
 * identification string must be precisely 24 characters to avoid false
 * negatives: it is compared byte-for-byte against the 24 bytes starting at
 * offset 8 of the SCSI INQUIRY reply (the space-padded vendor and product
 * identification fields), so a shorter string would be zero-filled and
 * could never match.
 *
 * The table is read-only lookup data and is therefore declared const.
 */
static const vdev_disk_db_entry_t vdev_disk_database[] = {
	{"ATA     ADATA SSD S396 3", 8192},
	{"ATA     APPLE SSD SM128E", 8192},
	{"ATA     APPLE SSD SM256E", 8192},
	{"ATA     APPLE SSD SM512E", 8192},
	{"ATA     APPLE SSD SM768E", 8192},
	{"ATA     C400-MTFDDAC064M", 8192},
	{"ATA     C400-MTFDDAC128M", 8192},
	{"ATA     C400-MTFDDAC256M", 8192},
	{"ATA     C400-MTFDDAC512M", 8192},
	{"ATA     Corsair Force 3 ", 8192},
	{"ATA     Corsair Force GS", 8192},
	{"ATA     INTEL SSDSA2CT04", 8192},
	{"ATA     INTEL SSDSA2BZ10", 8192},
	{"ATA     INTEL SSDSA2BZ20", 8192},
	{"ATA     INTEL SSDSA2BZ30", 8192},
	{"ATA     INTEL SSDSA2CW04", 8192},
	{"ATA     INTEL SSDSA2CW08", 8192},
	{"ATA     INTEL SSDSA2CW12", 8192},
	{"ATA     INTEL SSDSA2CW16", 8192},
	{"ATA     INTEL SSDSA2CW30", 8192},
	{"ATA     INTEL SSDSA2CW60", 8192},
	{"ATA     INTEL SSDSC2CT06", 8192},
	{"ATA     INTEL SSDSC2CT12", 8192},
	{"ATA     INTEL SSDSC2CT18", 8192},
	{"ATA     INTEL SSDSC2CT24", 8192},
	{"ATA     INTEL SSDSC2CW06", 8192},
	{"ATA     INTEL SSDSC2CW12", 8192},
	{"ATA     INTEL SSDSC2CW18", 8192},
	{"ATA     INTEL SSDSC2CW24", 8192},
	{"ATA     INTEL SSDSC2CW48", 8192},
	{"ATA     KINGSTON SH100S3", 8192},
	{"ATA     KINGSTON SH103S3", 8192},
	{"ATA     M4-CT064M4SSD2  ", 8192},
	{"ATA     M4-CT128M4SSD2  ", 8192},
	{"ATA     M4-CT256M4SSD2  ", 8192},
	{"ATA     M4-CT512M4SSD2  ", 8192},
	{"ATA     OCZ-AGILITY2    ", 8192},
	{"ATA     OCZ-AGILITY3    ", 8192},
	{"ATA     OCZ-VERTEX2 3.5 ", 8192},
	{"ATA     OCZ-VERTEX3     ", 8192},
	{"ATA     OCZ-VERTEX3 LT  ", 8192},
	{"ATA     OCZ-VERTEX3 MI  ", 8192},
	{"ATA     OCZ-VERTEX4     ", 8192},
	{"ATA     SAMSUNG MZ7WD120", 8192},
	{"ATA     SAMSUNG MZ7WD240", 8192},
	{"ATA     SAMSUNG MZ7WD480", 8192},
	{"ATA     SAMSUNG MZ7WD960", 8192},
	{"ATA     SAMSUNG SSD 830 ", 8192},
	{"ATA     Samsung SSD 840 ", 8192},
	{"ATA     SanDisk SSD U100", 8192},
	{"ATA     TOSHIBA THNSNH06", 8192},
	{"ATA     TOSHIBA THNSNH12", 8192},
	{"ATA     TOSHIBA THNSNH25", 8192},
	{"ATA     TOSHIBA THNSNH51", 8192},
	{"ATA     APPLE SSD TS064C", 4096},
	{"ATA     APPLE SSD TS128C", 4096},
	{"ATA     APPLE SSD TS256C", 4096},
	{"ATA     APPLE SSD TS512C", 4096},
	{"ATA     INTEL SSDSA2M040", 4096},
	{"ATA     INTEL SSDSA2M080", 4096},
	{"ATA     INTEL SSDSA2M160", 4096},
	{"ATA     INTEL SSDSC2MH12", 4096},
	{"ATA     INTEL SSDSC2MH25", 4096},
	{"ATA     OCZ CORE_SSD    ", 4096},
	{"ATA     OCZ-VERTEX      ", 4096},
	{"ATA     SAMSUNG MCCOE32G", 4096},
	{"ATA     SAMSUNG MCCOE64G", 4096},
	{"ATA     SAMSUNG SSD PM80", 4096},
	/* Flash drives optimized for 4KB IOs on larger pages */
	{"ATA     INTEL SSDSC2BA10", 4096},
	{"ATA     INTEL SSDSC2BA20", 4096},
	{"ATA     INTEL SSDSC2BA40", 4096},
	{"ATA     INTEL SSDSC2BA80", 4096},
	{"ATA     INTEL SSDSC2BB08", 4096},
	{"ATA     INTEL SSDSC2BB12", 4096},
	{"ATA     INTEL SSDSC2BB16", 4096},
	{"ATA     INTEL SSDSC2BB24", 4096},
	{"ATA     INTEL SSDSC2BB30", 4096},
	{"ATA     INTEL SSDSC2BB40", 4096},
	{"ATA     INTEL SSDSC2BB48", 4096},
	{"ATA     INTEL SSDSC2BB60", 4096},
	{"ATA     INTEL SSDSC2BB80", 4096},
	{"ATA     INTEL SSDSC2BW24", 4096},
	{"ATA     INTEL SSDSC2BW48", 4096},
	{"ATA     INTEL SSDSC2BP24", 4096},
	{"ATA     INTEL SSDSC2BP48", 4096},
	{"NA      SmrtStorSDLKAE9W", 4096},
	{"NVMe    Amazon EC2 NVMe ", 4096},
	/* Imported from Open Solaris */
	{"ATA     MARVELL SD88SA02", 4096},
	/* Advanced format Hard drives */
	{"ATA     Hitachi HDS5C303", 4096},
	{"ATA     SAMSUNG HD204UI ", 4096},
	{"ATA     ST2000DL004 HD20", 4096},
	{"ATA     WDC WD10EARS-00M", 4096},
	{"ATA     WDC WD10EARS-00S", 4096},
	{"ATA     WDC WD10EARS-00Z", 4096},
	{"ATA     WDC WD15EARS-00M", 4096},
	{"ATA     WDC WD15EARS-00S", 4096},
	{"ATA     WDC WD15EARS-00Z", 4096},
	{"ATA     WDC WD20EARS-00M", 4096},
	{"ATA     WDC WD20EARS-00S", 4096},
	{"ATA     WDC WD20EARS-00Z", 4096},
	{"ATA     WDC WD1600BEVT-0", 4096},
	{"ATA     WDC WD2500BEVT-0", 4096},
	{"ATA     WDC WD3200BEVT-0", 4096},
	{"ATA     WDC WD5000BEVT-0", 4096},
};
209
210
211#define	INQ_REPLY_LEN	96
212#define	INQ_CMD_LEN	6
213
214static const int vdev_disk_database_size =
215	sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]);
216
217boolean_t
218check_sector_size_database(char *path, int *sector_size)
219{
220	unsigned char inq_buff[INQ_REPLY_LEN];
221	unsigned char sense_buffer[32];
222	unsigned char inq_cmd_blk[INQ_CMD_LEN] =
223	    {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0};
224	sg_io_hdr_t io_hdr;
225	int error;
226	int fd;
227	int i;
228
229	/* Prepare INQUIRY command */
230	memset(&io_hdr, 0, sizeof (sg_io_hdr_t));
231	io_hdr.interface_id = 'S';
232	io_hdr.cmd_len = sizeof (inq_cmd_blk);
233	io_hdr.mx_sb_len = sizeof (sense_buffer);
234	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
235	io_hdr.dxfer_len = INQ_REPLY_LEN;
236	io_hdr.dxferp = inq_buff;
237	io_hdr.cmdp = inq_cmd_blk;
238	io_hdr.sbp = sense_buffer;
239	io_hdr.timeout = 10;		/* 10 milliseconds is ample time */
240
241	if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
242		return (B_FALSE);
243
244	error = ioctl(fd, SG_IO, (unsigned long) &io_hdr);
245
246	(void) close(fd);
247
248	if (error < 0)
249		return (B_FALSE);
250
251	if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
252		return (B_FALSE);
253
254	for (i = 0; i < vdev_disk_database_size; i++) {
255		if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24))
256			continue;
257
258		*sector_size = vdev_disk_database[i].sector_size;
259		return (B_TRUE);
260	}
261
262	return (B_FALSE);
263}
264
265static int
266check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
267{
268	int err;
269	char *value;
270
271	/* No valid type detected device is safe to use */
272	value = blkid_get_tag_value(cache, "TYPE", path);
273	if (value == NULL)
274		return (0);
275
276	/*
277	 * If libblkid detects a ZFS device, we check the device
278	 * using check_file() to see if it's safe.  The one safe
279	 * case is a spare device shared between multiple pools.
280	 */
281	if (strcmp(value, "zfs_member") == 0) {
282		err = check_file(path, force, isspare);
283	} else {
284		if (force) {
285			err = 0;
286		} else {
287			err = -1;
288			vdev_error(gettext("%s contains a filesystem of "
289			    "type '%s'\n"), path, value);
290		}
291	}
292
293	free(value);
294
295	return (err);
296}
297
298/*
299 * Validate that a disk including all partitions are safe to use.
300 *
301 * For EFI labeled disks this can done relatively easily with the libefi
302 * library.  The partition numbers are extracted from the label and used
303 * to generate the expected /dev/ paths.  Each partition can then be
304 * checked for conflicts.
305 *
306 * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible
307 * but due to the lack of a readily available libraries this scanning is
308 * not implemented.  Instead only the device path as given is checked.
309 */
310static int
311check_disk(const char *path, blkid_cache cache, int force,
312    boolean_t isspare, boolean_t iswholedisk)
313{
314	struct dk_gpt *vtoc;
315	char slice_path[MAXPATHLEN];
316	int err = 0;
317	int fd, i;
318	int flags = O_RDONLY|O_DIRECT;
319
320	if (!iswholedisk)
321		return (check_slice(path, cache, force, isspare));
322
323	/* only spares can be shared, other devices require exclusive access */
324	if (!isspare)
325		flags |= O_EXCL;
326
327	if ((fd = open(path, flags)) < 0) {
328		char *value = blkid_get_tag_value(cache, "TYPE", path);
329		(void) fprintf(stderr, gettext("%s is in use and contains "
330		    "a %s filesystem.\n"), path, value ? value : "unknown");
331		free(value);
332		return (-1);
333	}
334
335	/*
336	 * Expected to fail for non-EFI labeled disks.  Just check the device
337	 * as given and do not attempt to detect and scan partitions.
338	 */
339	err = efi_alloc_and_read(fd, &vtoc);
340	if (err) {
341		(void) close(fd);
342		return (check_slice(path, cache, force, isspare));
343	}
344
345	/*
346	 * The primary efi partition label is damaged however the secondary
347	 * label at the end of the device is intact.  Rather than use this
348	 * label we should play it safe and treat this as a non efi device.
349	 */
350	if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
351		efi_free(vtoc);
352		(void) close(fd);
353
354		if (force) {
355			/* Partitions will now be created using the backup */
356			return (0);
357		} else {
358			vdev_error(gettext("%s contains a corrupt primary "
359			    "EFI label.\n"), path);
360			return (-1);
361		}
362	}
363
364	for (i = 0; i < vtoc->efi_nparts; i++) {
365
366		if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
367		    uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
368			continue;
369
370		if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
371			(void) snprintf(slice_path, sizeof (slice_path),
372			    "%s%s%d", path, "-part", i+1);
373		else
374			(void) snprintf(slice_path, sizeof (slice_path),
375			    "%s%s%d", path, isdigit(path[strlen(path)-1]) ?
376			    "p" : "", i+1);
377
378		err = check_slice(slice_path, cache, force, isspare);
379		if (err)
380			break;
381	}
382
383	efi_free(vtoc);
384	(void) close(fd);
385
386	return (err);
387}
388
389int
390check_device(const char *path, boolean_t force,
391    boolean_t isspare, boolean_t iswholedisk)
392{
393	blkid_cache cache;
394	int error;
395
396	error = blkid_get_cache(&cache, NULL);
397	if (error != 0) {
398		(void) fprintf(stderr, gettext("unable to access the blkid "
399		    "cache.\n"));
400		return (-1);
401	}
402
403	error = check_disk(path, cache, force, isspare, iswholedisk);
404	blkid_put_cache(cache);
405
406	return (error);
407}
408
409void
410after_zpool_upgrade(zpool_handle_t *zhp)
411{
412}
413