1306196Sjkim/*
2238405Sjkim * CDDL HEADER START
3238405Sjkim *
4238405Sjkim * The contents of this file are subject to the terms of the
5238405Sjkim * Common Development and Distribution License (the "License").
6238405Sjkim * You may not use this file except in compliance with the License.
7238405Sjkim *
8238405Sjkim * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9238405Sjkim * or https://opensource.org/licenses/CDDL-1.0.
10238405Sjkim * See the License for the specific language governing permissions
11238405Sjkim * and limitations under the License.
12238405Sjkim *
13238405Sjkim * When distributing Covered Code, include this CDDL HEADER in each
14238405Sjkim * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15238405Sjkim * If applicable, add the following below this CDDL HEADER, with the
16238405Sjkim * fields enclosed by brackets "[]" replaced with your own identifying
17238405Sjkim * information: Portions Copyright [yyyy] [name of copyright owner]
18238405Sjkim *
19238405Sjkim * CDDL HEADER END
20238405Sjkim */
21238405Sjkim
22238405Sjkim/*
23238405Sjkim * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
24238405Sjkim * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25238405Sjkim * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
26238405Sjkim * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
27238405Sjkim * Copyright (c) 2018 Datto Inc.
28238405Sjkim * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
29238405Sjkim * Copyright (c) 2017, Intel Corporation.
30238405Sjkim * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
31238405Sjkim */
32238405Sjkim
33238405Sjkim#include <errno.h>
34238405Sjkim#include <libintl.h>
35238405Sjkim#include <stdio.h>
36238405Sjkim#include <stdlib.h>
37238405Sjkim#include <string.h>
38238405Sjkim#include <unistd.h>
39238405Sjkim#include <libgen.h>
40238405Sjkim#include <zone.h>
41276864Sjkim#include <sys/stat.h>
42276864Sjkim#include <sys/efi_partition.h>
43238405Sjkim#include <sys/systeminfo.h>
44238405Sjkim#include <sys/zfs_ioctl.h>
45238405Sjkim#include <sys/vdev_disk.h>
46238405Sjkim#include <dlfcn.h>
47238405Sjkim#include <libzutil.h>
48238405Sjkim
49238405Sjkim#include "zfs_namecheck.h"
50238405Sjkim#include "zfs_prop.h"
51238405Sjkim#include "../../libzfs_impl.h"
52238405Sjkim#include "zfs_comutil.h"
53276864Sjkim#include "zfeature_common.h"
54276864Sjkim
55276864Sjkim/*
56238405Sjkim * If the device has being dynamically expanded then we need to relabel
57276864Sjkim * the disk to use the new unallocated space.
58276864Sjkim */
59276864Sjkimint
60276864Sjkimzpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg)
61276864Sjkim{
62276864Sjkim	int fd, error;
63238405Sjkim
64276864Sjkim	if ((fd = open(path, O_RDWR|O_DIRECT|O_CLOEXEC)) < 0) {
65276864Sjkim		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
66276864Sjkim		    "relabel '%s': unable to open device: %d"), path, errno);
67276864Sjkim		return (zfs_error(hdl, EZFS_OPENFAILED, msg));
68276864Sjkim	}
69238405Sjkim
70276864Sjkim	/*
71238405Sjkim	 * It's possible that we might encounter an error if the device
72238405Sjkim	 * does not have any unallocated space left. If so, we simply
73238405Sjkim	 * ignore that error and continue on.
74238405Sjkim	 */
75238405Sjkim	error = efi_use_whole_disk(fd);
76238405Sjkim
77238405Sjkim	/* Flush the buffers to disk and invalidate the page cache. */
78238405Sjkim	(void) fsync(fd);
79238405Sjkim	(void) ioctl(fd, BLKFLSBUF);
80238405Sjkim
81238405Sjkim	(void) close(fd);
82238405Sjkim	if (error && error != VT_ENOSPC) {
83238405Sjkim		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
84238405Sjkim		    "relabel '%s': unable to read disk capacity"), path);
85238405Sjkim		return (zfs_error(hdl, EZFS_NOCAP, msg));
86238405Sjkim	}
87238405Sjkim	return (0);
88238405Sjkim}
89238405Sjkim
90238405Sjkim/*
91238405Sjkim * Read the EFI label from the config, if a label does not exist then
92238405Sjkim * pass back the error to the caller. If the caller has passed a non-NULL
93238405Sjkim * diskaddr argument then we set it to the starting address of the EFI
94238405Sjkim * partition.
95238405Sjkim */
96238405Sjkimstatic int
97238405Sjkimread_efi_label(nvlist_t *config, diskaddr_t *sb)
98238405Sjkim{
99238405Sjkim	const char *path;
100238405Sjkim	int fd;
101238405Sjkim	char diskname[MAXPATHLEN];
102238405Sjkim	int err = -1;
103238405Sjkim
104238405Sjkim	if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0)
105238405Sjkim		return (err);
106238405Sjkim
107238405Sjkim	(void) snprintf(diskname, sizeof (diskname), "%s%s", DISK_ROOT,
108238405Sjkim	    strrchr(path, '/'));
109238405Sjkim	if ((fd = open(diskname, O_RDONLY|O_DIRECT|O_CLOEXEC)) >= 0) {
110238405Sjkim		struct dk_gpt *vtoc;
111238405Sjkim
112238405Sjkim		if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) {
113238405Sjkim			if (sb != NULL)
114238405Sjkim				*sb = vtoc->efi_parts[0].p_start;
115238405Sjkim			efi_free(vtoc);
116238405Sjkim		}
117238405Sjkim		(void) close(fd);
118238405Sjkim	}
119238405Sjkim	return (err);
120238405Sjkim}
121238405Sjkim
122238405Sjkim/*
123238405Sjkim * determine where a partition starts on a disk in the current
124238405Sjkim * configuration
125238405Sjkim */
126238405Sjkimstatic diskaddr_t
127238405Sjkimfind_start_block(nvlist_t *config)
128238405Sjkim{
129238405Sjkim	nvlist_t **child;
130238405Sjkim	uint_t c, children;
131238405Sjkim	diskaddr_t sb = MAXOFFSET_T;
132238405Sjkim	uint64_t wholedisk;
133238405Sjkim
134238405Sjkim	if (nvlist_lookup_nvlist_array(config,
135238405Sjkim	    ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) {
136306196Sjkim		if (nvlist_lookup_uint64(config,
137238405Sjkim		    ZPOOL_CONFIG_WHOLE_DISK,
138238405Sjkim		    &wholedisk) != 0 || !wholedisk) {
139238405Sjkim			return (MAXOFFSET_T);
140238405Sjkim		}
141238405Sjkim		if (read_efi_label(config, &sb) < 0)
142238405Sjkim			sb = MAXOFFSET_T;
143238405Sjkim		return (sb);
144238405Sjkim	}
145238405Sjkim
146238405Sjkim	for (c = 0; c < children; c++) {
147238405Sjkim		sb = find_start_block(child[c]);
148238405Sjkim		if (sb != MAXOFFSET_T) {
149238405Sjkim			return (sb);
150238405Sjkim		}
151238405Sjkim	}
152238405Sjkim	return (MAXOFFSET_T);
153238405Sjkim}
154238405Sjkim
155238405Sjkimstatic int
156238405Sjkimzpool_label_disk_check(char *path)
157238405Sjkim{
158238405Sjkim	struct dk_gpt *vtoc;
159238405Sjkim	int fd, err;
160276864Sjkim
161238405Sjkim	if ((fd = open(path, O_RDONLY|O_DIRECT|O_CLOEXEC)) < 0)
162238405Sjkim		return (errno);
163238405Sjkim
164238405Sjkim	if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) {
165238405Sjkim		(void) close(fd);
166238405Sjkim		return (err);
167238405Sjkim	}
168267258Sjkim
169238405Sjkim	if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
170267258Sjkim		efi_free(vtoc);
171267258Sjkim		(void) close(fd);
172267258Sjkim		return (EIDRM);
173267258Sjkim	}
174267258Sjkim
175267258Sjkim	efi_free(vtoc);
176267258Sjkim	(void) close(fd);
177267258Sjkim	return (0);
178267258Sjkim}
179267258Sjkim
180267258Sjkim/*
181267258Sjkim * Generate a unique partition name for the ZFS member.  Partitions must
182267258Sjkim * have unique names to ensure udev will be able to create symlinks under
183267258Sjkim * /dev/disk/by-partlabel/ for all pool members.  The partition names are
184238405Sjkim * of the form <pool>-<unique-id>.
185238405Sjkim */
186238405Sjkimstatic void
187276864Sjkimzpool_label_name(char *label_name, int label_size)
188238405Sjkim{
189238405Sjkim	uint64_t id = 0;
190238405Sjkim	int fd;
191276864Sjkim
192238405Sjkim	fd = open("/dev/urandom", O_RDONLY|O_CLOEXEC);
193238405Sjkim	if (fd >= 0) {
194238405Sjkim		if (read(fd, &id, sizeof (id)) != sizeof (id))
195238405Sjkim			id = 0;
196238405Sjkim
197238405Sjkim		close(fd);
198238405Sjkim	}
199238405Sjkim
200238405Sjkim	if (id == 0)
201238405Sjkim		id = (((uint64_t)rand()) << 32) | (uint64_t)rand();
202238405Sjkim
203238405Sjkim	snprintf(label_name, label_size, "zfs-%016llx", (u_longlong_t)id);
204238405Sjkim}
205238405Sjkim
206238405Sjkim/*
207238405Sjkim * Label an individual disk.  The name provided is the short name,
208238405Sjkim * stripped of any leading /dev path.
209238405Sjkim */
210238405Sjkimint
211238405Sjkimzpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name)
212{
213	char path[MAXPATHLEN];
214	struct dk_gpt *vtoc;
215	int rval, fd;
216	size_t resv = EFI_MIN_RESV_SIZE;
217	uint64_t slice_size;
218	diskaddr_t start_block;
219	char errbuf[ERRBUFLEN];
220
221	/* prepare an error message just in case */
222	(void) snprintf(errbuf, sizeof (errbuf),
223	    dgettext(TEXT_DOMAIN, "cannot label '%s'"), name);
224
225	if (zhp) {
226		nvlist_t *nvroot = fnvlist_lookup_nvlist(zhp->zpool_config,
227		    ZPOOL_CONFIG_VDEV_TREE);
228
229		if (zhp->zpool_start_block == 0)
230			start_block = find_start_block(nvroot);
231		else
232			start_block = zhp->zpool_start_block;
233		zhp->zpool_start_block = start_block;
234	} else {
235		/* new pool */
236		start_block = NEW_START_BLOCK;
237	}
238
239	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
240
241	if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL|O_CLOEXEC)) < 0) {
242		/*
243		 * This shouldn't happen.  We've long since verified that this
244		 * is a valid device.
245		 */
246		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
247		    "label '%s': unable to open device: %d"), path, errno);
248		return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
249	}
250
251	if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) {
252		/*
253		 * The only way this can fail is if we run out of memory, or we
254		 * were unable to read the disk's capacity
255		 */
256		if (errno == ENOMEM)
257			(void) no_memory(hdl);
258
259		(void) close(fd);
260		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
261		    "label '%s': unable to read disk capacity"), path);
262
263		return (zfs_error(hdl, EZFS_NOCAP, errbuf));
264	}
265
266	slice_size = vtoc->efi_last_u_lba + 1;
267	slice_size -= EFI_MIN_RESV_SIZE;
268	if (start_block == MAXOFFSET_T)
269		start_block = NEW_START_BLOCK;
270	slice_size -= start_block;
271	slice_size = P2ALIGN_TYPED(slice_size, PARTITION_END_ALIGNMENT,
272	    uint64_t);
273
274	vtoc->efi_parts[0].p_start = start_block;
275	vtoc->efi_parts[0].p_size = slice_size;
276
277	if (vtoc->efi_parts[0].p_size * vtoc->efi_lbasize < SPA_MINDEVSIZE) {
278		(void) close(fd);
279		efi_free(vtoc);
280
281		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
282		    "label '%s': partition would be less than the minimum "
283		    "device size (64M)"), path);
284		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
285	}
286
287	/*
288	 * Why we use V_USR: V_BACKUP confuses users, and is considered
289	 * disposable by some EFI utilities (since EFI doesn't have a backup
290	 * slice).  V_UNASSIGNED is supposed to be used only for zero size
291	 * partitions, and efi_write() will fail if we use it.
292	 * Other available types were all pretty specific.
293	 * V_USR is as close to reality as we
294	 * can get, in the absence of V_OTHER.
295	 */
296	vtoc->efi_parts[0].p_tag = V_USR;
297	zpool_label_name(vtoc->efi_parts[0].p_name, EFI_PART_NAME_LEN);
298
299	vtoc->efi_parts[8].p_start = slice_size + start_block;
300	vtoc->efi_parts[8].p_size = resv;
301	vtoc->efi_parts[8].p_tag = V_RESERVED;
302
303	rval = efi_write(fd, vtoc);
304
305	/* Flush the buffers to disk and invalidate the page cache. */
306	(void) fsync(fd);
307	(void) ioctl(fd, BLKFLSBUF);
308
309	if (rval == 0)
310		rval = efi_rescan(fd);
311
312	/*
313	 * Some block drivers (like pcata) may not support EFI GPT labels.
314	 * Print out a helpful error message directing the user to manually
315	 * label the disk and give a specific slice.
316	 */
317	if (rval != 0) {
318		(void) close(fd);
319		efi_free(vtoc);
320
321		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using "
322		    "parted(8) and then provide a specific slice: %d"), rval);
323		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
324	}
325
326	(void) close(fd);
327	efi_free(vtoc);
328
329	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
330	(void) zfs_append_partition(path, MAXPATHLEN);
331
332	/* Wait to udev to signal use the device has settled. */
333	rval = zpool_label_disk_wait(path, DISK_LABEL_WAIT);
334	if (rval) {
335		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to "
336		    "detect device partitions on '%s': %d"), path, rval);
337		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
338	}
339
340	/* We can't be to paranoid.  Read the label back and verify it. */
341	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
342	rval = zpool_label_disk_check(path);
343	if (rval) {
344		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "freshly written "
345		    "EFI label on '%s' is damaged.  Ensure\nthis device "
346		    "is not in use, and is functioning properly: %d"),
347		    path, rval);
348		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
349	}
350	return (0);
351}
352