1306196Sjkim/* 2238405Sjkim * CDDL HEADER START 3238405Sjkim * 4238405Sjkim * The contents of this file are subject to the terms of the 5238405Sjkim * Common Development and Distribution License (the "License"). 6238405Sjkim * You may not use this file except in compliance with the License. 7238405Sjkim * 8238405Sjkim * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9238405Sjkim * or https://opensource.org/licenses/CDDL-1.0. 10238405Sjkim * See the License for the specific language governing permissions 11238405Sjkim * and limitations under the License. 12238405Sjkim * 13238405Sjkim * When distributing Covered Code, include this CDDL HEADER in each 14238405Sjkim * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15238405Sjkim * If applicable, add the following below this CDDL HEADER, with the 16238405Sjkim * fields enclosed by brackets "[]" replaced with your own identifying 17238405Sjkim * information: Portions Copyright [yyyy] [name of copyright owner] 18238405Sjkim * 19238405Sjkim * CDDL HEADER END 20238405Sjkim */ 21238405Sjkim 22238405Sjkim/* 23238405Sjkim * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 24238405Sjkim * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25238405Sjkim * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 26238405Sjkim * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com> 27238405Sjkim * Copyright (c) 2018 Datto Inc. 28238405Sjkim * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. 29238405Sjkim * Copyright (c) 2017, Intel Corporation. 30238405Sjkim * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com> 31238405Sjkim */ 32238405Sjkim 33238405Sjkim#include <errno.h> 34238405Sjkim#include <libintl.h> 35238405Sjkim#include <stdio.h> 36238405Sjkim#include <stdlib.h> 37238405Sjkim#include <string.h> 38238405Sjkim#include <unistd.h> 39238405Sjkim#include <libgen.h> 40238405Sjkim#include <zone.h> 41276864Sjkim#include <sys/stat.h> 42276864Sjkim#include <sys/efi_partition.h> 43238405Sjkim#include <sys/systeminfo.h> 44238405Sjkim#include <sys/zfs_ioctl.h> 45238405Sjkim#include <sys/vdev_disk.h> 46238405Sjkim#include <dlfcn.h> 47238405Sjkim#include <libzutil.h> 48238405Sjkim 49238405Sjkim#include "zfs_namecheck.h" 50238405Sjkim#include "zfs_prop.h" 51238405Sjkim#include "../../libzfs_impl.h" 52238405Sjkim#include "zfs_comutil.h" 53276864Sjkim#include "zfeature_common.h" 54276864Sjkim 55276864Sjkim/* 56238405Sjkim * If the device has being dynamically expanded then we need to relabel 57276864Sjkim * the disk to use the new unallocated space. 58276864Sjkim */ 59276864Sjkimint 60276864Sjkimzpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg) 61276864Sjkim{ 62276864Sjkim int fd, error; 63238405Sjkim 64276864Sjkim if ((fd = open(path, O_RDWR|O_DIRECT|O_CLOEXEC)) < 0) { 65276864Sjkim zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " 66276864Sjkim "relabel '%s': unable to open device: %d"), path, errno); 67276864Sjkim return (zfs_error(hdl, EZFS_OPENFAILED, msg)); 68276864Sjkim } 69238405Sjkim 70276864Sjkim /* 71238405Sjkim * It's possible that we might encounter an error if the device 72238405Sjkim * does not have any unallocated space left. If so, we simply 73238405Sjkim * ignore that error and continue on. 74238405Sjkim */ 75238405Sjkim error = efi_use_whole_disk(fd); 76238405Sjkim 77238405Sjkim /* Flush the buffers to disk and invalidate the page cache. */ 78238405Sjkim (void) fsync(fd); 79238405Sjkim (void) ioctl(fd, BLKFLSBUF); 80238405Sjkim 81238405Sjkim (void) close(fd); 82238405Sjkim if (error && error != VT_ENOSPC) { 83238405Sjkim zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " 84238405Sjkim "relabel '%s': unable to read disk capacity"), path); 85238405Sjkim return (zfs_error(hdl, EZFS_NOCAP, msg)); 86238405Sjkim } 87238405Sjkim return (0); 88238405Sjkim} 89238405Sjkim 90238405Sjkim/* 91238405Sjkim * Read the EFI label from the config, if a label does not exist then 92238405Sjkim * pass back the error to the caller. If the caller has passed a non-NULL 93238405Sjkim * diskaddr argument then we set it to the starting address of the EFI 94238405Sjkim * partition. 95238405Sjkim */ 96238405Sjkimstatic int 97238405Sjkimread_efi_label(nvlist_t *config, diskaddr_t *sb) 98238405Sjkim{ 99238405Sjkim const char *path; 100238405Sjkim int fd; 101238405Sjkim char diskname[MAXPATHLEN]; 102238405Sjkim int err = -1; 103238405Sjkim 104238405Sjkim if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0) 105238405Sjkim return (err); 106238405Sjkim 107238405Sjkim (void) snprintf(diskname, sizeof (diskname), "%s%s", DISK_ROOT, 108238405Sjkim strrchr(path, '/')); 109238405Sjkim if ((fd = open(diskname, O_RDONLY|O_DIRECT|O_CLOEXEC)) >= 0) { 110238405Sjkim struct dk_gpt *vtoc; 111238405Sjkim 112238405Sjkim if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) { 113238405Sjkim if (sb != NULL) 114238405Sjkim *sb = vtoc->efi_parts[0].p_start; 115238405Sjkim efi_free(vtoc); 116238405Sjkim } 117238405Sjkim (void) close(fd); 118238405Sjkim } 119238405Sjkim return (err); 120238405Sjkim} 121238405Sjkim 122238405Sjkim/* 123238405Sjkim * determine where a partition starts on a disk in the current 124238405Sjkim * configuration 125238405Sjkim */ 126238405Sjkimstatic diskaddr_t 127238405Sjkimfind_start_block(nvlist_t *config) 128238405Sjkim{ 129238405Sjkim nvlist_t **child; 130238405Sjkim uint_t c, children; 131238405Sjkim diskaddr_t sb = MAXOFFSET_T; 132238405Sjkim uint64_t wholedisk; 133238405Sjkim 134238405Sjkim if (nvlist_lookup_nvlist_array(config, 135238405Sjkim ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { 136306196Sjkim if (nvlist_lookup_uint64(config, 137238405Sjkim ZPOOL_CONFIG_WHOLE_DISK, 138238405Sjkim &wholedisk) != 0 || !wholedisk) { 139238405Sjkim return (MAXOFFSET_T); 140238405Sjkim } 141238405Sjkim if (read_efi_label(config, &sb) < 0) 142238405Sjkim sb = MAXOFFSET_T; 143238405Sjkim return (sb); 144238405Sjkim } 145238405Sjkim 146238405Sjkim for (c = 0; c < children; c++) { 147238405Sjkim sb = find_start_block(child[c]); 148238405Sjkim if (sb != MAXOFFSET_T) { 149238405Sjkim return (sb); 150238405Sjkim } 151238405Sjkim } 152238405Sjkim return (MAXOFFSET_T); 153238405Sjkim} 154238405Sjkim 155238405Sjkimstatic int 156238405Sjkimzpool_label_disk_check(char *path) 157238405Sjkim{ 158238405Sjkim struct dk_gpt *vtoc; 159238405Sjkim int fd, err; 160276864Sjkim 161238405Sjkim if ((fd = open(path, O_RDONLY|O_DIRECT|O_CLOEXEC)) < 0) 162238405Sjkim return (errno); 163238405Sjkim 164238405Sjkim if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) { 165238405Sjkim (void) close(fd); 166238405Sjkim return (err); 167238405Sjkim } 168267258Sjkim 169238405Sjkim if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { 170267258Sjkim efi_free(vtoc); 171267258Sjkim (void) close(fd); 172267258Sjkim return (EIDRM); 173267258Sjkim } 174267258Sjkim 175267258Sjkim efi_free(vtoc); 176267258Sjkim (void) close(fd); 177267258Sjkim return (0); 178267258Sjkim} 179267258Sjkim 180267258Sjkim/* 181267258Sjkim * Generate a unique partition name for the ZFS member. Partitions must 182267258Sjkim * have unique names to ensure udev will be able to create symlinks under 183267258Sjkim * /dev/disk/by-partlabel/ for all pool members. The partition names are 184238405Sjkim * of the form <pool>-<unique-id>. 185238405Sjkim */ 186238405Sjkimstatic void 187276864Sjkimzpool_label_name(char *label_name, int label_size) 188238405Sjkim{ 189238405Sjkim uint64_t id = 0; 190238405Sjkim int fd; 191276864Sjkim 192238405Sjkim fd = open("/dev/urandom", O_RDONLY|O_CLOEXEC); 193238405Sjkim if (fd >= 0) { 194238405Sjkim if (read(fd, &id, sizeof (id)) != sizeof (id)) 195238405Sjkim id = 0; 196238405Sjkim 197238405Sjkim close(fd); 198238405Sjkim } 199238405Sjkim 200238405Sjkim if (id == 0) 201238405Sjkim id = (((uint64_t)rand()) << 32) | (uint64_t)rand(); 202238405Sjkim 203238405Sjkim snprintf(label_name, label_size, "zfs-%016llx", (u_longlong_t)id); 204238405Sjkim} 205238405Sjkim 206238405Sjkim/* 207238405Sjkim * Label an individual disk. The name provided is the short name, 208238405Sjkim * stripped of any leading /dev path. 209238405Sjkim */ 210238405Sjkimint 211238405Sjkimzpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) 212{ 213 char path[MAXPATHLEN]; 214 struct dk_gpt *vtoc; 215 int rval, fd; 216 size_t resv = EFI_MIN_RESV_SIZE; 217 uint64_t slice_size; 218 diskaddr_t start_block; 219 char errbuf[ERRBUFLEN]; 220 221 /* prepare an error message just in case */ 222 (void) snprintf(errbuf, sizeof (errbuf), 223 dgettext(TEXT_DOMAIN, "cannot label '%s'"), name); 224 225 if (zhp) { 226 nvlist_t *nvroot = fnvlist_lookup_nvlist(zhp->zpool_config, 227 ZPOOL_CONFIG_VDEV_TREE); 228 229 if (zhp->zpool_start_block == 0) 230 start_block = find_start_block(nvroot); 231 else 232 start_block = zhp->zpool_start_block; 233 zhp->zpool_start_block = start_block; 234 } else { 235 /* new pool */ 236 start_block = NEW_START_BLOCK; 237 } 238 239 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); 240 241 if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL|O_CLOEXEC)) < 0) { 242 /* 243 * This shouldn't happen. We've long since verified that this 244 * is a valid device. 245 */ 246 zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " 247 "label '%s': unable to open device: %d"), path, errno); 248 return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); 249 } 250 251 if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) { 252 /* 253 * The only way this can fail is if we run out of memory, or we 254 * were unable to read the disk's capacity 255 */ 256 if (errno == ENOMEM) 257 (void) no_memory(hdl); 258 259 (void) close(fd); 260 zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " 261 "label '%s': unable to read disk capacity"), path); 262 263 return (zfs_error(hdl, EZFS_NOCAP, errbuf)); 264 } 265 266 slice_size = vtoc->efi_last_u_lba + 1; 267 slice_size -= EFI_MIN_RESV_SIZE; 268 if (start_block == MAXOFFSET_T) 269 start_block = NEW_START_BLOCK; 270 slice_size -= start_block; 271 slice_size = P2ALIGN_TYPED(slice_size, PARTITION_END_ALIGNMENT, 272 uint64_t); 273 274 vtoc->efi_parts[0].p_start = start_block; 275 vtoc->efi_parts[0].p_size = slice_size; 276 277 if (vtoc->efi_parts[0].p_size * vtoc->efi_lbasize < SPA_MINDEVSIZE) { 278 (void) close(fd); 279 efi_free(vtoc); 280 281 zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " 282 "label '%s': partition would be less than the minimum " 283 "device size (64M)"), path); 284 return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); 285 } 286 287 /* 288 * Why we use V_USR: V_BACKUP confuses users, and is considered 289 * disposable by some EFI utilities (since EFI doesn't have a backup 290 * slice). V_UNASSIGNED is supposed to be used only for zero size 291 * partitions, and efi_write() will fail if we use it. 292 * Other available types were all pretty specific. 293 * V_USR is as close to reality as we 294 * can get, in the absence of V_OTHER. 295 */ 296 vtoc->efi_parts[0].p_tag = V_USR; 297 zpool_label_name(vtoc->efi_parts[0].p_name, EFI_PART_NAME_LEN); 298 299 vtoc->efi_parts[8].p_start = slice_size + start_block; 300 vtoc->efi_parts[8].p_size = resv; 301 vtoc->efi_parts[8].p_tag = V_RESERVED; 302 303 rval = efi_write(fd, vtoc); 304 305 /* Flush the buffers to disk and invalidate the page cache. */ 306 (void) fsync(fd); 307 (void) ioctl(fd, BLKFLSBUF); 308 309 if (rval == 0) 310 rval = efi_rescan(fd); 311 312 /* 313 * Some block drivers (like pcata) may not support EFI GPT labels. 314 * Print out a helpful error message directing the user to manually 315 * label the disk and give a specific slice. 316 */ 317 if (rval != 0) { 318 (void) close(fd); 319 efi_free(vtoc); 320 321 zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using " 322 "parted(8) and then provide a specific slice: %d"), rval); 323 return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); 324 } 325 326 (void) close(fd); 327 efi_free(vtoc); 328 329 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); 330 (void) zfs_append_partition(path, MAXPATHLEN); 331 332 /* Wait to udev to signal use the device has settled. */ 333 rval = zpool_label_disk_wait(path, DISK_LABEL_WAIT); 334 if (rval) { 335 zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to " 336 "detect device partitions on '%s': %d"), path, rval); 337 return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); 338 } 339 340 /* We can't be to paranoid. Read the label back and verify it. */ 341 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); 342 rval = zpool_label_disk_check(path); 343 if (rval) { 344 zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "freshly written " 345 "EFI label on '%s' is damaged. Ensure\nthis device " 346 "is not in use, and is functioning properly: %d"), 347 path, rval); 348 return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); 349 } 350 return (0); 351} 352