/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Joyent, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
#include <sys/efi_partition.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for disks.
41168404Spjd */ 42168404Spjd 43168404Spjdextern ldi_ident_t zfs_li; 44168404Spjd 45263395Sdelphijstatic void vdev_disk_close(vdev_t *); 46263395Sdelphij 47263395Sdelphijtypedef struct vdev_disk_ldi_cb { 48263395Sdelphij list_node_t lcb_next; 49263395Sdelphij ldi_callback_id_t lcb_id; 50263395Sdelphij} vdev_disk_ldi_cb_t; 51263395Sdelphij 52219089Spjdstatic void 53263395Sdelphijvdev_disk_alloc(vdev_t *vd) 54263395Sdelphij{ 55263395Sdelphij vdev_disk_t *dvd; 56263395Sdelphij 57263395Sdelphij dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); 58263395Sdelphij /* 59263395Sdelphij * Create the LDI event callback list. 60263395Sdelphij */ 61263395Sdelphij list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t), 62263395Sdelphij offsetof(vdev_disk_ldi_cb_t, lcb_next)); 63263395Sdelphij} 64263395Sdelphij 65263395Sdelphijstatic void 66263395Sdelphijvdev_disk_free(vdev_t *vd) 67263395Sdelphij{ 68263395Sdelphij vdev_disk_t *dvd = vd->vdev_tsd; 69263395Sdelphij vdev_disk_ldi_cb_t *lcb; 70263395Sdelphij 71263395Sdelphij if (dvd == NULL) 72263395Sdelphij return; 73263395Sdelphij 74263395Sdelphij /* 75263395Sdelphij * We have already closed the LDI handle. Clean up the LDI event 76263395Sdelphij * callbacks and free vd->vdev_tsd. 
77263395Sdelphij */ 78263395Sdelphij while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) { 79263395Sdelphij list_remove(&dvd->vd_ldi_cbs, lcb); 80263395Sdelphij (void) ldi_ev_remove_callbacks(lcb->lcb_id); 81263395Sdelphij kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t)); 82263395Sdelphij } 83263395Sdelphij list_destroy(&dvd->vd_ldi_cbs); 84263395Sdelphij kmem_free(dvd, sizeof (vdev_disk_t)); 85263395Sdelphij vd->vdev_tsd = NULL; 86263395Sdelphij} 87263395Sdelphij 88263395Sdelphij/* ARGSUSED */ 89263395Sdelphijstatic int 90263395Sdelphijvdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg, 91263395Sdelphij void *ev_data) 92263395Sdelphij{ 93263395Sdelphij vdev_t *vd = (vdev_t *)arg; 94263395Sdelphij vdev_disk_t *dvd = vd->vdev_tsd; 95263395Sdelphij 96263395Sdelphij /* 97263395Sdelphij * Ignore events other than offline. 98263395Sdelphij */ 99263395Sdelphij if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) 100263395Sdelphij return (LDI_EV_SUCCESS); 101263395Sdelphij 102263395Sdelphij /* 103263395Sdelphij * All LDI handles must be closed for the state change to succeed, so 104263395Sdelphij * call on vdev_disk_close() to do this. 105263395Sdelphij * 106263395Sdelphij * We inform vdev_disk_close that it is being called from offline 107263395Sdelphij * notify context so it will defer cleanup of LDI event callbacks and 108263395Sdelphij * freeing of vd->vdev_tsd to the offline finalize or a reopen. 109263395Sdelphij */ 110263395Sdelphij dvd->vd_ldi_offline = B_TRUE; 111263395Sdelphij vdev_disk_close(vd); 112263395Sdelphij 113263395Sdelphij /* 114263395Sdelphij * Now that the device is closed, request that the spa_async_thread 115263395Sdelphij * mark the device as REMOVED and notify FMA of the removal. 
116263395Sdelphij */ 117263395Sdelphij zfs_post_remove(vd->vdev_spa, vd); 118263395Sdelphij vd->vdev_remove_wanted = B_TRUE; 119263395Sdelphij spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); 120263395Sdelphij 121263395Sdelphij return (LDI_EV_SUCCESS); 122263395Sdelphij} 123263395Sdelphij 124263395Sdelphij/* ARGSUSED */ 125263395Sdelphijstatic void 126263395Sdelphijvdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, 127263395Sdelphij int ldi_result, void *arg, void *ev_data) 128263395Sdelphij{ 129263395Sdelphij vdev_t *vd = (vdev_t *)arg; 130263395Sdelphij 131263395Sdelphij /* 132263395Sdelphij * Ignore events other than offline. 133263395Sdelphij */ 134263395Sdelphij if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) 135263395Sdelphij return; 136263395Sdelphij 137263395Sdelphij /* 138263395Sdelphij * We have already closed the LDI handle in notify. 139263395Sdelphij * Clean up the LDI event callbacks and free vd->vdev_tsd. 140263395Sdelphij */ 141263395Sdelphij vdev_disk_free(vd); 142263395Sdelphij 143263395Sdelphij /* 144263395Sdelphij * Request that the vdev be reopened if the offline state change was 145263395Sdelphij * unsuccessful. 
146263395Sdelphij */ 147263395Sdelphij if (ldi_result != LDI_EV_SUCCESS) { 148263395Sdelphij vd->vdev_probe_wanted = B_TRUE; 149263395Sdelphij spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE); 150263395Sdelphij } 151263395Sdelphij} 152263395Sdelphij 153263395Sdelphijstatic ldi_ev_callback_t vdev_disk_off_callb = { 154263395Sdelphij .cb_vers = LDI_EV_CB_VERS, 155263395Sdelphij .cb_notify = vdev_disk_off_notify, 156263395Sdelphij .cb_finalize = vdev_disk_off_finalize 157263395Sdelphij}; 158263395Sdelphij 159263395Sdelphij/* ARGSUSED */ 160263395Sdelphijstatic void 161263395Sdelphijvdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, 162263395Sdelphij int ldi_result, void *arg, void *ev_data) 163263395Sdelphij{ 164263395Sdelphij vdev_t *vd = (vdev_t *)arg; 165263395Sdelphij 166263395Sdelphij /* 167263395Sdelphij * Ignore events other than degrade. 168263395Sdelphij */ 169263395Sdelphij if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0) 170263395Sdelphij return; 171263395Sdelphij 172263395Sdelphij /* 173263395Sdelphij * Degrade events always succeed. Mark the vdev as degraded. 174263395Sdelphij * This status is purely informative for the user. 175263395Sdelphij */ 176263395Sdelphij (void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0); 177263395Sdelphij} 178263395Sdelphij 179263395Sdelphijstatic ldi_ev_callback_t vdev_disk_dgrd_callb = { 180263395Sdelphij .cb_vers = LDI_EV_CB_VERS, 181263395Sdelphij .cb_notify = NULL, 182263395Sdelphij .cb_finalize = vdev_disk_dgrd_finalize 183263395Sdelphij}; 184263395Sdelphij 185263395Sdelphijstatic void 186219089Spjdvdev_disk_hold(vdev_t *vd) 187219089Spjd{ 188219089Spjd ddi_devid_t devid; 189219089Spjd char *minor; 190219089Spjd 191219089Spjd ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 192219089Spjd 193219089Spjd /* 194219089Spjd * We must have a pathname, and it must be absolute. 
195219089Spjd */ 196219089Spjd if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') 197219089Spjd return; 198219089Spjd 199219089Spjd /* 200219089Spjd * Only prefetch path and devid info if the device has 201219089Spjd * never been opened. 202219089Spjd */ 203219089Spjd if (vd->vdev_tsd != NULL) 204219089Spjd return; 205219089Spjd 206219089Spjd if (vd->vdev_wholedisk == -1ULL) { 207219089Spjd size_t len = strlen(vd->vdev_path) + 3; 208219089Spjd char *buf = kmem_alloc(len, KM_SLEEP); 209219089Spjd 210219089Spjd (void) snprintf(buf, len, "%ss0", vd->vdev_path); 211219089Spjd 212219089Spjd (void) ldi_vp_from_name(buf, &vd->vdev_name_vp); 213219089Spjd kmem_free(buf, len); 214219089Spjd } 215219089Spjd 216219089Spjd if (vd->vdev_name_vp == NULL) 217219089Spjd (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp); 218219089Spjd 219219089Spjd if (vd->vdev_devid != NULL && 220219089Spjd ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) { 221219089Spjd (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp); 222219089Spjd ddi_devid_str_free(minor); 223219089Spjd ddi_devid_free(devid); 224219089Spjd } 225219089Spjd} 226219089Spjd 227219089Spjdstatic void 228219089Spjdvdev_disk_rele(vdev_t *vd) 229219089Spjd{ 230219089Spjd ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 231219089Spjd 232219089Spjd if (vd->vdev_name_vp) { 233219089Spjd VN_RELE_ASYNC(vd->vdev_name_vp, 234219089Spjd dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); 235219089Spjd vd->vdev_name_vp = NULL; 236219089Spjd } 237219089Spjd if (vd->vdev_devid_vp) { 238219089Spjd VN_RELE_ASYNC(vd->vdev_devid_vp, 239219089Spjd dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); 240219089Spjd vd->vdev_devid_vp = NULL; 241219089Spjd } 242219089Spjd} 243219089Spjd 244254012Sdelphij/* 245254012Sdelphij * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when 246254012Sdelphij * even a fallback to DKIOCGMEDIAINFO fails. 
247254012Sdelphij */ 248254012Sdelphij#ifdef DEBUG 249254012Sdelphij#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__) 250254012Sdelphij#else 251254012Sdelphij#define VDEV_DEBUG(...) /* Nothing... */ 252254012Sdelphij#endif 253254012Sdelphij 254168404Spjdstatic int 255236155Smmvdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 256236155Smm uint64_t *ashift) 257168404Spjd{ 258209962Smm spa_t *spa = vd->vdev_spa; 259263395Sdelphij vdev_disk_t *dvd = vd->vdev_tsd; 260263395Sdelphij ldi_ev_cookie_t ecookie; 261263395Sdelphij vdev_disk_ldi_cb_t *lcb; 262254012Sdelphij union { 263254012Sdelphij struct dk_minfo_ext ude; 264254012Sdelphij struct dk_minfo ud; 265254012Sdelphij } dks; 266254012Sdelphij struct dk_minfo_ext *dkmext = &dks.ude; 267254012Sdelphij struct dk_minfo *dkm = &dks.ud; 268168404Spjd int error; 269185029Spjd dev_t dev; 270185029Spjd int otyp; 271249209Smm boolean_t validate_devid = B_FALSE; 272249209Smm ddi_devid_t devid; 273254012Sdelphij uint64_t capacity = 0, blksz = 0, pbsize; 274168404Spjd 275168404Spjd /* 276168404Spjd * We must have a pathname, and it must be absolute. 277168404Spjd */ 278168404Spjd if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 279168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 280249195Smm return (SET_ERROR(EINVAL)); 281168404Spjd } 282168404Spjd 283219089Spjd /* 284219089Spjd * Reopen the device if it's not currently open. Otherwise, 285219089Spjd * just update the physical size of the device. 286219089Spjd */ 287263395Sdelphij if (dvd != NULL) { 288263395Sdelphij if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) { 289263395Sdelphij /* 290263395Sdelphij * If we are opening a device in its offline notify 291263395Sdelphij * context, the LDI handle was just closed. Clean 292263395Sdelphij * up the LDI event callbacks and free vd->vdev_tsd. 
293263395Sdelphij */ 294263395Sdelphij vdev_disk_free(vd); 295263395Sdelphij } else { 296263395Sdelphij ASSERT(vd->vdev_reopening); 297263395Sdelphij goto skip_open; 298263395Sdelphij } 299219089Spjd } 300219089Spjd 301263395Sdelphij /* 302263395Sdelphij * Create vd->vdev_tsd. 303263395Sdelphij */ 304263395Sdelphij vdev_disk_alloc(vd); 305263395Sdelphij dvd = vd->vdev_tsd; 306168404Spjd 307168404Spjd /* 308168404Spjd * When opening a disk device, we want to preserve the user's original 309168404Spjd * intent. We always want to open the device by the path the user gave 310168404Spjd * us, even if it is one of multiple paths to the save device. But we 311168404Spjd * also want to be able to survive disks being removed/recabled. 312168404Spjd * Therefore the sequence of opening devices is: 313168404Spjd * 314168404Spjd * 1. Try opening the device by path. For legacy pools without the 315168404Spjd * 'whole_disk' property, attempt to fix the path by appending 's0'. 316168404Spjd * 317168404Spjd * 2. If the devid of the device matches the stored value, return 318168404Spjd * success. 319168404Spjd * 320168404Spjd * 3. Otherwise, the device may have moved. Try opening the device 321168404Spjd * by the devid instead. 
322168404Spjd */ 323168404Spjd if (vd->vdev_devid != NULL) { 324168404Spjd if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, 325168404Spjd &dvd->vd_minor) != 0) { 326168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 327249195Smm return (SET_ERROR(EINVAL)); 328168404Spjd } 329168404Spjd } 330168404Spjd 331168404Spjd error = EINVAL; /* presume failure */ 332168404Spjd 333219089Spjd if (vd->vdev_path != NULL) { 334168404Spjd 335168404Spjd if (vd->vdev_wholedisk == -1ULL) { 336168404Spjd size_t len = strlen(vd->vdev_path) + 3; 337168404Spjd char *buf = kmem_alloc(len, KM_SLEEP); 338168404Spjd 339168404Spjd (void) snprintf(buf, len, "%ss0", vd->vdev_path); 340168404Spjd 341263395Sdelphij error = ldi_open_by_name(buf, spa_mode(spa), kcred, 342263395Sdelphij &dvd->vd_lh, zfs_li); 343263395Sdelphij if (error == 0) { 344168404Spjd spa_strfree(vd->vdev_path); 345168404Spjd vd->vdev_path = buf; 346168404Spjd vd->vdev_wholedisk = 1ULL; 347168404Spjd } else { 348168404Spjd kmem_free(buf, len); 349168404Spjd } 350168404Spjd } 351168404Spjd 352263395Sdelphij /* 353263395Sdelphij * If we have not yet opened the device, try to open it by the 354263395Sdelphij * specified path. 355263395Sdelphij */ 356263395Sdelphij if (error != 0) { 357263395Sdelphij error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), 358263395Sdelphij kcred, &dvd->vd_lh, zfs_li); 359263395Sdelphij } 360168404Spjd 361168404Spjd /* 362168404Spjd * Compare the devid to the stored value. 
363168404Spjd */ 364168404Spjd if (error == 0 && vd->vdev_devid != NULL && 365168404Spjd ldi_get_devid(dvd->vd_lh, &devid) == 0) { 366168404Spjd if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { 367249195Smm error = SET_ERROR(EINVAL); 368209962Smm (void) ldi_close(dvd->vd_lh, spa_mode(spa), 369209962Smm kcred); 370168404Spjd dvd->vd_lh = NULL; 371168404Spjd } 372168404Spjd ddi_devid_free(devid); 373168404Spjd } 374168404Spjd 375168404Spjd /* 376168404Spjd * If we succeeded in opening the device, but 'vdev_wholedisk' 377168404Spjd * is not yet set, then this must be a slice. 378168404Spjd */ 379168404Spjd if (error == 0 && vd->vdev_wholedisk == -1ULL) 380168404Spjd vd->vdev_wholedisk = 0; 381168404Spjd } 382168404Spjd 383168404Spjd /* 384168404Spjd * If we were unable to open by path, or the devid check fails, open by 385168404Spjd * devid instead. 386168404Spjd */ 387249209Smm if (error != 0 && vd->vdev_devid != NULL) { 388168404Spjd error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, 389209962Smm spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); 390249209Smm } 391168404Spjd 392185029Spjd /* 393185029Spjd * If all else fails, then try opening by physical path (if available) 394185029Spjd * or the logical path (if we failed due to the devid check). While not 395185029Spjd * as reliable as the devid, this will give us something, and the higher 396185029Spjd * level vdev validation will prevent us from opening the wrong device. 397185029Spjd */ 398168404Spjd if (error) { 399249209Smm if (vd->vdev_devid != NULL) 400249209Smm validate_devid = B_TRUE; 401249209Smm 402185029Spjd if (vd->vdev_physpath != NULL && 403209962Smm (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) 404209962Smm error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), 405185029Spjd kcred, &dvd->vd_lh, zfs_li); 406185029Spjd 407185029Spjd /* 408185029Spjd * Note that we don't support the legacy auto-wholedisk support 409185029Spjd * as above. 
This hasn't been used in a very long time and we 410185029Spjd * don't need to propagate its oddities to this edge condition. 411185029Spjd */ 412219089Spjd if (error && vd->vdev_path != NULL) 413209962Smm error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), 414209962Smm kcred, &dvd->vd_lh, zfs_li); 415185029Spjd } 416185029Spjd 417185029Spjd if (error) { 418168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 419168404Spjd return (error); 420168404Spjd } 421168404Spjd 422168404Spjd /* 423249209Smm * Now that the device has been successfully opened, update the devid 424249209Smm * if necessary. 425249209Smm */ 426249209Smm if (validate_devid && spa_writeable(spa) && 427249209Smm ldi_get_devid(dvd->vd_lh, &devid) == 0) { 428249209Smm if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { 429249209Smm char *vd_devid; 430249209Smm 431249209Smm vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor); 432249209Smm zfs_dbgmsg("vdev %s: update devid from %s, " 433249209Smm "to %s", vd->vdev_path, vd->vdev_devid, vd_devid); 434249209Smm spa_strfree(vd->vdev_devid); 435249209Smm vd->vdev_devid = spa_strdup(vd_devid); 436249209Smm ddi_devid_str_free(vd_devid); 437249209Smm } 438249209Smm ddi_devid_free(devid); 439249209Smm } 440249209Smm 441249209Smm /* 442185029Spjd * Once a device is opened, verify that the physical device path (if 443185029Spjd * available) is up to date. 
444185029Spjd */ 445185029Spjd if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && 446185029Spjd ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { 447185029Spjd char *physpath, *minorname; 448185029Spjd 449185029Spjd physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); 450185029Spjd minorname = NULL; 451185029Spjd if (ddi_dev_pathname(dev, otyp, physpath) == 0 && 452185029Spjd ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && 453185029Spjd (vd->vdev_physpath == NULL || 454185029Spjd strcmp(vd->vdev_physpath, physpath) != 0)) { 455185029Spjd if (vd->vdev_physpath) 456185029Spjd spa_strfree(vd->vdev_physpath); 457185029Spjd (void) strlcat(physpath, ":", MAXPATHLEN); 458185029Spjd (void) strlcat(physpath, minorname, MAXPATHLEN); 459185029Spjd vd->vdev_physpath = spa_strdup(physpath); 460185029Spjd } 461185029Spjd if (minorname) 462185029Spjd kmem_free(minorname, strlen(minorname) + 1); 463185029Spjd kmem_free(physpath, MAXPATHLEN); 464185029Spjd } 465185029Spjd 466263395Sdelphij /* 467263395Sdelphij * Register callbacks for the LDI offline event. 468263395Sdelphij */ 469263395Sdelphij if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) == 470263395Sdelphij LDI_EV_SUCCESS) { 471263395Sdelphij lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); 472263395Sdelphij list_insert_tail(&dvd->vd_ldi_cbs, lcb); 473263395Sdelphij (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, 474263395Sdelphij &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id); 475263395Sdelphij } 476263395Sdelphij 477263395Sdelphij /* 478263395Sdelphij * Register callbacks for the LDI degrade event. 
479263395Sdelphij */ 480263395Sdelphij if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) == 481263395Sdelphij LDI_EV_SUCCESS) { 482263395Sdelphij lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); 483263395Sdelphij list_insert_tail(&dvd->vd_ldi_cbs, lcb); 484263395Sdelphij (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, 485263395Sdelphij &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id); 486263395Sdelphij } 487219089Spjdskip_open: 488185029Spjd /* 489168404Spjd * Determine the actual size of the device. 490168404Spjd */ 491168404Spjd if (ldi_get_size(dvd->vd_lh, psize) != 0) { 492168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 493249195Smm return (SET_ERROR(EINVAL)); 494168404Spjd } 495168404Spjd 496254012Sdelphij *max_psize = *psize; 497254012Sdelphij 498168404Spjd /* 499168404Spjd * Determine the device's minimum transfer size. 500168404Spjd * If the ioctl isn't supported, assume DEV_BSIZE. 501168404Spjd */ 502254012Sdelphij if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, 503254012Sdelphij (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) { 504254012Sdelphij capacity = dkmext->dki_capacity - 1; 505254012Sdelphij blksz = dkmext->dki_lbsize; 506254012Sdelphij pbsize = dkmext->dki_pbsize; 507254012Sdelphij } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, 508254012Sdelphij (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) { 509254012Sdelphij VDEV_DEBUG( 510254012Sdelphij "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n", 511254012Sdelphij vd->vdev_path); 512254012Sdelphij capacity = dkm->dki_capacity - 1; 513254012Sdelphij blksz = dkm->dki_lbsize; 514254012Sdelphij pbsize = blksz; 515254012Sdelphij } else { 516254012Sdelphij VDEV_DEBUG("vdev_disk_open(\"%s\"): " 517254012Sdelphij "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n", 518254012Sdelphij vd->vdev_path, error); 519254012Sdelphij pbsize = DEV_BSIZE; 520254012Sdelphij } 521168404Spjd 522265740Sdelphij *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; 
523168404Spjd 524236155Smm if (vd->vdev_wholedisk == 1) { 525236155Smm int wce = 1; 526236155Smm 527254012Sdelphij if (error == 0) { 528254012Sdelphij /* 529254012Sdelphij * If we have the capability to expand, we'd have 530254012Sdelphij * found out via success from DKIOCGMEDIAINFO{,EXT}. 531254012Sdelphij * Adjust max_psize upward accordingly since we know 532254012Sdelphij * we own the whole disk now. 533254012Sdelphij */ 534307268Smav *max_psize = capacity * blksz; 535254012Sdelphij } 536254012Sdelphij 537236155Smm /* 538254012Sdelphij * Since we own the whole disk, try to enable disk write 539254012Sdelphij * caching. We ignore errors because it's OK if we can't do it. 540236155Smm */ 541236155Smm (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, 542236155Smm FKIOCTL, kcred, NULL); 543236155Smm } 544236155Smm 545168404Spjd /* 546168404Spjd * Clear the nowritecache bit, so that on a vdev_reopen() we will 547168404Spjd * try again. 548168404Spjd */ 549168404Spjd vd->vdev_nowritecache = B_FALSE; 550168404Spjd 551168404Spjd return (0); 552168404Spjd} 553168404Spjd 554168404Spjdstatic void 555168404Spjdvdev_disk_close(vdev_t *vd) 556168404Spjd{ 557168404Spjd vdev_disk_t *dvd = vd->vdev_tsd; 558168404Spjd 559219089Spjd if (vd->vdev_reopening || dvd == NULL) 560168404Spjd return; 561168404Spjd 562263395Sdelphij if (dvd->vd_minor != NULL) { 563168404Spjd ddi_devid_str_free(dvd->vd_minor); 564263395Sdelphij dvd->vd_minor = NULL; 565263395Sdelphij } 566168404Spjd 567263395Sdelphij if (dvd->vd_devid != NULL) { 568168404Spjd ddi_devid_free(dvd->vd_devid); 569263395Sdelphij dvd->vd_devid = NULL; 570263395Sdelphij } 571168404Spjd 572263395Sdelphij if (dvd->vd_lh != NULL) { 573209962Smm (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); 574263395Sdelphij dvd->vd_lh = NULL; 575263395Sdelphij } 576168404Spjd 577219089Spjd vd->vdev_delayed_close = B_FALSE; 578263395Sdelphij /* 579263395Sdelphij * If we closed the LDI handle due to an offline notify from LDI, 
580263395Sdelphij * don't free vd->vdev_tsd or unregister the callbacks here; 581263395Sdelphij * the offline finalize callback or a reopen will take care of it. 582263395Sdelphij */ 583263395Sdelphij if (dvd->vd_ldi_offline) 584263395Sdelphij return; 585263395Sdelphij 586263395Sdelphij vdev_disk_free(vd); 587168404Spjd} 588168404Spjd 589185029Spjdint 590255750Sdelphijvdev_disk_physio(vdev_t *vd, caddr_t data, 591255750Sdelphij size_t size, uint64_t offset, int flags, boolean_t isdump) 592185029Spjd{ 593255750Sdelphij vdev_disk_t *dvd = vd->vdev_tsd; 594255750Sdelphij 595263395Sdelphij /* 596263395Sdelphij * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 597263395Sdelphij * Nothing to be done here but return failure. 598263395Sdelphij */ 599263395Sdelphij if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) 600263395Sdelphij return (EIO); 601263395Sdelphij 602255750Sdelphij ASSERT(vd->vdev_ops == &vdev_disk_ops); 603255750Sdelphij 604255750Sdelphij /* 605255750Sdelphij * If in the context of an active crash dump, use the ldi_dump(9F) 606255750Sdelphij * call instead of ldi_strategy(9F) as usual. 
607255750Sdelphij */ 608255750Sdelphij if (isdump) { 609255750Sdelphij ASSERT3P(dvd, !=, NULL); 610255750Sdelphij return (ldi_dump(dvd->vd_lh, data, lbtodb(offset), 611255750Sdelphij lbtodb(size))); 612255750Sdelphij } 613255750Sdelphij 614255750Sdelphij return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); 615255750Sdelphij} 616255750Sdelphij 617255750Sdelphijint 618255750Sdelphijvdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, 619255750Sdelphij size_t size, uint64_t offset, int flags) 620255750Sdelphij{ 621185029Spjd buf_t *bp; 622185029Spjd int error = 0; 623185029Spjd 624185029Spjd if (vd_lh == NULL) 625249195Smm return (SET_ERROR(EINVAL)); 626185029Spjd 627185029Spjd ASSERT(flags & B_READ || flags & B_WRITE); 628185029Spjd 629185029Spjd bp = getrbuf(KM_SLEEP); 630185029Spjd bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; 631185029Spjd bp->b_bcount = size; 632185029Spjd bp->b_un.b_addr = (void *)data; 633185029Spjd bp->b_lblkno = lbtodb(offset); 634185029Spjd bp->b_bufsize = size; 635185029Spjd 636185029Spjd error = ldi_strategy(vd_lh, bp); 637185029Spjd ASSERT(error == 0); 638185029Spjd if ((error = biowait(bp)) == 0 && bp->b_resid != 0) 639249195Smm error = SET_ERROR(EIO); 640185029Spjd freerbuf(bp); 641185029Spjd 642185029Spjd return (error); 643185029Spjd} 644185029Spjd 645168404Spjdstatic void 646168404Spjdvdev_disk_io_intr(buf_t *bp) 647168404Spjd{ 648263393Sdelphij vdev_buf_t *vb = (vdev_buf_t *)bp; 649263393Sdelphij zio_t *zio = vb->vb_io; 650168404Spjd 651185029Spjd /* 652185029Spjd * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. 653185029Spjd * Rather than teach the rest of the stack about other error 654185029Spjd * possibilities (EFAULT, etc), we normalize the error value here. 655185029Spjd */ 656270312Ssmh zio->io_error = (geterror(bp) != 0 ? 
SET_ERROR(EIO) : 0); 657185029Spjd 658185029Spjd if (zio->io_error == 0 && bp->b_resid != 0) 659249195Smm zio->io_error = SET_ERROR(EIO); 660168404Spjd 661263393Sdelphij kmem_free(vb, sizeof (vdev_buf_t)); 662168404Spjd 663297108Smav zio_delay_interrupt(zio); 664168404Spjd} 665168404Spjd 666168404Spjdstatic void 667185029Spjdvdev_disk_ioctl_free(zio_t *zio) 668185029Spjd{ 669185029Spjd kmem_free(zio->io_vsd, sizeof (struct dk_callback)); 670185029Spjd} 671185029Spjd 672219089Spjdstatic const zio_vsd_ops_t vdev_disk_vsd_ops = { 673219089Spjd vdev_disk_ioctl_free, 674219089Spjd zio_vsd_default_cksum_report 675219089Spjd}; 676219089Spjd 677185029Spjdstatic void 678168404Spjdvdev_disk_ioctl_done(void *zio_arg, int error) 679168404Spjd{ 680168404Spjd zio_t *zio = zio_arg; 681168404Spjd 682168404Spjd zio->io_error = error; 683168404Spjd 684185029Spjd zio_interrupt(zio); 685168404Spjd} 686168404Spjd 687297078Smavstatic void 688168404Spjdvdev_disk_io_start(zio_t *zio) 689168404Spjd{ 690168404Spjd vdev_t *vd = zio->io_vd; 691168404Spjd vdev_disk_t *dvd = vd->vdev_tsd; 692263393Sdelphij vdev_buf_t *vb; 693185029Spjd struct dk_callback *dkc; 694168404Spjd buf_t *bp; 695185029Spjd int error; 696168404Spjd 697263395Sdelphij /* 698263395Sdelphij * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 699263395Sdelphij * Nothing to be done here but return failure. 
700263395Sdelphij */ 701263395Sdelphij if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) { 702270312Ssmh zio->io_error = SET_ERROR(ENXIO); 703270312Ssmh zio_interrupt(zio); 704297078Smav return; 705263395Sdelphij } 706263395Sdelphij 707168404Spjd if (zio->io_type == ZIO_TYPE_IOCTL) { 708168404Spjd /* XXPOLICY */ 709185029Spjd if (!vdev_readable(vd)) { 710249195Smm zio->io_error = SET_ERROR(ENXIO); 711270312Ssmh zio_interrupt(zio); 712297078Smav return; 713168404Spjd } 714168404Spjd 715168404Spjd switch (zio->io_cmd) { 716168404Spjd 717168404Spjd case DKIOCFLUSHWRITECACHE: 718168404Spjd 719168404Spjd if (zfs_nocacheflush) 720168404Spjd break; 721168404Spjd 722168404Spjd if (vd->vdev_nowritecache) { 723249195Smm zio->io_error = SET_ERROR(ENOTSUP); 724168404Spjd break; 725168404Spjd } 726168404Spjd 727185029Spjd zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); 728219089Spjd zio->io_vsd_ops = &vdev_disk_vsd_ops; 729168404Spjd 730185029Spjd dkc->dkc_callback = vdev_disk_ioctl_done; 731185029Spjd dkc->dkc_flag = FLUSH_VOLATILE; 732185029Spjd dkc->dkc_cookie = zio; 733185029Spjd 734168404Spjd error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, 735185029Spjd (uintptr_t)dkc, FKIOCTL, kcred, NULL); 736168404Spjd 737168404Spjd if (error == 0) { 738168404Spjd /* 739168404Spjd * The ioctl will be done asychronously, 740168404Spjd * and will call vdev_disk_ioctl_done() 741168404Spjd * upon completion. 742168404Spjd */ 743297078Smav return; 744185029Spjd } 745185029Spjd 746185029Spjd if (error == ENOTSUP || error == ENOTTY) { 747168404Spjd /* 748185029Spjd * If we get ENOTSUP or ENOTTY, we know that 749185029Spjd * no future attempts will ever succeed. 750185029Spjd * In this case we set a persistent bit so 751185029Spjd * that we don't bother with the ioctl in the 752185029Spjd * future. 
753168404Spjd */ 754168404Spjd vd->vdev_nowritecache = B_TRUE; 755168404Spjd } 756168404Spjd zio->io_error = error; 757168404Spjd 758168404Spjd break; 759168404Spjd 760168404Spjd default: 761249195Smm zio->io_error = SET_ERROR(ENOTSUP); 762168404Spjd } 763168404Spjd 764297078Smav zio_execute(zio); 765297078Smav return; 766168404Spjd } 767168404Spjd 768274800Ssmh ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 769297108Smav zio->io_target_timestamp = zio_handle_io_delay(zio); 770274800Ssmh 771263393Sdelphij vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); 772168404Spjd 773263393Sdelphij vb->vb_io = zio; 774263393Sdelphij bp = &vb->vb_buf; 775168404Spjd 776168404Spjd bioinit(bp); 777185029Spjd bp->b_flags = B_BUSY | B_NOCACHE | 778213198Smm (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); 779213198Smm if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) 780213198Smm bp->b_flags |= B_FAILFAST; 781168404Spjd bp->b_bcount = zio->io_size; 782168404Spjd bp->b_un.b_addr = zio->io_data; 783168404Spjd bp->b_lblkno = lbtodb(zio->io_offset); 784168404Spjd bp->b_bufsize = zio->io_size; 785168404Spjd bp->b_iodone = (int (*)())vdev_disk_io_intr; 786168404Spjd 787185029Spjd /* ldi_strategy() will return non-zero only on programming errors */ 788185029Spjd VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); 789168404Spjd} 790168404Spjd 791168404Spjdstatic void 792168404Spjdvdev_disk_io_done(zio_t *zio) 793168404Spjd{ 794185029Spjd vdev_t *vd = zio->io_vd; 795168404Spjd 796185029Spjd /* 797185029Spjd * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if 798185029Spjd * the device has been removed. If this is the case, then we trigger an 799185029Spjd * asynchronous removal of the device. Otherwise, probe the device and 800185029Spjd * make sure it's still accessible. 
801185029Spjd */ 802219089Spjd if (zio->io_error == EIO && !vd->vdev_remove_wanted) { 803185029Spjd vdev_disk_t *dvd = vd->vdev_tsd; 804185029Spjd int state = DKIO_NONE; 805168404Spjd 806185029Spjd if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, 807185029Spjd FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { 808219089Spjd /* 809219089Spjd * We post the resource as soon as possible, instead of 810219089Spjd * when the async removal actually happens, because the 811219089Spjd * DE is using this information to discard previous I/O 812219089Spjd * errors. 813219089Spjd */ 814219089Spjd zfs_post_remove(zio->io_spa, vd); 815185029Spjd vd->vdev_remove_wanted = B_TRUE; 816185029Spjd spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); 817219089Spjd } else if (!vd->vdev_delayed_close) { 818219089Spjd vd->vdev_delayed_close = B_TRUE; 819185029Spjd } 820185029Spjd } 821168404Spjd} 822168404Spjd 823168404Spjdvdev_ops_t vdev_disk_ops = { 824168404Spjd vdev_disk_open, 825168404Spjd vdev_disk_close, 826168404Spjd vdev_default_asize, 827168404Spjd vdev_disk_io_start, 828168404Spjd vdev_disk_io_done, 829168404Spjd NULL, 830219089Spjd vdev_disk_hold, 831219089Spjd vdev_disk_rele, 832168404Spjd VDEV_TYPE_DISK, /* name of this vdev type */ 833168404Spjd B_TRUE /* leaf vdev */ 834168404Spjd}; 835185029Spjd 836185029Spjd/* 837185029Spjd * Given the root disk device devid or pathname, read the label from 838185029Spjd * the device, and construct a configuration nvlist. 839185029Spjd */ 840185029Spjdint 841185029Spjdvdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) 842185029Spjd{ 843185029Spjd ldi_handle_t vd_lh; 844185029Spjd vdev_label_t *label; 845185029Spjd uint64_t s, size; 846185029Spjd int l; 847185029Spjd ddi_devid_t tmpdevid; 848185029Spjd int error = -1; 849185029Spjd char *minor_name; 850185029Spjd 851185029Spjd /* 852185029Spjd * Read the device label and build the nvlist. 
853185029Spjd */ 854185029Spjd if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, 855185029Spjd &minor_name) == 0) { 856185029Spjd error = ldi_open_by_devid(tmpdevid, minor_name, 857209962Smm FREAD, kcred, &vd_lh, zfs_li); 858185029Spjd ddi_devid_free(tmpdevid); 859185029Spjd ddi_devid_str_free(minor_name); 860185029Spjd } 861185029Spjd 862185029Spjd if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, 863185029Spjd zfs_li))) 864185029Spjd return (error); 865185029Spjd 866185029Spjd if (ldi_get_size(vd_lh, &s)) { 867185029Spjd (void) ldi_close(vd_lh, FREAD, kcred); 868249195Smm return (SET_ERROR(EIO)); 869185029Spjd } 870185029Spjd 871185029Spjd size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); 872185029Spjd label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); 873185029Spjd 874219089Spjd *config = NULL; 875185029Spjd for (l = 0; l < VDEV_LABELS; l++) { 876185029Spjd uint64_t offset, state, txg = 0; 877185029Spjd 878185029Spjd /* read vdev label */ 879185029Spjd offset = vdev_label_offset(size, l, 0); 880255750Sdelphij if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, 881209962Smm VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) 882185029Spjd continue; 883185029Spjd 884185029Spjd if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, 885185029Spjd sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { 886185029Spjd *config = NULL; 887185029Spjd continue; 888185029Spjd } 889185029Spjd 890185029Spjd if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, 891185029Spjd &state) != 0 || state >= POOL_STATE_DESTROYED) { 892185029Spjd nvlist_free(*config); 893185029Spjd *config = NULL; 894185029Spjd continue; 895185029Spjd } 896185029Spjd 897185029Spjd if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, 898185029Spjd &txg) != 0 || txg == 0) { 899185029Spjd nvlist_free(*config); 900185029Spjd *config = NULL; 901185029Spjd continue; 902185029Spjd } 903185029Spjd 904185029Spjd break; 905185029Spjd } 906185029Spjd 907185029Spjd 
kmem_free(label, sizeof (vdev_label_t)); 908185029Spjd (void) ldi_close(vd_lh, FREAD, kcred); 909219089Spjd if (*config == NULL) 910249195Smm error = SET_ERROR(EIDRM); 911185029Spjd 912185029Spjd return (error); 913185029Spjd} 914