1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23297108Smav * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24254012Sdelphij * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
25263395Sdelphij * Copyright (c) 2013 Joyent, Inc.  All rights reserved.
26168404Spjd */
27168404Spjd
28168404Spjd#include <sys/zfs_context.h>
29219089Spjd#include <sys/spa_impl.h>
30185029Spjd#include <sys/refcount.h>
31168404Spjd#include <sys/vdev_disk.h>
32168404Spjd#include <sys/vdev_impl.h>
33168404Spjd#include <sys/fs/zfs.h>
34168404Spjd#include <sys/zio.h>
35168404Spjd#include <sys/sunldi.h>
36236155Smm#include <sys/efi_partition.h>
37185029Spjd#include <sys/fm/fs/zfs.h>
38168404Spjd
39168404Spjd/*
40168404Spjd * Virtual device vector for disks.
41168404Spjd */
42168404Spjd
/* LDI identity passed to every ldi_open_by_*() call in this file. */
extern ldi_ident_t zfs_li;

/*
 * Forward declaration: vdev_disk_off_notify() calls vdev_disk_close()
 * before its definition appears.
 */
static void vdev_disk_close(vdev_t *);

/*
 * One registered LDI event callback.  Nodes are kept on the vd_ldi_cbs
 * list of the owning vdev_disk_t and released by vdev_disk_free().
 */
typedef struct vdev_disk_ldi_cb {
	list_node_t		lcb_next;	/* linkage on vd_ldi_cbs */
	ldi_callback_id_t	lcb_id;		/* id from registration */
} vdev_disk_ldi_cb_t;
51263395Sdelphij
52219089Spjdstatic void
53263395Sdelphijvdev_disk_alloc(vdev_t *vd)
54263395Sdelphij{
55263395Sdelphij	vdev_disk_t *dvd;
56263395Sdelphij
57263395Sdelphij	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
58263395Sdelphij	/*
59263395Sdelphij	 * Create the LDI event callback list.
60263395Sdelphij	 */
61263395Sdelphij	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
62263395Sdelphij	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
63263395Sdelphij}
64263395Sdelphij
65263395Sdelphijstatic void
66263395Sdelphijvdev_disk_free(vdev_t *vd)
67263395Sdelphij{
68263395Sdelphij	vdev_disk_t *dvd = vd->vdev_tsd;
69263395Sdelphij	vdev_disk_ldi_cb_t *lcb;
70263395Sdelphij
71263395Sdelphij	if (dvd == NULL)
72263395Sdelphij		return;
73263395Sdelphij
74263395Sdelphij	/*
75263395Sdelphij	 * We have already closed the LDI handle. Clean up the LDI event
76263395Sdelphij	 * callbacks and free vd->vdev_tsd.
77263395Sdelphij	 */
78263395Sdelphij	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
79263395Sdelphij		list_remove(&dvd->vd_ldi_cbs, lcb);
80263395Sdelphij		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
81263395Sdelphij		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
82263395Sdelphij	}
83263395Sdelphij	list_destroy(&dvd->vd_ldi_cbs);
84263395Sdelphij	kmem_free(dvd, sizeof (vdev_disk_t));
85263395Sdelphij	vd->vdev_tsd = NULL;
86263395Sdelphij}
87263395Sdelphij
88263395Sdelphij/* ARGSUSED */
89263395Sdelphijstatic int
90263395Sdelphijvdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg,
91263395Sdelphij    void *ev_data)
92263395Sdelphij{
93263395Sdelphij	vdev_t *vd = (vdev_t *)arg;
94263395Sdelphij	vdev_disk_t *dvd = vd->vdev_tsd;
95263395Sdelphij
96263395Sdelphij	/*
97263395Sdelphij	 * Ignore events other than offline.
98263395Sdelphij	 */
99263395Sdelphij	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
100263395Sdelphij		return (LDI_EV_SUCCESS);
101263395Sdelphij
102263395Sdelphij	/*
103263395Sdelphij	 * All LDI handles must be closed for the state change to succeed, so
104263395Sdelphij	 * call on vdev_disk_close() to do this.
105263395Sdelphij	 *
106263395Sdelphij	 * We inform vdev_disk_close that it is being called from offline
107263395Sdelphij	 * notify context so it will defer cleanup of LDI event callbacks and
108263395Sdelphij	 * freeing of vd->vdev_tsd to the offline finalize or a reopen.
109263395Sdelphij	 */
110263395Sdelphij	dvd->vd_ldi_offline = B_TRUE;
111263395Sdelphij	vdev_disk_close(vd);
112263395Sdelphij
113263395Sdelphij	/*
114263395Sdelphij	 * Now that the device is closed, request that the spa_async_thread
115263395Sdelphij	 * mark the device as REMOVED and notify FMA of the removal.
116263395Sdelphij	 */
117263395Sdelphij	zfs_post_remove(vd->vdev_spa, vd);
118263395Sdelphij	vd->vdev_remove_wanted = B_TRUE;
119263395Sdelphij	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
120263395Sdelphij
121263395Sdelphij	return (LDI_EV_SUCCESS);
122263395Sdelphij}
123263395Sdelphij
124263395Sdelphij/* ARGSUSED */
125263395Sdelphijstatic void
126263395Sdelphijvdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
127263395Sdelphij    int ldi_result, void *arg, void *ev_data)
128263395Sdelphij{
129263395Sdelphij	vdev_t *vd = (vdev_t *)arg;
130263395Sdelphij
131263395Sdelphij	/*
132263395Sdelphij	 * Ignore events other than offline.
133263395Sdelphij	 */
134263395Sdelphij	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
135263395Sdelphij		return;
136263395Sdelphij
137263395Sdelphij	/*
138263395Sdelphij	 * We have already closed the LDI handle in notify.
139263395Sdelphij	 * Clean up the LDI event callbacks and free vd->vdev_tsd.
140263395Sdelphij	 */
141263395Sdelphij	vdev_disk_free(vd);
142263395Sdelphij
143263395Sdelphij	/*
144263395Sdelphij	 * Request that the vdev be reopened if the offline state change was
145263395Sdelphij	 * unsuccessful.
146263395Sdelphij	 */
147263395Sdelphij	if (ldi_result != LDI_EV_SUCCESS) {
148263395Sdelphij		vd->vdev_probe_wanted = B_TRUE;
149263395Sdelphij		spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
150263395Sdelphij	}
151263395Sdelphij}
152263395Sdelphij
153263395Sdelphijstatic ldi_ev_callback_t vdev_disk_off_callb = {
154263395Sdelphij	.cb_vers = LDI_EV_CB_VERS,
155263395Sdelphij	.cb_notify = vdev_disk_off_notify,
156263395Sdelphij	.cb_finalize = vdev_disk_off_finalize
157263395Sdelphij};
158263395Sdelphij
159263395Sdelphij/* ARGSUSED */
160263395Sdelphijstatic void
161263395Sdelphijvdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
162263395Sdelphij    int ldi_result, void *arg, void *ev_data)
163263395Sdelphij{
164263395Sdelphij	vdev_t *vd = (vdev_t *)arg;
165263395Sdelphij
166263395Sdelphij	/*
167263395Sdelphij	 * Ignore events other than degrade.
168263395Sdelphij	 */
169263395Sdelphij	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
170263395Sdelphij		return;
171263395Sdelphij
172263395Sdelphij	/*
173263395Sdelphij	 * Degrade events always succeed. Mark the vdev as degraded.
174263395Sdelphij	 * This status is purely informative for the user.
175263395Sdelphij	 */
176263395Sdelphij	(void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
177263395Sdelphij}
178263395Sdelphij
179263395Sdelphijstatic ldi_ev_callback_t vdev_disk_dgrd_callb = {
180263395Sdelphij	.cb_vers = LDI_EV_CB_VERS,
181263395Sdelphij	.cb_notify = NULL,
182263395Sdelphij	.cb_finalize = vdev_disk_dgrd_finalize
183263395Sdelphij};
184263395Sdelphij
185263395Sdelphijstatic void
186219089Spjdvdev_disk_hold(vdev_t *vd)
187219089Spjd{
188219089Spjd	ddi_devid_t devid;
189219089Spjd	char *minor;
190219089Spjd
191219089Spjd	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
192219089Spjd
193219089Spjd	/*
194219089Spjd	 * We must have a pathname, and it must be absolute.
195219089Spjd	 */
196219089Spjd	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
197219089Spjd		return;
198219089Spjd
199219089Spjd	/*
200219089Spjd	 * Only prefetch path and devid info if the device has
201219089Spjd	 * never been opened.
202219089Spjd	 */
203219089Spjd	if (vd->vdev_tsd != NULL)
204219089Spjd		return;
205219089Spjd
206219089Spjd	if (vd->vdev_wholedisk == -1ULL) {
207219089Spjd		size_t len = strlen(vd->vdev_path) + 3;
208219089Spjd		char *buf = kmem_alloc(len, KM_SLEEP);
209219089Spjd
210219089Spjd		(void) snprintf(buf, len, "%ss0", vd->vdev_path);
211219089Spjd
212219089Spjd		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
213219089Spjd		kmem_free(buf, len);
214219089Spjd	}
215219089Spjd
216219089Spjd	if (vd->vdev_name_vp == NULL)
217219089Spjd		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
218219089Spjd
219219089Spjd	if (vd->vdev_devid != NULL &&
220219089Spjd	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
221219089Spjd		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
222219089Spjd		ddi_devid_str_free(minor);
223219089Spjd		ddi_devid_free(devid);
224219089Spjd	}
225219089Spjd}
226219089Spjd
227219089Spjdstatic void
228219089Spjdvdev_disk_rele(vdev_t *vd)
229219089Spjd{
230219089Spjd	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
231219089Spjd
232219089Spjd	if (vd->vdev_name_vp) {
233219089Spjd		VN_RELE_ASYNC(vd->vdev_name_vp,
234219089Spjd		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
235219089Spjd		vd->vdev_name_vp = NULL;
236219089Spjd	}
237219089Spjd	if (vd->vdev_devid_vp) {
238219089Spjd		VN_RELE_ASYNC(vd->vdev_devid_vp,
239219089Spjd		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
240219089Spjd		vd->vdev_devid_vp = NULL;
241219089Spjd	}
242219089Spjd}
243219089Spjd
/*
 * VDEV_DEBUG() reports, on DEBUG kernels only, when DKIOCGMEDIAINFOEXT fails
 * or when even the fallback to DKIOCGMEDIAINFO fails.  On non-DEBUG kernels
 * it expands to nothing.
 */
#ifndef DEBUG
#define	VDEV_DEBUG(...)	/* Nothing... */
#else
#define	VDEV_DEBUG(...)	cmn_err(CE_NOTE, __VA_ARGS__)
#endif
253254012Sdelphij
254168404Spjdstatic int
255236155Smmvdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
256236155Smm    uint64_t *ashift)
257168404Spjd{
258209962Smm	spa_t *spa = vd->vdev_spa;
259263395Sdelphij	vdev_disk_t *dvd = vd->vdev_tsd;
260263395Sdelphij	ldi_ev_cookie_t ecookie;
261263395Sdelphij	vdev_disk_ldi_cb_t *lcb;
262254012Sdelphij	union {
263254012Sdelphij		struct dk_minfo_ext ude;
264254012Sdelphij		struct dk_minfo ud;
265254012Sdelphij	} dks;
266254012Sdelphij	struct dk_minfo_ext *dkmext = &dks.ude;
267254012Sdelphij	struct dk_minfo *dkm = &dks.ud;
268168404Spjd	int error;
269185029Spjd	dev_t dev;
270185029Spjd	int otyp;
271249209Smm	boolean_t validate_devid = B_FALSE;
272249209Smm	ddi_devid_t devid;
273254012Sdelphij	uint64_t capacity = 0, blksz = 0, pbsize;
274168404Spjd
275168404Spjd	/*
276168404Spjd	 * We must have a pathname, and it must be absolute.
277168404Spjd	 */
278168404Spjd	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
279168404Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
280249195Smm		return (SET_ERROR(EINVAL));
281168404Spjd	}
282168404Spjd
283219089Spjd	/*
284219089Spjd	 * Reopen the device if it's not currently open. Otherwise,
285219089Spjd	 * just update the physical size of the device.
286219089Spjd	 */
287263395Sdelphij	if (dvd != NULL) {
288263395Sdelphij		if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) {
289263395Sdelphij			/*
290263395Sdelphij			 * If we are opening a device in its offline notify
291263395Sdelphij			 * context, the LDI handle was just closed. Clean
292263395Sdelphij			 * up the LDI event callbacks and free vd->vdev_tsd.
293263395Sdelphij			 */
294263395Sdelphij			vdev_disk_free(vd);
295263395Sdelphij		} else {
296263395Sdelphij			ASSERT(vd->vdev_reopening);
297263395Sdelphij			goto skip_open;
298263395Sdelphij		}
299219089Spjd	}
300219089Spjd
301263395Sdelphij	/*
302263395Sdelphij	 * Create vd->vdev_tsd.
303263395Sdelphij	 */
304263395Sdelphij	vdev_disk_alloc(vd);
305263395Sdelphij	dvd = vd->vdev_tsd;
306168404Spjd
307168404Spjd	/*
308168404Spjd	 * When opening a disk device, we want to preserve the user's original
309168404Spjd	 * intent.  We always want to open the device by the path the user gave
310168404Spjd	 * us, even if it is one of multiple paths to the save device.  But we
311168404Spjd	 * also want to be able to survive disks being removed/recabled.
312168404Spjd	 * Therefore the sequence of opening devices is:
313168404Spjd	 *
314168404Spjd	 * 1. Try opening the device by path.  For legacy pools without the
315168404Spjd	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
316168404Spjd	 *
317168404Spjd	 * 2. If the devid of the device matches the stored value, return
318168404Spjd	 *    success.
319168404Spjd	 *
320168404Spjd	 * 3. Otherwise, the device may have moved.  Try opening the device
321168404Spjd	 *    by the devid instead.
322168404Spjd	 */
323168404Spjd	if (vd->vdev_devid != NULL) {
324168404Spjd		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
325168404Spjd		    &dvd->vd_minor) != 0) {
326168404Spjd			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
327249195Smm			return (SET_ERROR(EINVAL));
328168404Spjd		}
329168404Spjd	}
330168404Spjd
331168404Spjd	error = EINVAL;		/* presume failure */
332168404Spjd
333219089Spjd	if (vd->vdev_path != NULL) {
334168404Spjd
335168404Spjd		if (vd->vdev_wholedisk == -1ULL) {
336168404Spjd			size_t len = strlen(vd->vdev_path) + 3;
337168404Spjd			char *buf = kmem_alloc(len, KM_SLEEP);
338168404Spjd
339168404Spjd			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
340168404Spjd
341263395Sdelphij			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
342263395Sdelphij			    &dvd->vd_lh, zfs_li);
343263395Sdelphij			if (error == 0) {
344168404Spjd				spa_strfree(vd->vdev_path);
345168404Spjd				vd->vdev_path = buf;
346168404Spjd				vd->vdev_wholedisk = 1ULL;
347168404Spjd			} else {
348168404Spjd				kmem_free(buf, len);
349168404Spjd			}
350168404Spjd		}
351168404Spjd
352263395Sdelphij		/*
353263395Sdelphij		 * If we have not yet opened the device, try to open it by the
354263395Sdelphij		 * specified path.
355263395Sdelphij		 */
356263395Sdelphij		if (error != 0) {
357263395Sdelphij			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
358263395Sdelphij			    kcred, &dvd->vd_lh, zfs_li);
359263395Sdelphij		}
360168404Spjd
361168404Spjd		/*
362168404Spjd		 * Compare the devid to the stored value.
363168404Spjd		 */
364168404Spjd		if (error == 0 && vd->vdev_devid != NULL &&
365168404Spjd		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
366168404Spjd			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
367249195Smm				error = SET_ERROR(EINVAL);
368209962Smm				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
369209962Smm				    kcred);
370168404Spjd				dvd->vd_lh = NULL;
371168404Spjd			}
372168404Spjd			ddi_devid_free(devid);
373168404Spjd		}
374168404Spjd
375168404Spjd		/*
376168404Spjd		 * If we succeeded in opening the device, but 'vdev_wholedisk'
377168404Spjd		 * is not yet set, then this must be a slice.
378168404Spjd		 */
379168404Spjd		if (error == 0 && vd->vdev_wholedisk == -1ULL)
380168404Spjd			vd->vdev_wholedisk = 0;
381168404Spjd	}
382168404Spjd
383168404Spjd	/*
384168404Spjd	 * If we were unable to open by path, or the devid check fails, open by
385168404Spjd	 * devid instead.
386168404Spjd	 */
387249209Smm	if (error != 0 && vd->vdev_devid != NULL) {
388168404Spjd		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
389209962Smm		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
390249209Smm	}
391168404Spjd
392185029Spjd	/*
393185029Spjd	 * If all else fails, then try opening by physical path (if available)
394185029Spjd	 * or the logical path (if we failed due to the devid check).  While not
395185029Spjd	 * as reliable as the devid, this will give us something, and the higher
396185029Spjd	 * level vdev validation will prevent us from opening the wrong device.
397185029Spjd	 */
398168404Spjd	if (error) {
399249209Smm		if (vd->vdev_devid != NULL)
400249209Smm			validate_devid = B_TRUE;
401249209Smm
402185029Spjd		if (vd->vdev_physpath != NULL &&
403209962Smm		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
404209962Smm			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
405185029Spjd			    kcred, &dvd->vd_lh, zfs_li);
406185029Spjd
407185029Spjd		/*
408185029Spjd		 * Note that we don't support the legacy auto-wholedisk support
409185029Spjd		 * as above.  This hasn't been used in a very long time and we
410185029Spjd		 * don't need to propagate its oddities to this edge condition.
411185029Spjd		 */
412219089Spjd		if (error && vd->vdev_path != NULL)
413209962Smm			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
414209962Smm			    kcred, &dvd->vd_lh, zfs_li);
415185029Spjd	}
416185029Spjd
417185029Spjd	if (error) {
418168404Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
419168404Spjd		return (error);
420168404Spjd	}
421168404Spjd
422168404Spjd	/*
423249209Smm	 * Now that the device has been successfully opened, update the devid
424249209Smm	 * if necessary.
425249209Smm	 */
426249209Smm	if (validate_devid && spa_writeable(spa) &&
427249209Smm	    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
428249209Smm		if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
429249209Smm			char *vd_devid;
430249209Smm
431249209Smm			vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor);
432249209Smm			zfs_dbgmsg("vdev %s: update devid from %s, "
433249209Smm			    "to %s", vd->vdev_path, vd->vdev_devid, vd_devid);
434249209Smm			spa_strfree(vd->vdev_devid);
435249209Smm			vd->vdev_devid = spa_strdup(vd_devid);
436249209Smm			ddi_devid_str_free(vd_devid);
437249209Smm		}
438249209Smm		ddi_devid_free(devid);
439249209Smm	}
440249209Smm
441249209Smm	/*
442185029Spjd	 * Once a device is opened, verify that the physical device path (if
443185029Spjd	 * available) is up to date.
444185029Spjd	 */
445185029Spjd	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
446185029Spjd	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
447185029Spjd		char *physpath, *minorname;
448185029Spjd
449185029Spjd		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
450185029Spjd		minorname = NULL;
451185029Spjd		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
452185029Spjd		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
453185029Spjd		    (vd->vdev_physpath == NULL ||
454185029Spjd		    strcmp(vd->vdev_physpath, physpath) != 0)) {
455185029Spjd			if (vd->vdev_physpath)
456185029Spjd				spa_strfree(vd->vdev_physpath);
457185029Spjd			(void) strlcat(physpath, ":", MAXPATHLEN);
458185029Spjd			(void) strlcat(physpath, minorname, MAXPATHLEN);
459185029Spjd			vd->vdev_physpath = spa_strdup(physpath);
460185029Spjd		}
461185029Spjd		if (minorname)
462185029Spjd			kmem_free(minorname, strlen(minorname) + 1);
463185029Spjd		kmem_free(physpath, MAXPATHLEN);
464185029Spjd	}
465185029Spjd
466263395Sdelphij	/*
467263395Sdelphij	 * Register callbacks for the LDI offline event.
468263395Sdelphij	 */
469263395Sdelphij	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
470263395Sdelphij	    LDI_EV_SUCCESS) {
471263395Sdelphij		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
472263395Sdelphij		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
473263395Sdelphij		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
474263395Sdelphij		    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
475263395Sdelphij	}
476263395Sdelphij
477263395Sdelphij	/*
478263395Sdelphij	 * Register callbacks for the LDI degrade event.
479263395Sdelphij	 */
480263395Sdelphij	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
481263395Sdelphij	    LDI_EV_SUCCESS) {
482263395Sdelphij		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
483263395Sdelphij		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
484263395Sdelphij		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
485263395Sdelphij		    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
486263395Sdelphij	}
487219089Spjdskip_open:
488185029Spjd	/*
489168404Spjd	 * Determine the actual size of the device.
490168404Spjd	 */
491168404Spjd	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
492168404Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
493249195Smm		return (SET_ERROR(EINVAL));
494168404Spjd	}
495168404Spjd
496254012Sdelphij	*max_psize = *psize;
497254012Sdelphij
498168404Spjd	/*
499168404Spjd	 * Determine the device's minimum transfer size.
500168404Spjd	 * If the ioctl isn't supported, assume DEV_BSIZE.
501168404Spjd	 */
502254012Sdelphij	if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
503254012Sdelphij	    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
504254012Sdelphij		capacity = dkmext->dki_capacity - 1;
505254012Sdelphij		blksz = dkmext->dki_lbsize;
506254012Sdelphij		pbsize = dkmext->dki_pbsize;
507254012Sdelphij	} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
508254012Sdelphij	    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
509254012Sdelphij		VDEV_DEBUG(
510254012Sdelphij		    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
511254012Sdelphij		    vd->vdev_path);
512254012Sdelphij		capacity = dkm->dki_capacity - 1;
513254012Sdelphij		blksz = dkm->dki_lbsize;
514254012Sdelphij		pbsize = blksz;
515254012Sdelphij	} else {
516254012Sdelphij		VDEV_DEBUG("vdev_disk_open(\"%s\"): "
517254012Sdelphij		    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
518254012Sdelphij		    vd->vdev_path, error);
519254012Sdelphij		pbsize = DEV_BSIZE;
520254012Sdelphij	}
521168404Spjd
522265740Sdelphij	*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
523168404Spjd
524236155Smm	if (vd->vdev_wholedisk == 1) {
525236155Smm		int wce = 1;
526236155Smm
527254012Sdelphij		if (error == 0) {
528254012Sdelphij			/*
529254012Sdelphij			 * If we have the capability to expand, we'd have
530254012Sdelphij			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
531254012Sdelphij			 * Adjust max_psize upward accordingly since we know
532254012Sdelphij			 * we own the whole disk now.
533254012Sdelphij			 */
534307268Smav			*max_psize = capacity * blksz;
535254012Sdelphij		}
536254012Sdelphij
537236155Smm		/*
538254012Sdelphij		 * Since we own the whole disk, try to enable disk write
539254012Sdelphij		 * caching.  We ignore errors because it's OK if we can't do it.
540236155Smm		 */
541236155Smm		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
542236155Smm		    FKIOCTL, kcred, NULL);
543236155Smm	}
544236155Smm
545168404Spjd	/*
546168404Spjd	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
547168404Spjd	 * try again.
548168404Spjd	 */
549168404Spjd	vd->vdev_nowritecache = B_FALSE;
550168404Spjd
551168404Spjd	return (0);
552168404Spjd}
553168404Spjd
554168404Spjdstatic void
555168404Spjdvdev_disk_close(vdev_t *vd)
556168404Spjd{
557168404Spjd	vdev_disk_t *dvd = vd->vdev_tsd;
558168404Spjd
559219089Spjd	if (vd->vdev_reopening || dvd == NULL)
560168404Spjd		return;
561168404Spjd
562263395Sdelphij	if (dvd->vd_minor != NULL) {
563168404Spjd		ddi_devid_str_free(dvd->vd_minor);
564263395Sdelphij		dvd->vd_minor = NULL;
565263395Sdelphij	}
566168404Spjd
567263395Sdelphij	if (dvd->vd_devid != NULL) {
568168404Spjd		ddi_devid_free(dvd->vd_devid);
569263395Sdelphij		dvd->vd_devid = NULL;
570263395Sdelphij	}
571168404Spjd
572263395Sdelphij	if (dvd->vd_lh != NULL) {
573209962Smm		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
574263395Sdelphij		dvd->vd_lh = NULL;
575263395Sdelphij	}
576168404Spjd
577219089Spjd	vd->vdev_delayed_close = B_FALSE;
578263395Sdelphij	/*
579263395Sdelphij	 * If we closed the LDI handle due to an offline notify from LDI,
580263395Sdelphij	 * don't free vd->vdev_tsd or unregister the callbacks here;
581263395Sdelphij	 * the offline finalize callback or a reopen will take care of it.
582263395Sdelphij	 */
583263395Sdelphij	if (dvd->vd_ldi_offline)
584263395Sdelphij		return;
585263395Sdelphij
586263395Sdelphij	vdev_disk_free(vd);
587168404Spjd}
588168404Spjd
589185029Spjdint
590255750Sdelphijvdev_disk_physio(vdev_t *vd, caddr_t data,
591255750Sdelphij    size_t size, uint64_t offset, int flags, boolean_t isdump)
592185029Spjd{
593255750Sdelphij	vdev_disk_t *dvd = vd->vdev_tsd;
594255750Sdelphij
595263395Sdelphij	/*
596263395Sdelphij	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
597263395Sdelphij	 * Nothing to be done here but return failure.
598263395Sdelphij	 */
599263395Sdelphij	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
600263395Sdelphij		return (EIO);
601263395Sdelphij
602255750Sdelphij	ASSERT(vd->vdev_ops == &vdev_disk_ops);
603255750Sdelphij
604255750Sdelphij	/*
605255750Sdelphij	 * If in the context of an active crash dump, use the ldi_dump(9F)
606255750Sdelphij	 * call instead of ldi_strategy(9F) as usual.
607255750Sdelphij	 */
608255750Sdelphij	if (isdump) {
609255750Sdelphij		ASSERT3P(dvd, !=, NULL);
610255750Sdelphij		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
611255750Sdelphij		    lbtodb(size)));
612255750Sdelphij	}
613255750Sdelphij
614255750Sdelphij	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
615255750Sdelphij}
616255750Sdelphij
617255750Sdelphijint
618255750Sdelphijvdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
619255750Sdelphij    size_t size, uint64_t offset, int flags)
620255750Sdelphij{
621185029Spjd	buf_t *bp;
622185029Spjd	int error = 0;
623185029Spjd
624185029Spjd	if (vd_lh == NULL)
625249195Smm		return (SET_ERROR(EINVAL));
626185029Spjd
627185029Spjd	ASSERT(flags & B_READ || flags & B_WRITE);
628185029Spjd
629185029Spjd	bp = getrbuf(KM_SLEEP);
630185029Spjd	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
631185029Spjd	bp->b_bcount = size;
632185029Spjd	bp->b_un.b_addr = (void *)data;
633185029Spjd	bp->b_lblkno = lbtodb(offset);
634185029Spjd	bp->b_bufsize = size;
635185029Spjd
636185029Spjd	error = ldi_strategy(vd_lh, bp);
637185029Spjd	ASSERT(error == 0);
638185029Spjd	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
639249195Smm		error = SET_ERROR(EIO);
640185029Spjd	freerbuf(bp);
641185029Spjd
642185029Spjd	return (error);
643185029Spjd}
644185029Spjd
645168404Spjdstatic void
646168404Spjdvdev_disk_io_intr(buf_t *bp)
647168404Spjd{
648263393Sdelphij	vdev_buf_t *vb = (vdev_buf_t *)bp;
649263393Sdelphij	zio_t *zio = vb->vb_io;
650168404Spjd
651185029Spjd	/*
652185029Spjd	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
653185029Spjd	 * Rather than teach the rest of the stack about other error
654185029Spjd	 * possibilities (EFAULT, etc), we normalize the error value here.
655185029Spjd	 */
656270312Ssmh	zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
657185029Spjd
658185029Spjd	if (zio->io_error == 0 && bp->b_resid != 0)
659249195Smm		zio->io_error = SET_ERROR(EIO);
660168404Spjd
661263393Sdelphij	kmem_free(vb, sizeof (vdev_buf_t));
662168404Spjd
663297108Smav	zio_delay_interrupt(zio);
664168404Spjd}
665168404Spjd
666168404Spjdstatic void
667185029Spjdvdev_disk_ioctl_free(zio_t *zio)
668185029Spjd{
669185029Spjd	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
670185029Spjd}
671185029Spjd
/*
 * VSD ops installed on cache-flush ioctl zios by vdev_disk_io_start().
 * The first entry frees the dk_callback kept in io_vsd; the second is the
 * stock zio checksum-report routine.
 */
static const zio_vsd_ops_t vdev_disk_vsd_ops = {
	vdev_disk_ioctl_free,
	zio_vsd_default_cksum_report
};
676219089Spjd
677185029Spjdstatic void
678168404Spjdvdev_disk_ioctl_done(void *zio_arg, int error)
679168404Spjd{
680168404Spjd	zio_t *zio = zio_arg;
681168404Spjd
682168404Spjd	zio->io_error = error;
683168404Spjd
684185029Spjd	zio_interrupt(zio);
685168404Spjd}
686168404Spjd
687297078Smavstatic void
688168404Spjdvdev_disk_io_start(zio_t *zio)
689168404Spjd{
690168404Spjd	vdev_t *vd = zio->io_vd;
691168404Spjd	vdev_disk_t *dvd = vd->vdev_tsd;
692263393Sdelphij	vdev_buf_t *vb;
693185029Spjd	struct dk_callback *dkc;
694168404Spjd	buf_t *bp;
695185029Spjd	int error;
696168404Spjd
697263395Sdelphij	/*
698263395Sdelphij	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
699263395Sdelphij	 * Nothing to be done here but return failure.
700263395Sdelphij	 */
701263395Sdelphij	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
702270312Ssmh		zio->io_error = SET_ERROR(ENXIO);
703270312Ssmh		zio_interrupt(zio);
704297078Smav		return;
705263395Sdelphij	}
706263395Sdelphij
707168404Spjd	if (zio->io_type == ZIO_TYPE_IOCTL) {
708168404Spjd		/* XXPOLICY */
709185029Spjd		if (!vdev_readable(vd)) {
710249195Smm			zio->io_error = SET_ERROR(ENXIO);
711270312Ssmh			zio_interrupt(zio);
712297078Smav			return;
713168404Spjd		}
714168404Spjd
715168404Spjd		switch (zio->io_cmd) {
716168404Spjd
717168404Spjd		case DKIOCFLUSHWRITECACHE:
718168404Spjd
719168404Spjd			if (zfs_nocacheflush)
720168404Spjd				break;
721168404Spjd
722168404Spjd			if (vd->vdev_nowritecache) {
723249195Smm				zio->io_error = SET_ERROR(ENOTSUP);
724168404Spjd				break;
725168404Spjd			}
726168404Spjd
727185029Spjd			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
728219089Spjd			zio->io_vsd_ops = &vdev_disk_vsd_ops;
729168404Spjd
730185029Spjd			dkc->dkc_callback = vdev_disk_ioctl_done;
731185029Spjd			dkc->dkc_flag = FLUSH_VOLATILE;
732185029Spjd			dkc->dkc_cookie = zio;
733185029Spjd
734168404Spjd			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
735185029Spjd			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);
736168404Spjd
737168404Spjd			if (error == 0) {
738168404Spjd				/*
739168404Spjd				 * The ioctl will be done asychronously,
740168404Spjd				 * and will call vdev_disk_ioctl_done()
741168404Spjd				 * upon completion.
742168404Spjd				 */
743297078Smav				return;
744185029Spjd			}
745185029Spjd
746185029Spjd			if (error == ENOTSUP || error == ENOTTY) {
747168404Spjd				/*
748185029Spjd				 * If we get ENOTSUP or ENOTTY, we know that
749185029Spjd				 * no future attempts will ever succeed.
750185029Spjd				 * In this case we set a persistent bit so
751185029Spjd				 * that we don't bother with the ioctl in the
752185029Spjd				 * future.
753168404Spjd				 */
754168404Spjd				vd->vdev_nowritecache = B_TRUE;
755168404Spjd			}
756168404Spjd			zio->io_error = error;
757168404Spjd
758168404Spjd			break;
759168404Spjd
760168404Spjd		default:
761249195Smm			zio->io_error = SET_ERROR(ENOTSUP);
762168404Spjd		}
763168404Spjd
764297078Smav		zio_execute(zio);
765297078Smav		return;
766168404Spjd	}
767168404Spjd
768274800Ssmh	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
769297108Smav	zio->io_target_timestamp = zio_handle_io_delay(zio);
770274800Ssmh
771263393Sdelphij	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
772168404Spjd
773263393Sdelphij	vb->vb_io = zio;
774263393Sdelphij	bp = &vb->vb_buf;
775168404Spjd
776168404Spjd	bioinit(bp);
777185029Spjd	bp->b_flags = B_BUSY | B_NOCACHE |
778213198Smm	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
779213198Smm	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
780213198Smm		bp->b_flags |= B_FAILFAST;
781168404Spjd	bp->b_bcount = zio->io_size;
782168404Spjd	bp->b_un.b_addr = zio->io_data;
783168404Spjd	bp->b_lblkno = lbtodb(zio->io_offset);
784168404Spjd	bp->b_bufsize = zio->io_size;
785168404Spjd	bp->b_iodone = (int (*)())vdev_disk_io_intr;
786168404Spjd
787185029Spjd	/* ldi_strategy() will return non-zero only on programming errors */
788185029Spjd	VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
789168404Spjd}
790168404Spjd
791168404Spjdstatic void
792168404Spjdvdev_disk_io_done(zio_t *zio)
793168404Spjd{
794185029Spjd	vdev_t *vd = zio->io_vd;
795168404Spjd
796185029Spjd	/*
797185029Spjd	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
798185029Spjd	 * the device has been removed.  If this is the case, then we trigger an
799185029Spjd	 * asynchronous removal of the device. Otherwise, probe the device and
800185029Spjd	 * make sure it's still accessible.
801185029Spjd	 */
802219089Spjd	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
803185029Spjd		vdev_disk_t *dvd = vd->vdev_tsd;
804185029Spjd		int state = DKIO_NONE;
805168404Spjd
806185029Spjd		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
807185029Spjd		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
808219089Spjd			/*
809219089Spjd			 * We post the resource as soon as possible, instead of
810219089Spjd			 * when the async removal actually happens, because the
811219089Spjd			 * DE is using this information to discard previous I/O
812219089Spjd			 * errors.
813219089Spjd			 */
814219089Spjd			zfs_post_remove(zio->io_spa, vd);
815185029Spjd			vd->vdev_remove_wanted = B_TRUE;
816185029Spjd			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
817219089Spjd		} else if (!vd->vdev_delayed_close) {
818219089Spjd			vd->vdev_delayed_close = B_TRUE;
819185029Spjd		}
820185029Spjd	}
821168404Spjd}
822168404Spjd
/*
 * Operations vector for disk vdevs.  The initializer is positional, so the
 * order of the entries must match the member order of vdev_ops_t (declared
 * outside this file).  NOTE(review): the per-slot roles noted below are
 * inferred from the callback names -- confirm against the vdev_ops_t
 * declaration.
 */
823168404Spjdvdev_ops_t vdev_disk_ops = {
824168404Spjd	vdev_disk_open,		/* open callback */
825168404Spjd	vdev_disk_close,	/* close callback */
826168404Spjd	vdev_default_asize,	/* generic (non disk-specific) asize routine */
827168404Spjd	vdev_disk_io_start,	/* I/O start callback */
828168404Spjd	vdev_disk_io_done,	/* I/O done callback */
829168404Spjd	NULL,			/* slot left empty -- presumably state change; TODO confirm */
830219089Spjd	vdev_disk_hold,		/* hold callback */
831219089Spjd	vdev_disk_rele,		/* release callback */
832168404Spjd	VDEV_TYPE_DISK,		/* name of this vdev type */
833168404Spjd	B_TRUE			/* leaf vdev */
834168404Spjd};
835185029Spjd
836185029Spjd/*
837185029Spjd * Given the root disk device devid or pathname, read the label from
838185029Spjd * the device, and construct a configuration nvlist.
839185029Spjd */
840185029Spjdint
841185029Spjdvdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
842185029Spjd{
843185029Spjd	ldi_handle_t vd_lh;
844185029Spjd	vdev_label_t *label;
845185029Spjd	uint64_t s, size;
846185029Spjd	int l;
847185029Spjd	ddi_devid_t tmpdevid;
848185029Spjd	int error = -1;
849185029Spjd	char *minor_name;
850185029Spjd
851185029Spjd	/*
852185029Spjd	 * Read the device label and build the nvlist.
853185029Spjd	 */
854185029Spjd	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
855185029Spjd	    &minor_name) == 0) {
856185029Spjd		error = ldi_open_by_devid(tmpdevid, minor_name,
857209962Smm		    FREAD, kcred, &vd_lh, zfs_li);
858185029Spjd		ddi_devid_free(tmpdevid);
859185029Spjd		ddi_devid_str_free(minor_name);
860185029Spjd	}
861185029Spjd
862185029Spjd	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
863185029Spjd	    zfs_li)))
864185029Spjd		return (error);
865185029Spjd
866185029Spjd	if (ldi_get_size(vd_lh, &s)) {
867185029Spjd		(void) ldi_close(vd_lh, FREAD, kcred);
868249195Smm		return (SET_ERROR(EIO));
869185029Spjd	}
870185029Spjd
871185029Spjd	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
872185029Spjd	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
873185029Spjd
874219089Spjd	*config = NULL;
875185029Spjd	for (l = 0; l < VDEV_LABELS; l++) {
876185029Spjd		uint64_t offset, state, txg = 0;
877185029Spjd
878185029Spjd		/* read vdev label */
879185029Spjd		offset = vdev_label_offset(size, l, 0);
880255750Sdelphij		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
881209962Smm		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
882185029Spjd			continue;
883185029Spjd
884185029Spjd		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
885185029Spjd		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
886185029Spjd			*config = NULL;
887185029Spjd			continue;
888185029Spjd		}
889185029Spjd
890185029Spjd		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
891185029Spjd		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
892185029Spjd			nvlist_free(*config);
893185029Spjd			*config = NULL;
894185029Spjd			continue;
895185029Spjd		}
896185029Spjd
897185029Spjd		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
898185029Spjd		    &txg) != 0 || txg == 0) {
899185029Spjd			nvlist_free(*config);
900185029Spjd			*config = NULL;
901185029Spjd			continue;
902185029Spjd		}
903185029Spjd
904185029Spjd		break;
905185029Spjd	}
906185029Spjd
907185029Spjd	kmem_free(label, sizeof (vdev_label_t));
908185029Spjd	(void) ldi_close(vd_lh, FREAD, kcred);
909219089Spjd	if (*config == NULL)
910249195Smm		error = SET_ERROR(EIDRM);
911185029Spjd
912185029Spjd	return (error);
913185029Spjd}
914