vdev_disk.c revision 297078
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
25 * Copyright (c) 2013 Joyent, Inc.  All rights reserved.
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/spa_impl.h>
30#include <sys/refcount.h>
31#include <sys/vdev_disk.h>
32#include <sys/vdev_impl.h>
33#include <sys/fs/zfs.h>
34#include <sys/zio.h>
35#include <sys/sunldi.h>
36#include <sys/efi_partition.h>
37#include <sys/fm/fs/zfs.h>
38
39/*
40 * Virtual device vector for disks.
41 */
42
43extern ldi_ident_t zfs_li;
44
45static void vdev_disk_close(vdev_t *);
46
/*
 * Record of one LDI event callback registration.  Records are kept on the
 * vd_ldi_cbs list of a vdev_disk_t so that vdev_disk_free() can remove the
 * callbacks and release the records.
 */
typedef struct vdev_disk_ldi_cb {
	list_node_t		lcb_next;	/* linkage on vd_ldi_cbs */
	ldi_callback_id_t	lcb_id;		/* id set at registration */
} vdev_disk_ldi_cb_t;
51
52static void
53vdev_disk_alloc(vdev_t *vd)
54{
55	vdev_disk_t *dvd;
56
57	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
58	/*
59	 * Create the LDI event callback list.
60	 */
61	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
62	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
63}
64
65static void
66vdev_disk_free(vdev_t *vd)
67{
68	vdev_disk_t *dvd = vd->vdev_tsd;
69	vdev_disk_ldi_cb_t *lcb;
70
71	if (dvd == NULL)
72		return;
73
74	/*
75	 * We have already closed the LDI handle. Clean up the LDI event
76	 * callbacks and free vd->vdev_tsd.
77	 */
78	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
79		list_remove(&dvd->vd_ldi_cbs, lcb);
80		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
81		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
82	}
83	list_destroy(&dvd->vd_ldi_cbs);
84	kmem_free(dvd, sizeof (vdev_disk_t));
85	vd->vdev_tsd = NULL;
86}
87
88/* ARGSUSED */
89static int
90vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg,
91    void *ev_data)
92{
93	vdev_t *vd = (vdev_t *)arg;
94	vdev_disk_t *dvd = vd->vdev_tsd;
95
96	/*
97	 * Ignore events other than offline.
98	 */
99	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
100		return (LDI_EV_SUCCESS);
101
102	/*
103	 * All LDI handles must be closed for the state change to succeed, so
104	 * call on vdev_disk_close() to do this.
105	 *
106	 * We inform vdev_disk_close that it is being called from offline
107	 * notify context so it will defer cleanup of LDI event callbacks and
108	 * freeing of vd->vdev_tsd to the offline finalize or a reopen.
109	 */
110	dvd->vd_ldi_offline = B_TRUE;
111	vdev_disk_close(vd);
112
113	/*
114	 * Now that the device is closed, request that the spa_async_thread
115	 * mark the device as REMOVED and notify FMA of the removal.
116	 */
117	zfs_post_remove(vd->vdev_spa, vd);
118	vd->vdev_remove_wanted = B_TRUE;
119	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
120
121	return (LDI_EV_SUCCESS);
122}
123
124/* ARGSUSED */
125static void
126vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
127    int ldi_result, void *arg, void *ev_data)
128{
129	vdev_t *vd = (vdev_t *)arg;
130
131	/*
132	 * Ignore events other than offline.
133	 */
134	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
135		return;
136
137	/*
138	 * We have already closed the LDI handle in notify.
139	 * Clean up the LDI event callbacks and free vd->vdev_tsd.
140	 */
141	vdev_disk_free(vd);
142
143	/*
144	 * Request that the vdev be reopened if the offline state change was
145	 * unsuccessful.
146	 */
147	if (ldi_result != LDI_EV_SUCCESS) {
148		vd->vdev_probe_wanted = B_TRUE;
149		spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
150	}
151}
152
/*
 * Callback vector registered for LDI_EV_OFFLINE in vdev_disk_open().  It
 * supplies both a notify and a finalize handler.
 */
static ldi_ev_callback_t vdev_disk_off_callb = {
	.cb_vers = LDI_EV_CB_VERS,
	.cb_notify = vdev_disk_off_notify,
	.cb_finalize = vdev_disk_off_finalize
};
158
159/* ARGSUSED */
160static void
161vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
162    int ldi_result, void *arg, void *ev_data)
163{
164	vdev_t *vd = (vdev_t *)arg;
165
166	/*
167	 * Ignore events other than degrade.
168	 */
169	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
170		return;
171
172	/*
173	 * Degrade events always succeed. Mark the vdev as degraded.
174	 * This status is purely informative for the user.
175	 */
176	(void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
177}
178
/*
 * Callback vector registered for LDI_EV_DEGRADE in vdev_disk_open().  Only
 * a finalize handler is supplied; there is no notify handler.
 */
static ldi_ev_callback_t vdev_disk_dgrd_callb = {
	.cb_vers = LDI_EV_CB_VERS,
	.cb_notify = NULL,
	.cb_finalize = vdev_disk_dgrd_finalize
};
184
185static void
186vdev_disk_hold(vdev_t *vd)
187{
188	ddi_devid_t devid;
189	char *minor;
190
191	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
192
193	/*
194	 * We must have a pathname, and it must be absolute.
195	 */
196	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
197		return;
198
199	/*
200	 * Only prefetch path and devid info if the device has
201	 * never been opened.
202	 */
203	if (vd->vdev_tsd != NULL)
204		return;
205
206	if (vd->vdev_wholedisk == -1ULL) {
207		size_t len = strlen(vd->vdev_path) + 3;
208		char *buf = kmem_alloc(len, KM_SLEEP);
209
210		(void) snprintf(buf, len, "%ss0", vd->vdev_path);
211
212		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
213		kmem_free(buf, len);
214	}
215
216	if (vd->vdev_name_vp == NULL)
217		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
218
219	if (vd->vdev_devid != NULL &&
220	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
221		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
222		ddi_devid_str_free(minor);
223		ddi_devid_free(devid);
224	}
225}
226
227static void
228vdev_disk_rele(vdev_t *vd)
229{
230	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
231
232	if (vd->vdev_name_vp) {
233		VN_RELE_ASYNC(vd->vdev_name_vp,
234		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
235		vd->vdev_name_vp = NULL;
236	}
237	if (vd->vdev_devid_vp) {
238		VN_RELE_ASYNC(vd->vdev_devid_vp,
239		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
240		vd->vdev_devid_vp = NULL;
241	}
242}
243
244static uint64_t
245vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz)
246{
247	ASSERT(vd->vdev_wholedisk);
248
249	vdev_disk_t *dvd = vd->vdev_tsd;
250	dk_efi_t dk_ioc;
251	efi_gpt_t *efi;
252	uint64_t avail_space = 0;
253	int efisize = EFI_LABEL_SIZE * 2;
254
255	dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP);
256	dk_ioc.dki_lba = 1;
257	dk_ioc.dki_length = efisize;
258	dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data;
259	efi = dk_ioc.dki_data;
260
261	if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc,
262	    FKIOCTL, kcred, NULL) == 0) {
263		uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA);
264
265		if (capacity > efi_altern_lba)
266			avail_space = (capacity - efi_altern_lba) * blksz;
267	}
268	kmem_free(dk_ioc.dki_data, efisize);
269	return (avail_space);
270}
271
/*
 * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
 * even a fallback to DKIOCGMEDIAINFO fails.
 *
 * VDEV_DEBUG() takes printf-style arguments.  In DEBUG builds it forwards
 * them to cmn_err() at CE_NOTE level; otherwise it expands to nothing.
 */
#ifdef DEBUG
#define	VDEV_DEBUG(...)	cmn_err(CE_NOTE, __VA_ARGS__)
#else
#define	VDEV_DEBUG(...)	/* Nothing... */
#endif
281
/*
 * Open the disk backing 'vd' through LDI and report its geometry.
 *
 * On success returns 0 and fills in:
 *   *psize      - the size reported by ldi_get_size()
 *   *max_psize  - *psize, plus any extra space found by
 *                 vdev_disk_get_space() when the vdev is a whole disk
 *   *ashift     - derived from the physical block size (at least
 *                 SPA_MINBLOCKSIZE)
 *
 * On failure returns an errno value and records the reason in
 * vd->vdev_stat.vs_aux.
 *
 * If vd->vdev_tsd already exists and the handle is still open, the open
 * sequence is skipped and only the size information is refreshed.
 */
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	spa_t *spa = vd->vdev_spa;
	vdev_disk_t *dvd = vd->vdev_tsd;
	ldi_ev_cookie_t ecookie;
	vdev_disk_ldi_cb_t *lcb;
	/* One buffer serves both media-info ioctls; only one is used. */
	union {
		struct dk_minfo_ext ude;
		struct dk_minfo ud;
	} dks;
	struct dk_minfo_ext *dkmext = &dks.ude;
	struct dk_minfo *dkm = &dks.ud;
	int error;
	dev_t dev;
	int otyp;
	boolean_t validate_devid = B_FALSE;
	ddi_devid_t devid;
	uint64_t capacity = 0, blksz = 0, pbsize;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if (dvd != NULL) {
		if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) {
			/*
			 * If we are opening a device in its offline notify
			 * context, the LDI handle was just closed. Clean
			 * up the LDI event callbacks and free vd->vdev_tsd.
			 */
			vdev_disk_free(vd);
		} else {
			ASSERT(vd->vdev_reopening);
			goto skip_open;
		}
	}

	/*
	 * Create vd->vdev_tsd.
	 */
	vdev_disk_alloc(vd);
	dvd = vd->vdev_tsd;

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent.  We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device.  But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path.  For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved.  Try opening the device
	 *    by the devid instead.
	 */
	if (vd->vdev_devid != NULL) {
		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
		    &dvd->vd_minor) != 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (SET_ERROR(EINVAL));
		}
	}

	error = EINVAL;		/* presume failure */

	if (vd->vdev_path != NULL) {

		if (vd->vdev_wholedisk == -1ULL) {
			size_t len = strlen(vd->vdev_path) + 3;
			char *buf = kmem_alloc(len, KM_SLEEP);

			(void) snprintf(buf, len, "%ss0", vd->vdev_path);

			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
			    &dvd->vd_lh, zfs_li);
			if (error == 0) {
				/* Adopt the "s0" path; buf is now owned by vd. */
				spa_strfree(vd->vdev_path);
				vd->vdev_path = buf;
				vd->vdev_wholedisk = 1ULL;
			} else {
				kmem_free(buf, len);
			}
		}

		/*
		 * If we have not yet opened the device, try to open it by the
		 * specified path.
		 */
		if (error != 0) {
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
		}

		/*
		 * Compare the devid to the stored value.
		 */
		if (error == 0 && vd->vdev_devid != NULL &&
		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
				error = SET_ERROR(EINVAL);
				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
				    kcred);
				dvd->vd_lh = NULL;
			}
			ddi_devid_free(devid);
		}

		/*
		 * If we succeeded in opening the device, but 'vdev_wholedisk'
		 * is not yet set, then this must be a slice.
		 */
		if (error == 0 && vd->vdev_wholedisk == -1ULL)
			vd->vdev_wholedisk = 0;
	}

	/*
	 * If we were unable to open by path, or the devid check fails, open by
	 * devid instead.
	 */
	if (error != 0 && vd->vdev_devid != NULL) {
		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
	}

	/*
	 * If all else fails, then try opening by physical path (if available)
	 * or the logical path (if we failed due to the devid check).  While not
	 * as reliable as the devid, this will give us something, and the higher
	 * level vdev validation will prevent us from opening the wrong device.
	 */
	if (error) {
		if (vd->vdev_devid != NULL)
			validate_devid = B_TRUE;

		if (vd->vdev_physpath != NULL &&
		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);

		/*
		 * Note that we don't support the legacy auto-wholedisk support
		 * as above.  This hasn't been used in a very long time and we
		 * don't need to propagate its oddities to this edge condition.
		 */
		if (error && vd->vdev_path != NULL)
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
	}

	if (error) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}

	/*
	 * Now that the device has been successfully opened, update the devid
	 * if necessary.
	 */
	if (validate_devid && spa_writeable(spa) &&
	    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
		if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
			char *vd_devid;

			vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor);
			zfs_dbgmsg("vdev %s: update devid from %s, "
			    "to %s", vd->vdev_path, vd->vdev_devid, vd_devid);
			spa_strfree(vd->vdev_devid);
			vd->vdev_devid = spa_strdup(vd_devid);
			ddi_devid_str_free(vd_devid);
		}
		ddi_devid_free(devid);
	}

	/*
	 * Once a device is opened, verify that the physical device path (if
	 * available) is up to date.
	 */
	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
		char *physpath, *minorname;

		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		minorname = NULL;
		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
		    (vd->vdev_physpath == NULL ||
		    strcmp(vd->vdev_physpath, physpath) != 0)) {
			if (vd->vdev_physpath)
				spa_strfree(vd->vdev_physpath);
			/* Stored form is "<physpath>:<minorname>". */
			(void) strlcat(physpath, ":", MAXPATHLEN);
			(void) strlcat(physpath, minorname, MAXPATHLEN);
			vd->vdev_physpath = spa_strdup(physpath);
		}
		if (minorname)
			kmem_free(minorname, strlen(minorname) + 1);
		kmem_free(physpath, MAXPATHLEN);
	}

	/*
	 * Register callbacks for the LDI offline event.
	 */
	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
	    LDI_EV_SUCCESS) {
		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
		    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
	}

	/*
	 * Register callbacks for the LDI degrade event.
	 */
	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
	    LDI_EV_SUCCESS) {
		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
		    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
	}
	/*
	 * Everything from here on runs both after a fresh open and when the
	 * open sequence was skipped for an existing vd->vdev_tsd.
	 */
skip_open:
	/*
	 * Determine the actual size of the device.
	 */
	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (SET_ERROR(EINVAL));
	}

	*max_psize = *psize;

	/*
	 * Determine the device's minimum transfer size.
	 * If the ioctl isn't supported, assume DEV_BSIZE.
	 */
	if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
	    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
		capacity = dkmext->dki_capacity - 1;
		blksz = dkmext->dki_lbsize;
		pbsize = dkmext->dki_pbsize;
	} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
	    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
		VDEV_DEBUG(
		    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
		    vd->vdev_path);
		capacity = dkm->dki_capacity - 1;
		blksz = dkm->dki_lbsize;
		/* No physical block size here; use the logical one. */
		pbsize = blksz;
	} else {
		VDEV_DEBUG("vdev_disk_open(\"%s\"): "
		    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
		    vd->vdev_path, error);
		pbsize = DEV_BSIZE;
	}

	*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;

	if (vd->vdev_wholedisk == 1) {
		int wce = 1;

		/* 'error' still holds the result of the media-info ioctls. */
		if (error == 0) {
			/*
			 * If we have the capability to expand, we'd have
			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
			 * Adjust max_psize upward accordingly since we know
			 * we own the whole disk now.
			 */
			*max_psize += vdev_disk_get_space(vd, capacity, blksz);
			zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
			    "max_psize %llu", vd->vdev_path, *psize,
			    *max_psize);
		}

		/*
		 * Since we own the whole disk, try to enable disk write
		 * caching.  We ignore errors because it's OK if we can't do it.
		 */
		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
		    FKIOCTL, kcred, NULL);
	}

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	return (0);
}
584
585static void
586vdev_disk_close(vdev_t *vd)
587{
588	vdev_disk_t *dvd = vd->vdev_tsd;
589
590	if (vd->vdev_reopening || dvd == NULL)
591		return;
592
593	if (dvd->vd_minor != NULL) {
594		ddi_devid_str_free(dvd->vd_minor);
595		dvd->vd_minor = NULL;
596	}
597
598	if (dvd->vd_devid != NULL) {
599		ddi_devid_free(dvd->vd_devid);
600		dvd->vd_devid = NULL;
601	}
602
603	if (dvd->vd_lh != NULL) {
604		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
605		dvd->vd_lh = NULL;
606	}
607
608	vd->vdev_delayed_close = B_FALSE;
609	/*
610	 * If we closed the LDI handle due to an offline notify from LDI,
611	 * don't free vd->vdev_tsd or unregister the callbacks here;
612	 * the offline finalize callback or a reopen will take care of it.
613	 */
614	if (dvd->vd_ldi_offline)
615		return;
616
617	vdev_disk_free(vd);
618}
619
620int
621vdev_disk_physio(vdev_t *vd, caddr_t data,
622    size_t size, uint64_t offset, int flags, boolean_t isdump)
623{
624	vdev_disk_t *dvd = vd->vdev_tsd;
625
626	/*
627	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
628	 * Nothing to be done here but return failure.
629	 */
630	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
631		return (EIO);
632
633	ASSERT(vd->vdev_ops == &vdev_disk_ops);
634
635	/*
636	 * If in the context of an active crash dump, use the ldi_dump(9F)
637	 * call instead of ldi_strategy(9F) as usual.
638	 */
639	if (isdump) {
640		ASSERT3P(dvd, !=, NULL);
641		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
642		    lbtodb(size)));
643	}
644
645	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
646}
647
648int
649vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
650    size_t size, uint64_t offset, int flags)
651{
652	buf_t *bp;
653	int error = 0;
654
655	if (vd_lh == NULL)
656		return (SET_ERROR(EINVAL));
657
658	ASSERT(flags & B_READ || flags & B_WRITE);
659
660	bp = getrbuf(KM_SLEEP);
661	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
662	bp->b_bcount = size;
663	bp->b_un.b_addr = (void *)data;
664	bp->b_lblkno = lbtodb(offset);
665	bp->b_bufsize = size;
666
667	error = ldi_strategy(vd_lh, bp);
668	ASSERT(error == 0);
669	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
670		error = SET_ERROR(EIO);
671	freerbuf(bp);
672
673	return (error);
674}
675
676static void
677vdev_disk_io_intr(buf_t *bp)
678{
679	vdev_buf_t *vb = (vdev_buf_t *)bp;
680	zio_t *zio = vb->vb_io;
681
682	/*
683	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
684	 * Rather than teach the rest of the stack about other error
685	 * possibilities (EFAULT, etc), we normalize the error value here.
686	 */
687	zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
688
689	if (zio->io_error == 0 && bp->b_resid != 0)
690		zio->io_error = SET_ERROR(EIO);
691
692	kmem_free(vb, sizeof (vdev_buf_t));
693
694	zio_interrupt(zio);
695}
696
697static void
698vdev_disk_ioctl_free(zio_t *zio)
699{
700	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
701}
702
/*
 * vsd operations installed on cache-flush ioctl zios by
 * vdev_disk_io_start(): vdev_disk_ioctl_free() releases the dk_callback
 * held in io_vsd, and checksum reports use the default handler.
 */
static const zio_vsd_ops_t vdev_disk_vsd_ops = {
	vdev_disk_ioctl_free,
	zio_vsd_default_cksum_report
};
707
708static void
709vdev_disk_ioctl_done(void *zio_arg, int error)
710{
711	zio_t *zio = zio_arg;
712
713	zio->io_error = error;
714
715	zio_interrupt(zio);
716}
717
718static void
719vdev_disk_io_start(zio_t *zio)
720{
721	vdev_t *vd = zio->io_vd;
722	vdev_disk_t *dvd = vd->vdev_tsd;
723	vdev_buf_t *vb;
724	struct dk_callback *dkc;
725	buf_t *bp;
726	int error;
727
728	/*
729	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
730	 * Nothing to be done here but return failure.
731	 */
732	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
733		zio->io_error = SET_ERROR(ENXIO);
734		zio_interrupt(zio);
735		return;
736	}
737
738	if (zio->io_type == ZIO_TYPE_IOCTL) {
739		/* XXPOLICY */
740		if (!vdev_readable(vd)) {
741			zio->io_error = SET_ERROR(ENXIO);
742			zio_interrupt(zio);
743			return;
744		}
745
746		switch (zio->io_cmd) {
747
748		case DKIOCFLUSHWRITECACHE:
749
750			if (zfs_nocacheflush)
751				break;
752
753			if (vd->vdev_nowritecache) {
754				zio->io_error = SET_ERROR(ENOTSUP);
755				break;
756			}
757
758			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
759			zio->io_vsd_ops = &vdev_disk_vsd_ops;
760
761			dkc->dkc_callback = vdev_disk_ioctl_done;
762			dkc->dkc_flag = FLUSH_VOLATILE;
763			dkc->dkc_cookie = zio;
764
765			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
766			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);
767
768			if (error == 0) {
769				/*
770				 * The ioctl will be done asychronously,
771				 * and will call vdev_disk_ioctl_done()
772				 * upon completion.
773				 */
774				return;
775			}
776
777			if (error == ENOTSUP || error == ENOTTY) {
778				/*
779				 * If we get ENOTSUP or ENOTTY, we know that
780				 * no future attempts will ever succeed.
781				 * In this case we set a persistent bit so
782				 * that we don't bother with the ioctl in the
783				 * future.
784				 */
785				vd->vdev_nowritecache = B_TRUE;
786			}
787			zio->io_error = error;
788
789			break;
790
791		default:
792			zio->io_error = SET_ERROR(ENOTSUP);
793		}
794
795		zio_execute(zio);
796		return;
797	}
798
799	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
800
801	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
802
803	vb->vb_io = zio;
804	bp = &vb->vb_buf;
805
806	bioinit(bp);
807	bp->b_flags = B_BUSY | B_NOCACHE |
808	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
809	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
810		bp->b_flags |= B_FAILFAST;
811	bp->b_bcount = zio->io_size;
812	bp->b_un.b_addr = zio->io_data;
813	bp->b_lblkno = lbtodb(zio->io_offset);
814	bp->b_bufsize = zio->io_size;
815	bp->b_iodone = (int (*)())vdev_disk_io_intr;
816
817	/* ldi_strategy() will return non-zero only on programming errors */
818	VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
819}
820
821static void
822vdev_disk_io_done(zio_t *zio)
823{
824	vdev_t *vd = zio->io_vd;
825
826	/*
827	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
828	 * the device has been removed.  If this is the case, then we trigger an
829	 * asynchronous removal of the device. Otherwise, probe the device and
830	 * make sure it's still accessible.
831	 */
832	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
833		vdev_disk_t *dvd = vd->vdev_tsd;
834		int state = DKIO_NONE;
835
836		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
837		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
838			/*
839			 * We post the resource as soon as possible, instead of
840			 * when the async removal actually happens, because the
841			 * DE is using this information to discard previous I/O
842			 * errors.
843			 */
844			zfs_post_remove(zio->io_spa, vd);
845			vd->vdev_remove_wanted = B_TRUE;
846			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
847		} else if (!vd->vdev_delayed_close) {
848			vd->vdev_delayed_close = B_TRUE;
849		}
850	}
851}
852
/*
 * Operations vector for disk vdevs.  Entries are positional; the slot
 * layout is defined by vdev_ops_t, which is declared outside this file.
 */
vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,			/* no handler supplied for this slot */
	vdev_disk_hold,
	vdev_disk_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
865
866/*
867 * Given the root disk device devid or pathname, read the label from
868 * the device, and construct a configuration nvlist.
869 */
870int
871vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
872{
873	ldi_handle_t vd_lh;
874	vdev_label_t *label;
875	uint64_t s, size;
876	int l;
877	ddi_devid_t tmpdevid;
878	int error = -1;
879	char *minor_name;
880
881	/*
882	 * Read the device label and build the nvlist.
883	 */
884	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
885	    &minor_name) == 0) {
886		error = ldi_open_by_devid(tmpdevid, minor_name,
887		    FREAD, kcred, &vd_lh, zfs_li);
888		ddi_devid_free(tmpdevid);
889		ddi_devid_str_free(minor_name);
890	}
891
892	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
893	    zfs_li)))
894		return (error);
895
896	if (ldi_get_size(vd_lh, &s)) {
897		(void) ldi_close(vd_lh, FREAD, kcred);
898		return (SET_ERROR(EIO));
899	}
900
901	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
902	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
903
904	*config = NULL;
905	for (l = 0; l < VDEV_LABELS; l++) {
906		uint64_t offset, state, txg = 0;
907
908		/* read vdev label */
909		offset = vdev_label_offset(size, l, 0);
910		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
911		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
912			continue;
913
914		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
915		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
916			*config = NULL;
917			continue;
918		}
919
920		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
921		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
922			nvlist_free(*config);
923			*config = NULL;
924			continue;
925		}
926
927		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
928		    &txg) != 0 || txg == 0) {
929			nvlist_free(*config);
930			*config = NULL;
931			continue;
932		}
933
934		break;
935	}
936
937	kmem_free(label, sizeof (vdev_label_t));
938	(void) ldi_close(vd_lh, FREAD, kcred);
939	if (*config == NULL)
940		error = SET_ERROR(EIDRM);
941
942	return (error);
943}
944