vdev_disk.c revision 307268
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
25 * Copyright (c) 2013 Joyent, Inc.  All rights reserved.
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/spa_impl.h>
30#include <sys/refcount.h>
31#include <sys/vdev_disk.h>
32#include <sys/vdev_impl.h>
33#include <sys/fs/zfs.h>
34#include <sys/zio.h>
35#include <sys/sunldi.h>
36#include <sys/efi_partition.h>
37#include <sys/fm/fs/zfs.h>
38
39/*
40 * Virtual device vector for disks.
41 */
42
43extern ldi_ident_t zfs_li;
44
45static void vdev_disk_close(vdev_t *);
46
47typedef struct vdev_disk_ldi_cb {
48	list_node_t		lcb_next;
49	ldi_callback_id_t	lcb_id;
50} vdev_disk_ldi_cb_t;
51
52static void
53vdev_disk_alloc(vdev_t *vd)
54{
55	vdev_disk_t *dvd;
56
57	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
58	/*
59	 * Create the LDI event callback list.
60	 */
61	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
62	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
63}
64
65static void
66vdev_disk_free(vdev_t *vd)
67{
68	vdev_disk_t *dvd = vd->vdev_tsd;
69	vdev_disk_ldi_cb_t *lcb;
70
71	if (dvd == NULL)
72		return;
73
74	/*
75	 * We have already closed the LDI handle. Clean up the LDI event
76	 * callbacks and free vd->vdev_tsd.
77	 */
78	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
79		list_remove(&dvd->vd_ldi_cbs, lcb);
80		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
81		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
82	}
83	list_destroy(&dvd->vd_ldi_cbs);
84	kmem_free(dvd, sizeof (vdev_disk_t));
85	vd->vdev_tsd = NULL;
86}
87
88/* ARGSUSED */
89static int
90vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg,
91    void *ev_data)
92{
93	vdev_t *vd = (vdev_t *)arg;
94	vdev_disk_t *dvd = vd->vdev_tsd;
95
96	/*
97	 * Ignore events other than offline.
98	 */
99	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
100		return (LDI_EV_SUCCESS);
101
102	/*
103	 * All LDI handles must be closed for the state change to succeed, so
104	 * call on vdev_disk_close() to do this.
105	 *
106	 * We inform vdev_disk_close that it is being called from offline
107	 * notify context so it will defer cleanup of LDI event callbacks and
108	 * freeing of vd->vdev_tsd to the offline finalize or a reopen.
109	 */
110	dvd->vd_ldi_offline = B_TRUE;
111	vdev_disk_close(vd);
112
113	/*
114	 * Now that the device is closed, request that the spa_async_thread
115	 * mark the device as REMOVED and notify FMA of the removal.
116	 */
117	zfs_post_remove(vd->vdev_spa, vd);
118	vd->vdev_remove_wanted = B_TRUE;
119	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
120
121	return (LDI_EV_SUCCESS);
122}
123
124/* ARGSUSED */
125static void
126vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
127    int ldi_result, void *arg, void *ev_data)
128{
129	vdev_t *vd = (vdev_t *)arg;
130
131	/*
132	 * Ignore events other than offline.
133	 */
134	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
135		return;
136
137	/*
138	 * We have already closed the LDI handle in notify.
139	 * Clean up the LDI event callbacks and free vd->vdev_tsd.
140	 */
141	vdev_disk_free(vd);
142
143	/*
144	 * Request that the vdev be reopened if the offline state change was
145	 * unsuccessful.
146	 */
147	if (ldi_result != LDI_EV_SUCCESS) {
148		vd->vdev_probe_wanted = B_TRUE;
149		spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
150	}
151}
152
153static ldi_ev_callback_t vdev_disk_off_callb = {
154	.cb_vers = LDI_EV_CB_VERS,
155	.cb_notify = vdev_disk_off_notify,
156	.cb_finalize = vdev_disk_off_finalize
157};
158
159/* ARGSUSED */
160static void
161vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
162    int ldi_result, void *arg, void *ev_data)
163{
164	vdev_t *vd = (vdev_t *)arg;
165
166	/*
167	 * Ignore events other than degrade.
168	 */
169	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
170		return;
171
172	/*
173	 * Degrade events always succeed. Mark the vdev as degraded.
174	 * This status is purely informative for the user.
175	 */
176	(void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
177}
178
179static ldi_ev_callback_t vdev_disk_dgrd_callb = {
180	.cb_vers = LDI_EV_CB_VERS,
181	.cb_notify = NULL,
182	.cb_finalize = vdev_disk_dgrd_finalize
183};
184
185static void
186vdev_disk_hold(vdev_t *vd)
187{
188	ddi_devid_t devid;
189	char *minor;
190
191	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
192
193	/*
194	 * We must have a pathname, and it must be absolute.
195	 */
196	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
197		return;
198
199	/*
200	 * Only prefetch path and devid info if the device has
201	 * never been opened.
202	 */
203	if (vd->vdev_tsd != NULL)
204		return;
205
206	if (vd->vdev_wholedisk == -1ULL) {
207		size_t len = strlen(vd->vdev_path) + 3;
208		char *buf = kmem_alloc(len, KM_SLEEP);
209
210		(void) snprintf(buf, len, "%ss0", vd->vdev_path);
211
212		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
213		kmem_free(buf, len);
214	}
215
216	if (vd->vdev_name_vp == NULL)
217		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
218
219	if (vd->vdev_devid != NULL &&
220	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
221		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
222		ddi_devid_str_free(minor);
223		ddi_devid_free(devid);
224	}
225}
226
227static void
228vdev_disk_rele(vdev_t *vd)
229{
230	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
231
232	if (vd->vdev_name_vp) {
233		VN_RELE_ASYNC(vd->vdev_name_vp,
234		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
235		vd->vdev_name_vp = NULL;
236	}
237	if (vd->vdev_devid_vp) {
238		VN_RELE_ASYNC(vd->vdev_devid_vp,
239		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
240		vd->vdev_devid_vp = NULL;
241	}
242}
243
/*
 * Diagnostic helper used when DKIOCGMEDIAINFOEXT fails, or when even the
 * fallback to DKIOCGMEDIAINFO fails.  DEBUG kernels emit a CE_NOTE message;
 * otherwise the macro expands to nothing and its arguments are not
 * evaluated.
 */
#if defined(DEBUG)
#define	VDEV_DEBUG(...)	cmn_err(CE_NOTE, __VA_ARGS__)
#else	/* !DEBUG */
#define	VDEV_DEBUG(...)	/* Nothing... */
#endif	/* DEBUG */
253
254static int
255vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
256    uint64_t *ashift)
257{
258	spa_t *spa = vd->vdev_spa;
259	vdev_disk_t *dvd = vd->vdev_tsd;
260	ldi_ev_cookie_t ecookie;
261	vdev_disk_ldi_cb_t *lcb;
262	union {
263		struct dk_minfo_ext ude;
264		struct dk_minfo ud;
265	} dks;
266	struct dk_minfo_ext *dkmext = &dks.ude;
267	struct dk_minfo *dkm = &dks.ud;
268	int error;
269	dev_t dev;
270	int otyp;
271	boolean_t validate_devid = B_FALSE;
272	ddi_devid_t devid;
273	uint64_t capacity = 0, blksz = 0, pbsize;
274
275	/*
276	 * We must have a pathname, and it must be absolute.
277	 */
278	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
279		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
280		return (SET_ERROR(EINVAL));
281	}
282
283	/*
284	 * Reopen the device if it's not currently open. Otherwise,
285	 * just update the physical size of the device.
286	 */
287	if (dvd != NULL) {
288		if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) {
289			/*
290			 * If we are opening a device in its offline notify
291			 * context, the LDI handle was just closed. Clean
292			 * up the LDI event callbacks and free vd->vdev_tsd.
293			 */
294			vdev_disk_free(vd);
295		} else {
296			ASSERT(vd->vdev_reopening);
297			goto skip_open;
298		}
299	}
300
301	/*
302	 * Create vd->vdev_tsd.
303	 */
304	vdev_disk_alloc(vd);
305	dvd = vd->vdev_tsd;
306
307	/*
308	 * When opening a disk device, we want to preserve the user's original
309	 * intent.  We always want to open the device by the path the user gave
310	 * us, even if it is one of multiple paths to the save device.  But we
311	 * also want to be able to survive disks being removed/recabled.
312	 * Therefore the sequence of opening devices is:
313	 *
314	 * 1. Try opening the device by path.  For legacy pools without the
315	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
316	 *
317	 * 2. If the devid of the device matches the stored value, return
318	 *    success.
319	 *
320	 * 3. Otherwise, the device may have moved.  Try opening the device
321	 *    by the devid instead.
322	 */
323	if (vd->vdev_devid != NULL) {
324		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
325		    &dvd->vd_minor) != 0) {
326			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
327			return (SET_ERROR(EINVAL));
328		}
329	}
330
331	error = EINVAL;		/* presume failure */
332
333	if (vd->vdev_path != NULL) {
334
335		if (vd->vdev_wholedisk == -1ULL) {
336			size_t len = strlen(vd->vdev_path) + 3;
337			char *buf = kmem_alloc(len, KM_SLEEP);
338
339			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
340
341			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
342			    &dvd->vd_lh, zfs_li);
343			if (error == 0) {
344				spa_strfree(vd->vdev_path);
345				vd->vdev_path = buf;
346				vd->vdev_wholedisk = 1ULL;
347			} else {
348				kmem_free(buf, len);
349			}
350		}
351
352		/*
353		 * If we have not yet opened the device, try to open it by the
354		 * specified path.
355		 */
356		if (error != 0) {
357			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
358			    kcred, &dvd->vd_lh, zfs_li);
359		}
360
361		/*
362		 * Compare the devid to the stored value.
363		 */
364		if (error == 0 && vd->vdev_devid != NULL &&
365		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
366			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
367				error = SET_ERROR(EINVAL);
368				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
369				    kcred);
370				dvd->vd_lh = NULL;
371			}
372			ddi_devid_free(devid);
373		}
374
375		/*
376		 * If we succeeded in opening the device, but 'vdev_wholedisk'
377		 * is not yet set, then this must be a slice.
378		 */
379		if (error == 0 && vd->vdev_wholedisk == -1ULL)
380			vd->vdev_wholedisk = 0;
381	}
382
383	/*
384	 * If we were unable to open by path, or the devid check fails, open by
385	 * devid instead.
386	 */
387	if (error != 0 && vd->vdev_devid != NULL) {
388		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
389		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
390	}
391
392	/*
393	 * If all else fails, then try opening by physical path (if available)
394	 * or the logical path (if we failed due to the devid check).  While not
395	 * as reliable as the devid, this will give us something, and the higher
396	 * level vdev validation will prevent us from opening the wrong device.
397	 */
398	if (error) {
399		if (vd->vdev_devid != NULL)
400			validate_devid = B_TRUE;
401
402		if (vd->vdev_physpath != NULL &&
403		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
404			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
405			    kcred, &dvd->vd_lh, zfs_li);
406
407		/*
408		 * Note that we don't support the legacy auto-wholedisk support
409		 * as above.  This hasn't been used in a very long time and we
410		 * don't need to propagate its oddities to this edge condition.
411		 */
412		if (error && vd->vdev_path != NULL)
413			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
414			    kcred, &dvd->vd_lh, zfs_li);
415	}
416
417	if (error) {
418		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
419		return (error);
420	}
421
422	/*
423	 * Now that the device has been successfully opened, update the devid
424	 * if necessary.
425	 */
426	if (validate_devid && spa_writeable(spa) &&
427	    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
428		if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
429			char *vd_devid;
430
431			vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor);
432			zfs_dbgmsg("vdev %s: update devid from %s, "
433			    "to %s", vd->vdev_path, vd->vdev_devid, vd_devid);
434			spa_strfree(vd->vdev_devid);
435			vd->vdev_devid = spa_strdup(vd_devid);
436			ddi_devid_str_free(vd_devid);
437		}
438		ddi_devid_free(devid);
439	}
440
441	/*
442	 * Once a device is opened, verify that the physical device path (if
443	 * available) is up to date.
444	 */
445	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
446	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
447		char *physpath, *minorname;
448
449		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
450		minorname = NULL;
451		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
452		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
453		    (vd->vdev_physpath == NULL ||
454		    strcmp(vd->vdev_physpath, physpath) != 0)) {
455			if (vd->vdev_physpath)
456				spa_strfree(vd->vdev_physpath);
457			(void) strlcat(physpath, ":", MAXPATHLEN);
458			(void) strlcat(physpath, minorname, MAXPATHLEN);
459			vd->vdev_physpath = spa_strdup(physpath);
460		}
461		if (minorname)
462			kmem_free(minorname, strlen(minorname) + 1);
463		kmem_free(physpath, MAXPATHLEN);
464	}
465
466	/*
467	 * Register callbacks for the LDI offline event.
468	 */
469	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
470	    LDI_EV_SUCCESS) {
471		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
472		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
473		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
474		    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
475	}
476
477	/*
478	 * Register callbacks for the LDI degrade event.
479	 */
480	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
481	    LDI_EV_SUCCESS) {
482		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
483		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
484		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
485		    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
486	}
487skip_open:
488	/*
489	 * Determine the actual size of the device.
490	 */
491	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
492		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
493		return (SET_ERROR(EINVAL));
494	}
495
496	*max_psize = *psize;
497
498	/*
499	 * Determine the device's minimum transfer size.
500	 * If the ioctl isn't supported, assume DEV_BSIZE.
501	 */
502	if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
503	    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
504		capacity = dkmext->dki_capacity - 1;
505		blksz = dkmext->dki_lbsize;
506		pbsize = dkmext->dki_pbsize;
507	} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
508	    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
509		VDEV_DEBUG(
510		    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
511		    vd->vdev_path);
512		capacity = dkm->dki_capacity - 1;
513		blksz = dkm->dki_lbsize;
514		pbsize = blksz;
515	} else {
516		VDEV_DEBUG("vdev_disk_open(\"%s\"): "
517		    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
518		    vd->vdev_path, error);
519		pbsize = DEV_BSIZE;
520	}
521
522	*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
523
524	if (vd->vdev_wholedisk == 1) {
525		int wce = 1;
526
527		if (error == 0) {
528			/*
529			 * If we have the capability to expand, we'd have
530			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
531			 * Adjust max_psize upward accordingly since we know
532			 * we own the whole disk now.
533			 */
534			*max_psize = capacity * blksz;
535		}
536
537		/*
538		 * Since we own the whole disk, try to enable disk write
539		 * caching.  We ignore errors because it's OK if we can't do it.
540		 */
541		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
542		    FKIOCTL, kcred, NULL);
543	}
544
545	/*
546	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
547	 * try again.
548	 */
549	vd->vdev_nowritecache = B_FALSE;
550
551	return (0);
552}
553
554static void
555vdev_disk_close(vdev_t *vd)
556{
557	vdev_disk_t *dvd = vd->vdev_tsd;
558
559	if (vd->vdev_reopening || dvd == NULL)
560		return;
561
562	if (dvd->vd_minor != NULL) {
563		ddi_devid_str_free(dvd->vd_minor);
564		dvd->vd_minor = NULL;
565	}
566
567	if (dvd->vd_devid != NULL) {
568		ddi_devid_free(dvd->vd_devid);
569		dvd->vd_devid = NULL;
570	}
571
572	if (dvd->vd_lh != NULL) {
573		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
574		dvd->vd_lh = NULL;
575	}
576
577	vd->vdev_delayed_close = B_FALSE;
578	/*
579	 * If we closed the LDI handle due to an offline notify from LDI,
580	 * don't free vd->vdev_tsd or unregister the callbacks here;
581	 * the offline finalize callback or a reopen will take care of it.
582	 */
583	if (dvd->vd_ldi_offline)
584		return;
585
586	vdev_disk_free(vd);
587}
588
589int
590vdev_disk_physio(vdev_t *vd, caddr_t data,
591    size_t size, uint64_t offset, int flags, boolean_t isdump)
592{
593	vdev_disk_t *dvd = vd->vdev_tsd;
594
595	/*
596	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
597	 * Nothing to be done here but return failure.
598	 */
599	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
600		return (EIO);
601
602	ASSERT(vd->vdev_ops == &vdev_disk_ops);
603
604	/*
605	 * If in the context of an active crash dump, use the ldi_dump(9F)
606	 * call instead of ldi_strategy(9F) as usual.
607	 */
608	if (isdump) {
609		ASSERT3P(dvd, !=, NULL);
610		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
611		    lbtodb(size)));
612	}
613
614	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
615}
616
617int
618vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
619    size_t size, uint64_t offset, int flags)
620{
621	buf_t *bp;
622	int error = 0;
623
624	if (vd_lh == NULL)
625		return (SET_ERROR(EINVAL));
626
627	ASSERT(flags & B_READ || flags & B_WRITE);
628
629	bp = getrbuf(KM_SLEEP);
630	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
631	bp->b_bcount = size;
632	bp->b_un.b_addr = (void *)data;
633	bp->b_lblkno = lbtodb(offset);
634	bp->b_bufsize = size;
635
636	error = ldi_strategy(vd_lh, bp);
637	ASSERT(error == 0);
638	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
639		error = SET_ERROR(EIO);
640	freerbuf(bp);
641
642	return (error);
643}
644
645static void
646vdev_disk_io_intr(buf_t *bp)
647{
648	vdev_buf_t *vb = (vdev_buf_t *)bp;
649	zio_t *zio = vb->vb_io;
650
651	/*
652	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
653	 * Rather than teach the rest of the stack about other error
654	 * possibilities (EFAULT, etc), we normalize the error value here.
655	 */
656	zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
657
658	if (zio->io_error == 0 && bp->b_resid != 0)
659		zio->io_error = SET_ERROR(EIO);
660
661	kmem_free(vb, sizeof (vdev_buf_t));
662
663	zio_delay_interrupt(zio);
664}
665
666static void
667vdev_disk_ioctl_free(zio_t *zio)
668{
669	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
670}
671
672static const zio_vsd_ops_t vdev_disk_vsd_ops = {
673	vdev_disk_ioctl_free,
674	zio_vsd_default_cksum_report
675};
676
677static void
678vdev_disk_ioctl_done(void *zio_arg, int error)
679{
680	zio_t *zio = zio_arg;
681
682	zio->io_error = error;
683
684	zio_interrupt(zio);
685}
686
687static void
688vdev_disk_io_start(zio_t *zio)
689{
690	vdev_t *vd = zio->io_vd;
691	vdev_disk_t *dvd = vd->vdev_tsd;
692	vdev_buf_t *vb;
693	struct dk_callback *dkc;
694	buf_t *bp;
695	int error;
696
697	/*
698	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
699	 * Nothing to be done here but return failure.
700	 */
701	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
702		zio->io_error = SET_ERROR(ENXIO);
703		zio_interrupt(zio);
704		return;
705	}
706
707	if (zio->io_type == ZIO_TYPE_IOCTL) {
708		/* XXPOLICY */
709		if (!vdev_readable(vd)) {
710			zio->io_error = SET_ERROR(ENXIO);
711			zio_interrupt(zio);
712			return;
713		}
714
715		switch (zio->io_cmd) {
716
717		case DKIOCFLUSHWRITECACHE:
718
719			if (zfs_nocacheflush)
720				break;
721
722			if (vd->vdev_nowritecache) {
723				zio->io_error = SET_ERROR(ENOTSUP);
724				break;
725			}
726
727			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
728			zio->io_vsd_ops = &vdev_disk_vsd_ops;
729
730			dkc->dkc_callback = vdev_disk_ioctl_done;
731			dkc->dkc_flag = FLUSH_VOLATILE;
732			dkc->dkc_cookie = zio;
733
734			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
735			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);
736
737			if (error == 0) {
738				/*
739				 * The ioctl will be done asychronously,
740				 * and will call vdev_disk_ioctl_done()
741				 * upon completion.
742				 */
743				return;
744			}
745
746			if (error == ENOTSUP || error == ENOTTY) {
747				/*
748				 * If we get ENOTSUP or ENOTTY, we know that
749				 * no future attempts will ever succeed.
750				 * In this case we set a persistent bit so
751				 * that we don't bother with the ioctl in the
752				 * future.
753				 */
754				vd->vdev_nowritecache = B_TRUE;
755			}
756			zio->io_error = error;
757
758			break;
759
760		default:
761			zio->io_error = SET_ERROR(ENOTSUP);
762		}
763
764		zio_execute(zio);
765		return;
766	}
767
768	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
769	zio->io_target_timestamp = zio_handle_io_delay(zio);
770
771	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
772
773	vb->vb_io = zio;
774	bp = &vb->vb_buf;
775
776	bioinit(bp);
777	bp->b_flags = B_BUSY | B_NOCACHE |
778	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
779	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
780		bp->b_flags |= B_FAILFAST;
781	bp->b_bcount = zio->io_size;
782	bp->b_un.b_addr = zio->io_data;
783	bp->b_lblkno = lbtodb(zio->io_offset);
784	bp->b_bufsize = zio->io_size;
785	bp->b_iodone = (int (*)())vdev_disk_io_intr;
786
787	/* ldi_strategy() will return non-zero only on programming errors */
788	VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
789}
790
791static void
792vdev_disk_io_done(zio_t *zio)
793{
794	vdev_t *vd = zio->io_vd;
795
796	/*
797	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
798	 * the device has been removed.  If this is the case, then we trigger an
799	 * asynchronous removal of the device. Otherwise, probe the device and
800	 * make sure it's still accessible.
801	 */
802	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
803		vdev_disk_t *dvd = vd->vdev_tsd;
804		int state = DKIO_NONE;
805
806		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
807		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
808			/*
809			 * We post the resource as soon as possible, instead of
810			 * when the async removal actually happens, because the
811			 * DE is using this information to discard previous I/O
812			 * errors.
813			 */
814			zfs_post_remove(zio->io_spa, vd);
815			vd->vdev_remove_wanted = B_TRUE;
816			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
817		} else if (!vd->vdev_delayed_close) {
818			vd->vdev_delayed_close = B_TRUE;
819		}
820	}
821}
822
823vdev_ops_t vdev_disk_ops = {
824	vdev_disk_open,
825	vdev_disk_close,
826	vdev_default_asize,
827	vdev_disk_io_start,
828	vdev_disk_io_done,
829	NULL,
830	vdev_disk_hold,
831	vdev_disk_rele,
832	VDEV_TYPE_DISK,		/* name of this vdev type */
833	B_TRUE			/* leaf vdev */
834};
835
836/*
837 * Given the root disk device devid or pathname, read the label from
838 * the device, and construct a configuration nvlist.
839 */
840int
841vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
842{
843	ldi_handle_t vd_lh;
844	vdev_label_t *label;
845	uint64_t s, size;
846	int l;
847	ddi_devid_t tmpdevid;
848	int error = -1;
849	char *minor_name;
850
851	/*
852	 * Read the device label and build the nvlist.
853	 */
854	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
855	    &minor_name) == 0) {
856		error = ldi_open_by_devid(tmpdevid, minor_name,
857		    FREAD, kcred, &vd_lh, zfs_li);
858		ddi_devid_free(tmpdevid);
859		ddi_devid_str_free(minor_name);
860	}
861
862	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
863	    zfs_li)))
864		return (error);
865
866	if (ldi_get_size(vd_lh, &s)) {
867		(void) ldi_close(vd_lh, FREAD, kcred);
868		return (SET_ERROR(EIO));
869	}
870
871	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
872	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
873
874	*config = NULL;
875	for (l = 0; l < VDEV_LABELS; l++) {
876		uint64_t offset, state, txg = 0;
877
878		/* read vdev label */
879		offset = vdev_label_offset(size, l, 0);
880		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
881		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
882			continue;
883
884		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
885		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
886			*config = NULL;
887			continue;
888		}
889
890		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
891		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
892			nvlist_free(*config);
893			*config = NULL;
894			continue;
895		}
896
897		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
898		    &txg) != 0 || txg == 0) {
899			nvlist_free(*config);
900			*config = NULL;
901			continue;
902		}
903
904		break;
905	}
906
907	kmem_free(label, sizeof (vdev_label_t));
908	(void) ldi_close(vd_lh, FREAD, kcred);
909	if (*config == NULL)
910		error = SET_ERROR(EIDRM);
911
912	return (error);
913}
914