vdev_disk.c revision 177698
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/zfs_context.h>
29#include <sys/spa.h>
30#include <sys/vdev_disk.h>
31#include <sys/vdev_impl.h>
32#include <sys/fs/zfs.h>
33#include <sys/zio.h>
34#include <sys/sunldi.h>
35
36/*
37 * Virtual device vector for disks.
38 */
39
40extern ldi_ident_t zfs_li;
41
42typedef struct vdev_disk_buf {
43	buf_t	vdb_buf;
44	zio_t	*vdb_io;
45} vdev_disk_buf_t;
46
47static int
48vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
49{
50	vdev_disk_t *dvd;
51	struct dk_minfo dkm;
52	int error;
53
54	/*
55	 * We must have a pathname, and it must be absolute.
56	 */
57	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
58		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
59		return (EINVAL);
60	}
61
62	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
63
64	/*
65	 * When opening a disk device, we want to preserve the user's original
66	 * intent.  We always want to open the device by the path the user gave
67	 * us, even if it is one of multiple paths to the save device.  But we
68	 * also want to be able to survive disks being removed/recabled.
69	 * Therefore the sequence of opening devices is:
70	 *
71	 * 1. Try opening the device by path.  For legacy pools without the
72	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
73	 *
74	 * 2. If the devid of the device matches the stored value, return
75	 *    success.
76	 *
77	 * 3. Otherwise, the device may have moved.  Try opening the device
78	 *    by the devid instead.
79	 *
80	 */
81	if (vd->vdev_devid != NULL) {
82		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
83		    &dvd->vd_minor) != 0) {
84			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
85			return (EINVAL);
86		}
87	}
88
89	error = EINVAL;		/* presume failure */
90
91	if (vd->vdev_path != NULL) {
92		ddi_devid_t devid;
93
94		if (vd->vdev_wholedisk == -1ULL) {
95			size_t len = strlen(vd->vdev_path) + 3;
96			char *buf = kmem_alloc(len, KM_SLEEP);
97			ldi_handle_t lh;
98
99			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
100
101			if (ldi_open_by_name(buf, spa_mode, kcred,
102			    &lh, zfs_li) == 0) {
103				spa_strfree(vd->vdev_path);
104				vd->vdev_path = buf;
105				vd->vdev_wholedisk = 1ULL;
106				(void) ldi_close(lh, spa_mode, kcred);
107			} else {
108				kmem_free(buf, len);
109			}
110		}
111
112		error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
113		    &dvd->vd_lh, zfs_li);
114
115		/*
116		 * Compare the devid to the stored value.
117		 */
118		if (error == 0 && vd->vdev_devid != NULL &&
119		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
120			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
121				error = EINVAL;
122				(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
123				dvd->vd_lh = NULL;
124			}
125			ddi_devid_free(devid);
126		}
127
128		/*
129		 * If we succeeded in opening the device, but 'vdev_wholedisk'
130		 * is not yet set, then this must be a slice.
131		 */
132		if (error == 0 && vd->vdev_wholedisk == -1ULL)
133			vd->vdev_wholedisk = 0;
134	}
135
136	/*
137	 * If we were unable to open by path, or the devid check fails, open by
138	 * devid instead.
139	 */
140	if (error != 0 && vd->vdev_devid != NULL)
141		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
142		    spa_mode, kcred, &dvd->vd_lh, zfs_li);
143
144	if (error) {
145		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
146		return (error);
147	}
148
149	/*
150	 * Determine the actual size of the device.
151	 */
152	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
153		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
154		return (EINVAL);
155	}
156
157	/*
158	 * If we own the whole disk, try to enable disk write caching.
159	 * We ignore errors because it's OK if we can't do it.
160	 */
161	if (vd->vdev_wholedisk == 1) {
162		int wce = 1;
163		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
164		    FKIOCTL, kcred, NULL);
165	}
166
167	/*
168	 * Determine the device's minimum transfer size.
169	 * If the ioctl isn't supported, assume DEV_BSIZE.
170	 */
171	if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm,
172	    FKIOCTL, kcred, NULL) != 0)
173		dkm.dki_lbsize = DEV_BSIZE;
174
175	*ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1;
176
177	/*
178	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
179	 * try again.
180	 */
181	vd->vdev_nowritecache = B_FALSE;
182
183	return (0);
184}
185
186static void
187vdev_disk_close(vdev_t *vd)
188{
189	vdev_disk_t *dvd = vd->vdev_tsd;
190
191	if (dvd == NULL)
192		return;
193
194	dprintf("removing disk %s, devid %s\n",
195	    vd->vdev_path ? vd->vdev_path : "<none>",
196	    vd->vdev_devid ? vd->vdev_devid : "<none>");
197
198	if (dvd->vd_minor != NULL)
199		ddi_devid_str_free(dvd->vd_minor);
200
201	if (dvd->vd_devid != NULL)
202		ddi_devid_free(dvd->vd_devid);
203
204	if (dvd->vd_lh != NULL)
205		(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
206
207	kmem_free(dvd, sizeof (vdev_disk_t));
208	vd->vdev_tsd = NULL;
209}
210
211static void
212vdev_disk_io_intr(buf_t *bp)
213{
214	vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
215	zio_t *zio = vdb->vdb_io;
216
217	if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0)
218		zio->io_error = EIO;
219
220	kmem_free(vdb, sizeof (vdev_disk_buf_t));
221
222	zio_next_stage_async(zio);
223}
224
225static void
226vdev_disk_ioctl_done(void *zio_arg, int error)
227{
228	zio_t *zio = zio_arg;
229
230	zio->io_error = error;
231
232	zio_next_stage_async(zio);
233}
234
235static void
236vdev_disk_io_start(zio_t *zio)
237{
238	vdev_t *vd = zio->io_vd;
239	vdev_disk_t *dvd = vd->vdev_tsd;
240	vdev_disk_buf_t *vdb;
241	buf_t *bp;
242	int flags, error;
243
244	if (zio->io_type == ZIO_TYPE_IOCTL) {
245		zio_vdev_io_bypass(zio);
246
247		/* XXPOLICY */
248		if (vdev_is_dead(vd)) {
249			zio->io_error = ENXIO;
250			zio_next_stage_async(zio);
251			return;
252		}
253
254		switch (zio->io_cmd) {
255
256		case DKIOCFLUSHWRITECACHE:
257
258			if (zfs_nocacheflush)
259				break;
260
261			if (vd->vdev_nowritecache) {
262				zio->io_error = ENOTSUP;
263				break;
264			}
265
266			zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
267			zio->io_dk_callback.dkc_cookie = zio;
268
269			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
270			    (uintptr_t)&zio->io_dk_callback,
271			    FKIOCTL, kcred, NULL);
272
273			if (error == 0) {
274				/*
275				 * The ioctl will be done asychronously,
276				 * and will call vdev_disk_ioctl_done()
277				 * upon completion.
278				 */
279				return;
280			} else if (error == ENOTSUP) {
281				/*
282				 * If we get ENOTSUP, we know that no future
283				 * attempts will ever succeed.  In this case we
284				 * set a persistent bit so that we don't bother
285				 * with the ioctl in the future.
286				 */
287				vd->vdev_nowritecache = B_TRUE;
288			}
289			zio->io_error = error;
290
291			break;
292
293		default:
294			zio->io_error = ENOTSUP;
295		}
296
297		zio_next_stage_async(zio);
298		return;
299	}
300
301	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
302		return;
303
304	if ((zio = vdev_queue_io(zio)) == NULL)
305		return;
306
307	flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
308	flags |= B_BUSY | B_NOCACHE;
309	if (zio->io_flags & ZIO_FLAG_FAILFAST)
310		flags |= B_FAILFAST;
311
312	vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
313
314	vdb->vdb_io = zio;
315	bp = &vdb->vdb_buf;
316
317	bioinit(bp);
318	bp->b_flags = flags;
319	bp->b_bcount = zio->io_size;
320	bp->b_un.b_addr = zio->io_data;
321	bp->b_lblkno = lbtodb(zio->io_offset);
322	bp->b_bufsize = zio->io_size;
323	bp->b_iodone = (int (*)())vdev_disk_io_intr;
324
325	/* XXPOLICY */
326	error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
327	if (error) {
328		zio->io_error = error;
329		bioerror(bp, error);
330		bp->b_resid = bp->b_bcount;
331		bp->b_iodone(bp);
332		return;
333	}
334
335	error = ldi_strategy(dvd->vd_lh, bp);
336	/* ldi_strategy() will return non-zero only on programming errors */
337	ASSERT(error == 0);
338}
339
340static void
341vdev_disk_io_done(zio_t *zio)
342{
343	vdev_queue_io_done(zio);
344
345	if (zio->io_type == ZIO_TYPE_WRITE)
346		vdev_cache_write(zio);
347
348	if (zio_injection_enabled && zio->io_error == 0)
349		zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
350
351	zio_next_stage(zio);
352}
353
354vdev_ops_t vdev_disk_ops = {
355	vdev_disk_open,
356	vdev_disk_close,
357	vdev_default_asize,
358	vdev_disk_io_start,
359	vdev_disk_io_done,
360	NULL,
361	VDEV_TYPE_DISK,		/* name of this vdev type */
362	B_TRUE			/* leaf vdev */
363};
364