1
2/*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/zfs_context.h>
28#include <sys/spa.h>
29#include <sys/refcount.h>
30#include <sys/vdev_disk.h>
31#include <sys/vdev_impl.h>
32#include <sys/fs/zfs.h>
33#include <sys/zio.h>
34#include <sys/sunldi.h>
35#include <sys/fm/fs/zfs.h>
36#include <sys/disklabel.h>
37#include <sys/dkio.h>
38#include <sys/workqueue.h>
39
40/*
41 * Virtual device vector for disks.
42 */
43
44static void	vdev_disk_io_intr(buf_t *);
45
46static void
47vdev_disk_flush(struct work *work, void *cookie)
48{
49	vdev_disk_t *dvd;
50	int error, cmd;
51	buf_t *bp;
52	vnode_t *vp;
53
54	bp = (struct buf *)work;
55	vp = bp->b_vp;
56	dvd = cookie;
57
58	KASSERT(vp == dvd->vd_vn);
59
60	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
61	cmd = 1;
62	error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FREAD|FWRITE,
63	    kauth_cred_get());
64	VOP_UNLOCK(vp);
65	bp->b_error = error;
66	vdev_disk_io_intr(bp);
67}
68
69static int
70vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
71{
72	spa_t *spa = vd->vdev_spa;
73	vdev_disk_t *dvd;
74	vnode_t *vp;
75	int error, cmd;
76	struct partinfo pinfo;
77
78	/*
79	 * We must have a pathname, and it must be absolute.
80	 */
81	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
82		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
83		return (EINVAL);
84	}
85
86	/*
87	 * Reopen the device if it's not currently open. Otherwise,
88	 * just update the physical size of the device.
89	 */
90	if (vd->vdev_tsd != NULL) {
91		ASSERT(vd->vdev_reopening);
92		dvd = vd->vdev_tsd;
93		goto skip_open;
94	}
95
96	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
97
98	/*
99	 * When opening a disk device, we want to preserve the user's original
100	 * intent.  We always want to open the device by the path the user gave
101	 * us, even if it is one of multiple paths to the save device.  But we
102	 * also want to be able to survive disks being removed/recabled.
103	 * Therefore the sequence of opening devices is:
104	 *
105	 * 1. Try opening the device by path.  For legacy pools without the
106	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
107	 *
108	 * 2. If the devid of the device matches the stored value, return
109	 *    success.
110	 *
111	 * 3. Otherwise, the device may have moved.  Try opening the device
112	 *    by the devid instead.
113	 */
114	if (vd->vdev_devid != NULL) {
115		/* XXXNETBSD wedges */
116	}
117
118	error = EINVAL;		/* presume failure */
119
120	error = vn_open(vd->vdev_path, UIO_SYSSPACE, FREAD|FWRITE, 0,
121	    &vp, CRCREAT, 0);
122	if (error != 0) {
123		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
124		return error;
125	}
126	if (vp->v_type != VBLK) {
127		vrele(vp);
128		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
129		return EINVAL;
130	}
131
132	/*
133	 * XXXNETBSD Compare the devid to the stored value.
134	 */
135
136skip_open:
137	/*
138	 * Determine the actual size of the device.
139	 * XXXNETBSD wedges.
140	 */
141	error = VOP_IOCTL(vp, DIOCGPART, &pinfo, FREAD|FWRITE,
142	    kauth_cred_get());
143	if (error != 0) {
144		vrele(vp);
145		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
146		return error;
147	}
148	*psize = (uint64_t)pinfo.part->p_size * pinfo.disklab->d_secsize;
149	*ashift = highbit(MAX(pinfo.disklab->d_secsize, SPA_MINBLOCKSIZE)) - 1;
150	vd->vdev_wholedisk = (pinfo.part->p_offset == 0); /* XXXNETBSD */
151
152	/*
153	 * Create a workqueue to process cache-flushes concurrently.
154	 */
155	error = workqueue_create(&dvd->vd_wq, "vdevsync",
156	    vdev_disk_flush, dvd, PRI_NONE, IPL_NONE, WQ_MPSAFE);
157	if (error != 0) {
158		vrele(vp);
159		return error;
160	}
161
162	/*
163	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
164	 * try again.
165	 */
166	vd->vdev_nowritecache = B_FALSE;
167
168	dvd->vd_vn = vp;
169	return 0;
170}
171
172static void
173vdev_disk_close(vdev_t *vd)
174{
175	vdev_disk_t *dvd = vd->vdev_tsd;
176	vnode_t *vp;
177
178	if (vd->vdev_reopening || dvd == NULL)
179		return;
180
181	if ((vp = dvd->vd_vn) != NULL) {
182/* XXX NetBSD Sometimes we deadlock on this why ? */
183//		vprint("vnode close info", vp);
184		vn_close(vp, FREAD|FWRITE, kauth_cred_get());
185//		vprint("vnode close info", vp);
186/* XXX is this needed ?		vrele(vp); */
187		workqueue_destroy(dvd->vd_wq);
188	}
189
190	kmem_free(dvd, sizeof (vdev_disk_t));
191	vd->vdev_tsd = NULL;
192}
193
194static void
195vdev_disk_io_intr(buf_t *bp)
196{
197	zio_t *zio = bp->b_private;
198
199	/*
200	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
201	 * Rather than teach the rest of the stack about other error
202	 * possibilities (EFAULT, etc), we normalize the error value here.
203	 */
204	if (bp->b_error == 0) {
205		if (bp->b_resid != 0) {
206			zio->io_error = EIO;
207		} else {
208			zio->io_error = 0;
209		}
210	} else {
211		zio->io_error = EIO;
212	}
213
214
215	putiobuf(bp);
216	zio_interrupt(zio);
217}
218
219static void
220vdev_disk_ioctl_free(zio_t *zio)
221{
222	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
223}
224
/*
 * Operations table for zio vdev-specific data owned by this file.  The
 * initializer is positional, so the entries must stay in the order in
 * which zio_vsd_ops_t declares its members.
 */
static const zio_vsd_ops_t vdev_disk_vsd_ops = {
	vdev_disk_ioctl_free,		/* frees io_vsd as a struct dk_callback */
	zio_vsd_default_cksum_report	/* stock checksum-report handler */
};
229
230static void
231vdev_disk_ioctl_done(void *zio_arg, int error)
232{
233	zio_t *zio = zio_arg;
234
235	zio->io_error = error;
236
237	zio_interrupt(zio);
238}
239
240static int
241vdev_disk_io_start(zio_t *zio)
242{
243	vdev_t *vd = zio->io_vd;
244	vdev_disk_t *dvd = vd->vdev_tsd;
245	vnode_t *vp;
246	buf_t *bp, *nbp;
247	int error, size, off, resid;
248
249	vp = dvd->vd_vn;
250	if (zio->io_type == ZIO_TYPE_IOCTL) {
251		/* XXPOLICY */
252		if (!vdev_readable(vd)) {
253			zio->io_error = ENXIO;
254			return (ZIO_PIPELINE_CONTINUE);
255		}
256
257		switch (zio->io_cmd) {
258		case DKIOCFLUSHWRITECACHE:
259
260			if (zfs_nocacheflush)
261				break;
262
263			if (vd->vdev_nowritecache) {
264				zio->io_error = ENOTSUP;
265				break;
266			}
267
268			bp = getiobuf(vp, true);
269			bp->b_private = zio;
270			workqueue_enqueue(dvd->vd_wq, &bp->b_work, NULL);
271			return (ZIO_PIPELINE_STOP);
272			break;
273
274		default:
275			zio->io_error = ENOTSUP;
276			break;
277		}
278
279		return (ZIO_PIPELINE_CONTINUE);
280	}
281
282	bp = getiobuf(vp, true);
283	bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
284	bp->b_cflags = BC_BUSY | BC_NOCACHE;
285	bp->b_data = zio->io_data;
286	bp->b_blkno = btodb(zio->io_offset);
287	bp->b_bcount = zio->io_size;
288	bp->b_resid = zio->io_size;
289	bp->b_iodone = vdev_disk_io_intr;
290	bp->b_private = zio;
291
292	if (!(bp->b_flags & B_READ)) {
293		mutex_enter(vp->v_interlock);
294		vp->v_numoutput++;
295		mutex_exit(vp->v_interlock);
296	}
297
298	if (bp->b_bcount <= MAXPHYS) {
299		/* We can do this I/O in one pass. */
300		(void)VOP_STRATEGY(vp, bp);
301	} else {
302		/*
303		 * The I/O is larger than we can process in one pass.
304		 * Split it into smaller pieces.
305		 */
306		resid = zio->io_size;
307		off = 0;
308		while (resid != 0) {
309			size = min(resid, MAXPHYS);
310			nbp = getiobuf(vp, true);
311			nbp->b_blkno = btodb(zio->io_offset + off);
312			/* Below call increments v_numoutput. */
313			nestiobuf_setup(bp, nbp, off, size);
314			(void)VOP_STRATEGY(vp, nbp);
315			resid -= size;
316			off += size;
317		}
318	}
319
320	return (ZIO_PIPELINE_STOP);
321}
322
323static void
324vdev_disk_io_done(zio_t *zio)
325{
326
327	/* NetBSD: nothing */
328}
329
/*
 * Operations vector for disk vdevs (non-static, so visible to other
 * translation units).  The initializer is positional: entries must stay in
 * the order in which vdev_ops_t declares its members.
 */
vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,		/* open the backing device, report its size */
	vdev_disk_close,	/* release the per-vdev disk state */
	vdev_default_asize,	/* default helper, defined outside this file */
	vdev_disk_io_start,	/* issue ioctl/read/write zios */
	vdev_disk_io_done,	/* no-op on NetBSD */
	NULL,			/* NOTE(review): presumably an optional hook left unset -- confirm against vdev_ops_t */
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
340
341/*
342 * Given the root disk device devid or pathname, read the label from
343 * the device, and construct a configuration nvlist.
344 */
345int
346vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
347{
348
349	return EOPNOTSUPP;
350}
351