1 2/* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22/* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/spa.h> 29#include <sys/refcount.h> 30#include <sys/vdev_disk.h> 31#include <sys/vdev_impl.h> 32#include <sys/fs/zfs.h> 33#include <sys/zio.h> 34#include <sys/sunldi.h> 35#include <sys/fm/fs/zfs.h> 36#include <sys/disklabel.h> 37#include <sys/dkio.h> 38#include <sys/workqueue.h> 39 40/* 41 * Virtual device vector for disks. 42 */ 43 44static void vdev_disk_io_intr(buf_t *); 45 46static void 47vdev_disk_flush(struct work *work, void *cookie) 48{ 49 vdev_disk_t *dvd; 50 int error, cmd; 51 buf_t *bp; 52 vnode_t *vp; 53 54 bp = (struct buf *)work; 55 vp = bp->b_vp; 56 dvd = cookie; 57 58 KASSERT(vp == dvd->vd_vn); 59 60 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 61 cmd = 1; 62 error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FREAD|FWRITE, 63 kauth_cred_get()); 64 VOP_UNLOCK(vp); 65 bp->b_error = error; 66 vdev_disk_io_intr(bp); 67} 68 69static int 70vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) 71{ 72 spa_t *spa = vd->vdev_spa; 73 vdev_disk_t *dvd; 74 vnode_t *vp; 75 int error, cmd; 76 struct partinfo pinfo; 77 78 /* 79 * We must have a pathname, and it must be absolute. 80 */ 81 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 82 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 83 return (EINVAL); 84 } 85 86 /* 87 * Reopen the device if it's not currently open. Otherwise, 88 * just update the physical size of the device. 89 */ 90 if (vd->vdev_tsd != NULL) { 91 ASSERT(vd->vdev_reopening); 92 dvd = vd->vdev_tsd; 93 goto skip_open; 94 } 95 96 dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); 97 98 /* 99 * When opening a disk device, we want to preserve the user's original 100 * intent. We always want to open the device by the path the user gave 101 * us, even if it is one of multiple paths to the save device. But we 102 * also want to be able to survive disks being removed/recabled. 103 * Therefore the sequence of opening devices is: 104 * 105 * 1. Try opening the device by path. For legacy pools without the 106 * 'whole_disk' property, attempt to fix the path by appending 's0'. 107 * 108 * 2. If the devid of the device matches the stored value, return 109 * success. 110 * 111 * 3. Otherwise, the device may have moved. Try opening the device 112 * by the devid instead. 113 */ 114 if (vd->vdev_devid != NULL) { 115 /* XXXNETBSD wedges */ 116 } 117 118 error = EINVAL; /* presume failure */ 119 120 error = vn_open(vd->vdev_path, UIO_SYSSPACE, FREAD|FWRITE, 0, 121 &vp, CRCREAT, 0); 122 if (error != 0) { 123 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 124 return error; 125 } 126 if (vp->v_type != VBLK) { 127 vrele(vp); 128 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 129 return EINVAL; 130 } 131 132 /* 133 * XXXNETBSD Compare the devid to the stored value. 134 */ 135 136skip_open: 137 /* 138 * Determine the actual size of the device. 139 * XXXNETBSD wedges. 140 */ 141 error = VOP_IOCTL(vp, DIOCGPART, &pinfo, FREAD|FWRITE, 142 kauth_cred_get()); 143 if (error != 0) { 144 vrele(vp); 145 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 146 return error; 147 } 148 *psize = (uint64_t)pinfo.part->p_size * pinfo.disklab->d_secsize; 149 *ashift = highbit(MAX(pinfo.disklab->d_secsize, SPA_MINBLOCKSIZE)) - 1; 150 vd->vdev_wholedisk = (pinfo.part->p_offset == 0); /* XXXNETBSD */ 151 152 /* 153 * Create a workqueue to process cache-flushes concurrently. 154 */ 155 error = workqueue_create(&dvd->vd_wq, "vdevsync", 156 vdev_disk_flush, dvd, PRI_NONE, IPL_NONE, WQ_MPSAFE); 157 if (error != 0) { 158 vrele(vp); 159 return error; 160 } 161 162 /* 163 * Clear the nowritecache bit, so that on a vdev_reopen() we will 164 * try again. 165 */ 166 vd->vdev_nowritecache = B_FALSE; 167 168 dvd->vd_vn = vp; 169 return 0; 170} 171 172static void 173vdev_disk_close(vdev_t *vd) 174{ 175 vdev_disk_t *dvd = vd->vdev_tsd; 176 vnode_t *vp; 177 178 if (vd->vdev_reopening || dvd == NULL) 179 return; 180 181 if ((vp = dvd->vd_vn) != NULL) { 182/* XXX NetBSD Sometimes we deadlock on this why ? */ 183// vprint("vnode close info", vp); 184 vn_close(vp, FREAD|FWRITE, kauth_cred_get()); 185// vprint("vnode close info", vp); 186/* XXX is this needed ? vrele(vp); */ 187 workqueue_destroy(dvd->vd_wq); 188 } 189 190 kmem_free(dvd, sizeof (vdev_disk_t)); 191 vd->vdev_tsd = NULL; 192} 193 194static void 195vdev_disk_io_intr(buf_t *bp) 196{ 197 zio_t *zio = bp->b_private; 198 199 /* 200 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. 201 * Rather than teach the rest of the stack about other error 202 * possibilities (EFAULT, etc), we normalize the error value here. 203 */ 204 if (bp->b_error == 0) { 205 if (bp->b_resid != 0) { 206 zio->io_error = EIO; 207 } else { 208 zio->io_error = 0; 209 } 210 } else { 211 zio->io_error = EIO; 212 } 213 214 215 putiobuf(bp); 216 zio_interrupt(zio); 217} 218 219static void 220vdev_disk_ioctl_free(zio_t *zio) 221{ 222 kmem_free(zio->io_vsd, sizeof (struct dk_callback)); 223} 224 225static const zio_vsd_ops_t vdev_disk_vsd_ops = { 226 vdev_disk_ioctl_free, 227 zio_vsd_default_cksum_report 228}; 229 230static void 231vdev_disk_ioctl_done(void *zio_arg, int error) 232{ 233 zio_t *zio = zio_arg; 234 235 zio->io_error = error; 236 237 zio_interrupt(zio); 238} 239 240static int 241vdev_disk_io_start(zio_t *zio) 242{ 243 vdev_t *vd = zio->io_vd; 244 vdev_disk_t *dvd = vd->vdev_tsd; 245 vnode_t *vp; 246 buf_t *bp, *nbp; 247 int error, size, off, resid; 248 249 vp = dvd->vd_vn; 250 if (zio->io_type == ZIO_TYPE_IOCTL) { 251 /* XXPOLICY */ 252 if (!vdev_readable(vd)) { 253 zio->io_error = ENXIO; 254 return (ZIO_PIPELINE_CONTINUE); 255 } 256 257 switch (zio->io_cmd) { 258 case DKIOCFLUSHWRITECACHE: 259 260 if (zfs_nocacheflush) 261 break; 262 263 if (vd->vdev_nowritecache) { 264 zio->io_error = ENOTSUP; 265 break; 266 } 267 268 bp = getiobuf(vp, true); 269 bp->b_private = zio; 270 workqueue_enqueue(dvd->vd_wq, &bp->b_work, NULL); 271 return (ZIO_PIPELINE_STOP); 272 break; 273 274 default: 275 zio->io_error = ENOTSUP; 276 break; 277 } 278 279 return (ZIO_PIPELINE_CONTINUE); 280 } 281 282 bp = getiobuf(vp, true); 283 bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); 284 bp->b_cflags = BC_BUSY | BC_NOCACHE; 285 bp->b_data = zio->io_data; 286 bp->b_blkno = btodb(zio->io_offset); 287 bp->b_bcount = zio->io_size; 288 bp->b_resid = zio->io_size; 289 bp->b_iodone = vdev_disk_io_intr; 290 bp->b_private = zio; 291 292 if (!(bp->b_flags & B_READ)) { 293 mutex_enter(vp->v_interlock); 294 vp->v_numoutput++; 295 mutex_exit(vp->v_interlock); 296 } 297 298 if (bp->b_bcount <= MAXPHYS) { 299 /* We can do this I/O in one pass. */ 300 (void)VOP_STRATEGY(vp, bp); 301 } else { 302 /* 303 * The I/O is larger than we can process in one pass. 304 * Split it into smaller pieces. 305 */ 306 resid = zio->io_size; 307 off = 0; 308 while (resid != 0) { 309 size = min(resid, MAXPHYS); 310 nbp = getiobuf(vp, true); 311 nbp->b_blkno = btodb(zio->io_offset + off); 312 /* Below call increments v_numoutput. */ 313 nestiobuf_setup(bp, nbp, off, size); 314 (void)VOP_STRATEGY(vp, nbp); 315 resid -= size; 316 off += size; 317 } 318 } 319 320 return (ZIO_PIPELINE_STOP); 321} 322 323static void 324vdev_disk_io_done(zio_t *zio) 325{ 326 327 /* NetBSD: nothing */ 328} 329 330vdev_ops_t vdev_disk_ops = { 331 vdev_disk_open, 332 vdev_disk_close, 333 vdev_default_asize, 334 vdev_disk_io_start, 335 vdev_disk_io_done, 336 NULL, 337 VDEV_TYPE_DISK, /* name of this vdev type */ 338 B_TRUE /* leaf vdev */ 339}; 340 341/* 342 * Given the root disk device devid or pathname, read the label from 343 * the device, and construct a configuration nvlist. 344 */ 345int 346vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) 347{ 348 349 return EOPNOTSUPP; 350} 351