1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 24 */ 25 26#include <sys/zfs_context.h> 27#include <sys/spa.h> 28#include <sys/vdev_file.h> 29#include <sys/vdev_impl.h> 30#include <sys/zio.h> 31#include <sys/fs/zfs.h> 32#include <sys/fm/fs/zfs.h> 33#include <sys/abd.h> 34 35/* 36 * Virtual device vector for files. 37 */ 38 39static taskq_t *vdev_file_taskq; 40 41void 42vdev_file_init(void) 43{ 44 vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), 45 minclsyspri, max_ncpus, INT_MAX, 0); 46} 47 48void 49vdev_file_fini(void) 50{ 51 taskq_destroy(vdev_file_taskq); 52} 53 54static void 55vdev_file_hold(vdev_t *vd) 56{ 57 ASSERT(vd->vdev_path != NULL); 58} 59 60static void 61vdev_file_rele(vdev_t *vd) 62{ 63 ASSERT(vd->vdev_path != NULL); 64} 65 66static int 67vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 68 uint64_t *logical_ashift, uint64_t *physical_ashift) 69{ 70 vdev_file_t *vf; 71 vnode_t *vp; 72 vattr_t vattr; 73 int error; 74 75 /* Rotational optimizations only make sense on block devices */ 76 vd->vdev_nonrot = B_TRUE; 77 78 /* 79 * We must have a pathname, and it must be absolute. 80 */ 81 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 82 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 83 return (SET_ERROR(EINVAL)); 84 } 85 86 /* 87 * Reopen the device if it's not currently open. Otherwise, 88 * just update the physical size of the device. 89 */ 90 if (vd->vdev_tsd != NULL) { 91 ASSERT(vd->vdev_reopening); 92 vf = vd->vdev_tsd; 93 vp = vf->vf_vnode; 94 goto skip_open; 95 } 96 97 vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); 98 99 /* 100 * We always open the files from the root of the global zone, even if 101 * we're in a local zone. If the user has gotten to this point, the 102 * administrator has already decided that the pool should be available 103 * to local zone users, so the underlying devices should be as well. 104 */ 105 ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); 106 error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, 107 spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); 108 109 if (error) { 110 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 111 kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); 112 vd->vdev_tsd = NULL; 113 return (error); 114 } 115 116 vf->vf_vnode = vp; 117 118#ifdef _KERNEL 119 /* 120 * Make sure it's a regular file. 121 */ 122 if (vp->v_type != VREG) { 123#ifdef __FreeBSD__ 124 (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); 125#endif 126 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 127#ifdef __FreeBSD__ 128 kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); 129 vd->vdev_tsd = NULL; 130#endif 131 return (SET_ERROR(ENODEV)); 132 } 133#endif /* _KERNEL */ 134 135skip_open: 136 /* 137 * Determine the physical size of the file. 138 */ 139 vattr.va_mask = AT_SIZE; 140 vn_lock(vp, LK_SHARED | LK_RETRY); 141 error = VOP_GETATTR(vp, &vattr, kcred); 142 VOP_UNLOCK(vp, 0); 143 if (error) { 144 (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); 145 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 146 kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); 147 vd->vdev_tsd = NULL; 148 return (error); 149 } 150 151 vd->vdev_notrim = B_TRUE; 152 153 *max_psize = *psize = vattr.va_size; 154 *logical_ashift = SPA_MINBLOCKSHIFT; 155 *physical_ashift = SPA_MINBLOCKSHIFT; 156 157 return (0); 158} 159 160static void 161vdev_file_close(vdev_t *vd) 162{ 163 vdev_file_t *vf = vd->vdev_tsd; 164 165 if (vd->vdev_reopening || vf == NULL) 166 return; 167 168 if (vf->vf_vnode != NULL) { 169 (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, 170 kcred, NULL); 171 } 172 173 vd->vdev_delayed_close = B_FALSE; 174 kmem_free(vf, sizeof (vdev_file_t)); 175 vd->vdev_tsd = NULL; 176} 177 178/* 179 * Implements the interrupt side for file vdev types. This routine will be 180 * called when the I/O completes allowing us to transfer the I/O to the 181 * interrupt taskqs. For consistency, the code structure mimics disk vdev 182 * types. 183 */ 184static void 185vdev_file_io_intr(zio_t *zio) 186{ 187 zio_delay_interrupt(zio); 188} 189 190static void 191vdev_file_io_strategy(void *arg) 192{ 193 zio_t *zio = arg; 194 vdev_t *vd = zio->io_vd; 195 vdev_file_t *vf; 196 vnode_t *vp; 197 void *addr; 198 ssize_t resid; 199 200 vf = vd->vdev_tsd; 201 vp = vf->vf_vnode; 202 203 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 204 if (zio->io_type == ZIO_TYPE_READ) { 205 addr = abd_borrow_buf(zio->io_abd, zio->io_size); 206 } else { 207 addr = abd_borrow_buf_copy(zio->io_abd, zio->io_size); 208 } 209 210 zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 211 UIO_READ : UIO_WRITE, vp, addr, zio->io_size, 212 zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); 213 214 if (zio->io_type == ZIO_TYPE_READ) { 215 abd_return_buf_copy(zio->io_abd, addr, zio->io_size); 216 } else { 217 abd_return_buf(zio->io_abd, addr, zio->io_size); 218 } 219 220 if (resid != 0 && zio->io_error == 0) 221 zio->io_error = ENOSPC; 222 223 vdev_file_io_intr(zio); 224} 225 226static void 227vdev_file_io_start(zio_t *zio) 228{ 229 vdev_t *vd = zio->io_vd; 230 vdev_file_t *vf = vd->vdev_tsd; 231 232 if (zio->io_type == ZIO_TYPE_IOCTL) { 233 /* XXPOLICY */ 234 if (!vdev_readable(vd)) { 235 zio->io_error = SET_ERROR(ENXIO); 236 zio_interrupt(zio); 237 return; 238 } 239 240 switch (zio->io_cmd) { 241 case DKIOCFLUSHWRITECACHE: 242 zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, 243 kcred, NULL); 244 break; 245 default: 246 zio->io_error = SET_ERROR(ENOTSUP); 247 } 248 249 zio_execute(zio); 250 return; 251 } 252 253 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 254 zio->io_target_timestamp = zio_handle_io_delay(zio); 255 256 VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, 257 TQ_SLEEP), !=, 0); 258} 259 260/* ARGSUSED */ 261static void 262vdev_file_io_done(zio_t *zio) 263{ 264} 265 266vdev_ops_t vdev_file_ops = { 267 vdev_file_open, 268 vdev_file_close, 269 vdev_default_asize, 270 vdev_file_io_start, 271 vdev_file_io_done, 272 NULL, 273 NULL, 274 vdev_file_hold, 275 vdev_file_rele, 276 NULL, 277 vdev_default_xlate, 278 VDEV_TYPE_FILE, /* name of this vdev type */ 279 B_TRUE /* leaf vdev */ 280}; 281 282/* 283 * From userland we access disks just like files. 284 */ 285#ifndef _KERNEL 286 287vdev_ops_t vdev_disk_ops = { 288 vdev_file_open, 289 vdev_file_close, 290 vdev_default_asize, 291 vdev_file_io_start, 292 vdev_file_io_done, 293 NULL, 294 NULL, 295 vdev_file_hold, 296 vdev_file_rele, 297 NULL, 298 vdev_default_xlate, 299 VDEV_TYPE_DISK, /* name of this vdev type */ 300 B_TRUE /* leaf vdev */ 301}; 302 303#endif 304