1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23219089Spjd *
24219089Spjd * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
25168404Spjd * All rights reserved.
26264732Smav *
27264732Smav * Portions Copyright 2010 Robert Milkowski
28264732Smav *
29264732Smav * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
30268657Sdelphij * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
31268657Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved.
32297112Smav * Copyright (c) 2014 Integros [integros.com]
33168404Spjd */
34168404Spjd
35226724Smm/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
36219089Spjd
37168404Spjd/*
38168404Spjd * ZFS volume emulation driver.
39168404Spjd *
40168404Spjd * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
41168404Spjd * Volumes are accessed through the symbolic links named:
42168404Spjd *
43168404Spjd * /dev/zvol/dsk/<pool_name>/<dataset_name>
44168404Spjd * /dev/zvol/rdsk/<pool_name>/<dataset_name>
45168404Spjd *
46219089Spjd * These links are created by the /dev filesystem (sdev_zvolops.c).
47168404Spjd * Volumes are persistent through reboot.  No user command needs to be
48168404Spjd * run before opening and using a device.
49219089Spjd *
50219089Spjd * FreeBSD notes.
51219089Spjd * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
52219089Spjd * in the system.
53168404Spjd */
54168404Spjd
55168962Spjd#include <sys/types.h>
56168404Spjd#include <sys/param.h>
57168404Spjd#include <sys/kernel.h>
58168404Spjd#include <sys/errno.h>
59168404Spjd#include <sys/uio.h>
60168404Spjd#include <sys/bio.h>
61168962Spjd#include <sys/buf.h>
62168404Spjd#include <sys/kmem.h>
63168404Spjd#include <sys/conf.h>
64168404Spjd#include <sys/cmn_err.h>
65168404Spjd#include <sys/stat.h>
66168404Spjd#include <sys/zap.h>
67168404Spjd#include <sys/spa.h>
68255750Sdelphij#include <sys/spa_impl.h>
69168404Spjd#include <sys/zio.h>
70265678Smav#include <sys/disk.h>
71185029Spjd#include <sys/dmu_traverse.h>
72185029Spjd#include <sys/dnode.h>
73185029Spjd#include <sys/dsl_dataset.h>
74168404Spjd#include <sys/dsl_prop.h>
75168962Spjd#include <sys/dkio.h>
76168404Spjd#include <sys/byteorder.h>
77168962Spjd#include <sys/sunddi.h>
78168404Spjd#include <sys/dirent.h>
79168962Spjd#include <sys/policy.h>
80265678Smav#include <sys/queue.h>
81168404Spjd#include <sys/fs/zfs.h>
82168404Spjd#include <sys/zfs_ioctl.h>
83168404Spjd#include <sys/zil.h>
84168404Spjd#include <sys/refcount.h>
85168404Spjd#include <sys/zfs_znode.h>
86168404Spjd#include <sys/zfs_rlock.h>
87185029Spjd#include <sys/vdev_impl.h>
88255750Sdelphij#include <sys/vdev_raidz.h>
89185029Spjd#include <sys/zvol.h>
90209962Smm#include <sys/zil_impl.h>
91243524Smm#include <sys/dbuf.h>
92255750Sdelphij#include <sys/dmu_tx.h>
93255750Sdelphij#include <sys/zfeature.h>
94255750Sdelphij#include <sys/zio_checksum.h>
95275892Smav#include <sys/filio.h>
96255750Sdelphij
97168404Spjd#include <geom/geom.h>
98168404Spjd
99168404Spjd#include "zfs_namecheck.h"
100168404Spjd
#ifndef illumos
/* GEOM class under which all zvol providers are created (FreeBSD only). */
struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

#endif
void *zfsdev_state;			/* per-minor soft state (illumos ddi) */
static char *zvol_tag = "zvol_tag";	/* tag for objset owns and dbuf holds */

#define	ZVOL_DUMPSIZE		"dumpsize"	/* ZAP key for dump area size */

/*
 * This lock protects the zfsdev_state structure from being modified
 * while it's being used, e.g. an open that comes in before a create
 * finishes.  It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
#ifdef illumos
kmutex_t zfsdev_state_lock;
#else
/*
 * In FreeBSD we've replaced the upstream zfsdev_state_lock with the
 * spa_namespace_lock in the ZVOL code.
 */
#define zfsdev_state_lock spa_namespace_lock
#endif
static uint32_t zvol_minors;	/* count of currently created zvol minors */
131168404Spjd
#ifndef illumos
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
/* Default exposure mode for new zvols when the volmode property is unset. */
static int	volmode = ZFS_VOLMODE_GEOM;
TUNABLE_INT("vfs.zfs.vol.mode", &volmode);
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0,
    "Expose as GEOM providers (1), device files (2) or neither");

#endif
/*
 * One physically contiguous run of volume blocks; a list of these
 * describes a zvol's on-disk layout (built by zvol_get_lbas()).
 */
typedef struct zvol_extent {
	list_node_t	ze_node;
	dva_t		ze_dva;		/* dva associated with this extent */
	uint64_t	ze_nblks;	/* number of blocks in extent */
} zvol_extent_t;
146185029Spjd
/*
 * The in-core state of each volume.
 */
typedef struct zvol_state {
#ifndef illumos
	LIST_ENTRY(zvol_state)	zv_links;	/* linkage on all_zvols list */
#endif
	char		zv_name[MAXPATHLEN]; /* pool/dd name */
	uint64_t	zv_volsize;	/* amount of space we advertise */
	uint64_t	zv_volblocksize; /* volume block size */
#ifdef illumos
	minor_t		zv_minor;	/* minor number */
#else
	struct cdev	*zv_dev;	/* non-GEOM device */
	struct g_provider *zv_provider;	/* GEOM provider */
#endif
	uint8_t		zv_min_bs;	/* minimum addressable block shift */
	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
	objset_t	*zv_objset;	/* objset handle */
#ifdef illumos
	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
#endif
	uint32_t	zv_total_opens;	/* total open count */
	uint32_t	zv_sync_cnt;	/* synchronous open count */
	zilog_t		*zv_zilog;	/* ZIL handle */
	list_t		zv_extents;	/* List of extents for dump */
	znode_t		zv_znode;	/* for range locking */
	dmu_buf_t	*zv_dbuf;	/* bonus handle */
#ifndef illumos
	int		zv_state;	/* NOTE(review): worker run state? confirm */
	int		zv_volmode;	/* Provide GEOM or cdev */
	struct bio_queue_head zv_queue;	/* queue of bios for the worker */
	struct mtx	zv_queue_mtx;	/* zv_queue mutex */
#endif
} zvol_state_t;
182168404Spjd
#ifndef illumos
/* Global list of every in-core zvol (FreeBSD replacement for soft state). */
static LIST_HEAD(, zvol_state) all_zvols;
#endif
/*
 * zvol specific flags
 */
#define	ZVOL_RDONLY	0x1	/* volume may not be written */
#define	ZVOL_DUMPIFIED	0x2	/* set up as a dump device */
#define	ZVOL_EXCL	0x4	/* exclusive open in effect */
#define	ZVOL_WCE	0x8	/* write cache enabled */

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS/2;
198168404Spjd
/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;
#ifndef illumos
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
    &zvol_unmap_enabled, 0,
    "Enable UNMAP functionality");

/* Character-device entry points used when a zvol is exposed as a cdev. */
static d_open_t		zvol_d_open;
static d_close_t	zvol_d_close;
static d_read_t		zvol_read;
static d_write_t	zvol_write;
static d_ioctl_t	zvol_d_ioctl;
static d_strategy_t	zvol_strategy;

static struct cdevsw zvol_cdevsw = {
	.d_version =	D_VERSION,
	.d_open =	zvol_d_open,
	.d_close =	zvol_d_close,
	.d_read =	zvol_read,
	.d_write =	zvol_write,
	.d_ioctl =	zvol_d_ioctl,
	.d_strategy =	zvol_strategy,
	.d_name =	"zvol",
	.d_flags =	D_DISK | D_TRACKCLOSE,
};

/* GEOM entry points and helpers, defined later in this file. */
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_start(struct bio *bp);
static void zvol_geom_worker(void *arg);
static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
    uint64_t len, boolean_t sync);
#endif	/* !illumos */

extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
    nvlist_t *, nvlist_t *);
static int zvol_remove_zv(zvol_state_t *);
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
243168404Spjd
/*
 * Record a new volume size in the in-core state and propagate it to the
 * OS view of the device: on illumos by updating the devinfo Size/Nblocks
 * properties and invalidating the specfs size cache, on FreeBSD by
 * resizing the GEOM provider (no-op for cdev-mode zvols).
 */
static void
zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
{
#ifdef illumos
	dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);

	zv->zv_volsize = volsize;
	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Size", volsize) == DDI_SUCCESS);
	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);

	/* Notify specfs to invalidate the cached size */
	spec_size_invalidate(dev, VBLK);
	spec_size_invalidate(dev, VCHR);
#else	/* !illumos */
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct g_provider *pp;

		pp = zv->zv_provider;
		/* Provider may not exist yet during creation. */
		if (pp == NULL)
			return;
		g_topology_lock();
		g_resize_provider(pp, zv->zv_volsize);
		g_topology_unlock();
	}
#endif	/* illumos */
}
273185029Spjd
274168404Spjdint
275168404Spjdzvol_check_volsize(uint64_t volsize, uint64_t blocksize)
276168404Spjd{
277168404Spjd	if (volsize == 0)
278249195Smm		return (SET_ERROR(EINVAL));
279168404Spjd
280168404Spjd	if (volsize % blocksize != 0)
281249195Smm		return (SET_ERROR(EINVAL));
282168404Spjd
283168404Spjd#ifdef _ILP32
284168404Spjd	if (volsize - 1 > SPEC_MAXOFFSET_T)
285249195Smm		return (SET_ERROR(EOVERFLOW));
286168404Spjd#endif
287168404Spjd	return (0);
288168404Spjd}
289168404Spjd
290168404Spjdint
291168404Spjdzvol_check_volblocksize(uint64_t volblocksize)
292168404Spjd{
293168404Spjd	if (volblocksize < SPA_MINBLOCKSIZE ||
294276081Sdelphij	    volblocksize > SPA_OLD_MAXBLOCKSIZE ||
295168404Spjd	    !ISP2(volblocksize))
296249195Smm		return (SET_ERROR(EDOM));
297168404Spjd
298168404Spjd	return (0);
299168404Spjd}
300168404Spjd
301168404Spjdint
302168404Spjdzvol_get_stats(objset_t *os, nvlist_t *nv)
303168404Spjd{
304168404Spjd	int error;
305168404Spjd	dmu_object_info_t doi;
306168404Spjd	uint64_t val;
307168404Spjd
308168404Spjd	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
309168404Spjd	if (error)
310168404Spjd		return (error);
311168404Spjd
312168404Spjd	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
313168404Spjd
314168404Spjd	error = dmu_object_info(os, ZVOL_OBJ, &doi);
315168404Spjd
316168404Spjd	if (error == 0) {
317168404Spjd		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
318168404Spjd		    doi.doi_data_block_size);
319168404Spjd	}
320168404Spjd
321168404Spjd	return (error);
322168404Spjd}
323168404Spjd
/*
 * Find the in-core state for the named volume; NULL if there is none.
 * Caller must hold zfsdev_state_lock.
 */
static zvol_state_t *
zvol_minor_lookup(const char *name)
{
#ifdef illumos
	minor_t minor;
#endif
	zvol_state_t *zv;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));

#ifdef illumos
	/* Scan the soft-state table for a zvol minor with this name. */
	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
		if (zv == NULL)
			continue;
#else
	/* On FreeBSD all zvols live on a single global list. */
	LIST_FOREACH(zv, &all_zvols, zv_links) {
#endif
		if (strcmp(zv->zv_name, name) == 0)
			return (zv);
	}

	return (NULL);
}
348168404Spjd
/* extent mapping arg */
struct maparg {
	zvol_state_t	*ma_zv;		/* zvol whose extents are collected */
	uint64_t	ma_blks;	/* expected blkid of the next block */
};
354185029Spjd
/*
 * traverse_dataset() callback: record the physical location of each
 * level-0 data block of the zvol object on ma->ma_zv->zv_extents,
 * coalescing physically contiguous blocks into a single extent.
 * Blocks must arrive in blkid order (verified); gang blocks cannot be
 * represented and abort the walk with EFRAGS.
 */
/*ARGSUSED*/
static int
zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct maparg *ma = arg;
	zvol_extent_t *ze;
	int bs = ma->ma_zv->zv_volblocksize;

	/* Only level-0 blocks of the data object are of interest. */
	if (bp == NULL || BP_IS_HOLE(bp) ||
	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
		return (0);

	VERIFY(!BP_IS_EMBEDDED(bp));

	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
	ma->ma_blks++;

	/* Abort immediately if we have encountered gang blocks */
	if (BP_IS_GANG(bp))
		return (SET_ERROR(EFRAGS));

	/*
	 * See if the block is at the end of the previous extent.
	 */
	ze = list_tail(&ma->ma_zv->zv_extents);
	if (ze &&
	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
		ze->ze_nblks++;
		return (0);
	}

	dprintf_bp(bp, "%s", "next blkptr:");

	/* start a new extent */
	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
	ze->ze_nblks = 1;
	list_insert_tail(&ma->ma_zv->zv_extents, ze);
	return (0);
}
398185029Spjd
399208047Smmstatic void
400208047Smmzvol_free_extents(zvol_state_t *zv)
401208047Smm{
402208047Smm	zvol_extent_t *ze;
403185029Spjd
404208047Smm	while (ze = list_head(&zv->zv_extents)) {
405208047Smm		list_remove(&zv->zv_extents, ze);
406208047Smm		kmem_free(ze, sizeof (zvol_extent_t));
407185029Spjd	}
408208047Smm}
409185029Spjd
410208047Smmstatic int
411208047Smmzvol_get_lbas(zvol_state_t *zv)
412208047Smm{
413219089Spjd	objset_t *os = zv->zv_objset;
414208047Smm	struct maparg	ma;
415208047Smm	int		err;
416185029Spjd
417208047Smm	ma.ma_zv = zv;
418208047Smm	ma.ma_blks = 0;
419208047Smm	zvol_free_extents(zv);
420208047Smm
421219089Spjd	/* commit any in-flight changes before traversing the dataset */
422219089Spjd	txg_wait_synced(dmu_objset_pool(os), 0);
423219089Spjd	err = traverse_dataset(dmu_objset_ds(os), 0,
424208047Smm	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
425208047Smm	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
426208047Smm		zvol_free_extents(zv);
427208047Smm		return (err ? err : EIO);
428185029Spjd	}
429185029Spjd
430185029Spjd	return (0);
431185029Spjd}
432185029Spjd
433185029Spjd/* ARGSUSED */
434185029Spjdvoid
435185029Spjdzvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
436185029Spjd{
437185029Spjd	zfs_creat_t *zct = arg;
438185029Spjd	nvlist_t *nvprops = zct->zct_props;
439168404Spjd	int error;
440168404Spjd	uint64_t volblocksize, volsize;
441168404Spjd
442185029Spjd	VERIFY(nvlist_lookup_uint64(nvprops,
443168404Spjd	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
444185029Spjd	if (nvlist_lookup_uint64(nvprops,
445168404Spjd	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
446168404Spjd		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
447168404Spjd
448168404Spjd	/*
449185029Spjd	 * These properties must be removed from the list so the generic
450168404Spjd	 * property setting step won't apply to them.
451168404Spjd	 */
452185029Spjd	VERIFY(nvlist_remove_all(nvprops,
453168404Spjd	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
454185029Spjd	(void) nvlist_remove_all(nvprops,
455168404Spjd	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
456168404Spjd
457168404Spjd	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
458168404Spjd	    DMU_OT_NONE, 0, tx);
459168404Spjd	ASSERT(error == 0);
460168404Spjd
461168404Spjd	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
462168404Spjd	    DMU_OT_NONE, 0, tx);
463168404Spjd	ASSERT(error == 0);
464168404Spjd
465168404Spjd	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
466168404Spjd	ASSERT(error == 0);
467168404Spjd}
468168404Spjd
469168404Spjd/*
470264732Smav * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
471264732Smav * implement DKIOCFREE/free-long-range.
472264732Smav */
473264732Smavstatic int
474264732Smavzvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
475264732Smav{
476264732Smav	uint64_t offset, length;
477264732Smav
478264732Smav	if (byteswap)
479264732Smav		byteswap_uint64_array(lr, sizeof (*lr));
480264732Smav
481264732Smav	offset = lr->lr_offset;
482264732Smav	length = lr->lr_length;
483264732Smav
484264732Smav	return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
485264732Smav}
486264732Smav
487264732Smav/*
488168404Spjd * Replay a TX_WRITE ZIL transaction that didn't get committed
489168404Spjd * after a system failure
490168404Spjd */
491168404Spjdstatic int
492168404Spjdzvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
493168404Spjd{
494168404Spjd	objset_t *os = zv->zv_objset;
495168404Spjd	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
496219089Spjd	uint64_t offset, length;
497168404Spjd	dmu_tx_t *tx;
498168404Spjd	int error;
499168404Spjd
500168404Spjd	if (byteswap)
501168404Spjd		byteswap_uint64_array(lr, sizeof (*lr));
502168404Spjd
503219089Spjd	offset = lr->lr_offset;
504219089Spjd	length = lr->lr_length;
505209962Smm
506219089Spjd	/* If it's a dmu_sync() block, write the whole block */
507219089Spjd	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
508219089Spjd		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
509219089Spjd		if (length < blocksize) {
510219089Spjd			offset -= offset % blocksize;
511219089Spjd			length = blocksize;
512219089Spjd		}
513219089Spjd	}
514219089Spjd
515168404Spjd	tx = dmu_tx_create(os);
516219089Spjd	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
517209962Smm	error = dmu_tx_assign(tx, TXG_WAIT);
518168404Spjd	if (error) {
519168404Spjd		dmu_tx_abort(tx);
520168404Spjd	} else {
521219089Spjd		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
522168404Spjd		dmu_tx_commit(tx);
523168404Spjd	}
524168404Spjd
525168404Spjd	return (error);
526168404Spjd}
527168404Spjd
/*
 * Replay callback for ZIL record types a zvol log can never contain.
 */
/* ARGSUSED */
static int
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
{
	return (SET_ERROR(ENOTSUP));
}
534168404Spjd
/*
 * Callback vectors for replaying records.
 * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* 0 no such transaction type */
	zvol_replay_err,	/* TX_CREATE */
	zvol_replay_err,	/* TX_MKDIR */
	zvol_replay_err,	/* TX_MKXATTR */
	zvol_replay_err,	/* TX_SYMLINK */
	zvol_replay_err,	/* TX_REMOVE */
	zvol_replay_err,	/* TX_RMDIR */
	zvol_replay_err,	/* TX_LINK */
	zvol_replay_err,	/* TX_RENAME */
	zvol_replay_write,	/* TX_WRITE */
	zvol_replay_truncate,	/* TX_TRUNCATE */
	zvol_replay_err,	/* TX_SETATTR */
	zvol_replay_err,	/* TX_ACL */
	zvol_replay_err,	/* TX_CREATE_ACL */
	zvol_replay_err,	/* TX_CREATE_ATTR */
	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL */
	zvol_replay_err,	/* TX_MKDIR_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
	zvol_replay_err,	/* TX_WRITE2 */
};
561168404Spjd
#ifdef illumos
/*
 * Look up the minor number for the named zvol, storing it in *minor
 * when a minor pointer is supplied.  Returns 0 if the zvol exists,
 * -1 otherwise.
 */
int
zvol_name2minor(const char *name, minor_t *minor)
{
	zvol_state_t *zv;

	mutex_enter(&zfsdev_state_lock);
	zv = zvol_minor_lookup(name);
	if (minor && zv)
		*minor = zv->zv_minor;
	mutex_exit(&zfsdev_state_lock);
	return (zv ? 0 : -1);
}
#endif	/* illumos */
576219089Spjd
577168404Spjd/*
578185029Spjd * Create a minor node (plus a whole lot more) for the specified volume.
579185029Spjd */
580185029Spjdint
581219089Spjdzvol_create_minor(const char *name)
582185029Spjd{
583219089Spjd	zfs_soft_state_t *zs;
584168404Spjd	zvol_state_t *zv;
585168404Spjd	objset_t *os;
586277483Ssmh	dmu_object_info_t doi;
587277483Ssmh#ifdef illumos
588277483Ssmh	minor_t minor = 0;
589277483Ssmh	char chrbuf[30], blkbuf[30];
590277483Ssmh#else
591265678Smav	struct g_provider *pp;
592265678Smav	struct g_geom *gp;
593265678Smav	uint64_t volsize, mode;
594277483Ssmh#endif
595168404Spjd	int error;
596168404Spjd
597277483Ssmh#ifndef illumos
598219089Spjd	ZFS_LOG(1, "Creating ZVOL %s...", name);
599277483Ssmh#endif
600168404Spjd
601277483Ssmh	mutex_enter(&zfsdev_state_lock);
602219089Spjd
603219089Spjd	if (zvol_minor_lookup(name) != NULL) {
604277483Ssmh		mutex_exit(&zfsdev_state_lock);
605249195Smm		return (SET_ERROR(EEXIST));
606168404Spjd	}
607168404Spjd
608219089Spjd	/* lie and say we're read-only */
609219089Spjd	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
610168404Spjd
611168404Spjd	if (error) {
612277483Ssmh		mutex_exit(&zfsdev_state_lock);
613219089Spjd		return (error);
614168404Spjd	}
615168404Spjd
616277483Ssmh#ifdef illumos
617219089Spjd	if ((minor = zfsdev_minor_alloc()) == 0) {
618219089Spjd		dmu_objset_disown(os, FTAG);
619277483Ssmh		mutex_exit(&zfsdev_state_lock);
620249195Smm		return (SET_ERROR(ENXIO));
621219089Spjd	}
622168404Spjd
623219089Spjd	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
624219089Spjd		dmu_objset_disown(os, FTAG);
625277483Ssmh		mutex_exit(&zfsdev_state_lock);
626249195Smm		return (SET_ERROR(EAGAIN));
627219089Spjd	}
628219089Spjd	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
629219089Spjd	    (char *)name);
630219089Spjd
631219089Spjd	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
632219089Spjd
633219089Spjd	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
634219089Spjd	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
635219089Spjd		ddi_soft_state_free(zfsdev_state, minor);
636219089Spjd		dmu_objset_disown(os, FTAG);
637277483Ssmh		mutex_exit(&zfsdev_state_lock);
638249195Smm		return (SET_ERROR(EAGAIN));
639219089Spjd	}
640219089Spjd
641219089Spjd	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
642219089Spjd
643219089Spjd	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
644219089Spjd	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
645219089Spjd		ddi_remove_minor_node(zfs_dip, chrbuf);
646219089Spjd		ddi_soft_state_free(zfsdev_state, minor);
647219089Spjd		dmu_objset_disown(os, FTAG);
648277483Ssmh		mutex_exit(&zfsdev_state_lock);
649249195Smm		return (SET_ERROR(EAGAIN));
650219089Spjd	}
651219089Spjd
652219089Spjd	zs = ddi_get_soft_state(zfsdev_state, minor);
653219089Spjd	zs->zss_type = ZSST_ZVOL;
654219089Spjd	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
655277483Ssmh#else	/* !illumos */
656219089Spjd
657265678Smav	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
658265678Smav	zv->zv_state = 0;
659241297Savg	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
660241297Savg	if (error) {
661265678Smav		kmem_free(zv, sizeof(*zv));
662241297Savg		dmu_objset_disown(os, zvol_tag);
663277483Ssmh		mutex_exit(&zfsdev_state_lock);
664241297Savg		return (error);
665241297Savg	}
666265678Smav	error = dsl_prop_get_integer(name,
667265678Smav	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL);
668265678Smav	if (error != 0 || mode == ZFS_VOLMODE_DEFAULT)
669265678Smav		mode = volmode;
670241297Savg
671219089Spjd	DROP_GIANT();
672241297Savg	zv->zv_volsize = volsize;
673265678Smav	zv->zv_volmode = mode;
674265678Smav	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
675265678Smav		g_topology_lock();
676265678Smav		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
677265678Smav		gp->start = zvol_geom_start;
678265678Smav		gp->access = zvol_geom_access;
679265678Smav		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
680265678Smav		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
681265678Smav		pp->sectorsize = DEV_BSIZE;
682265678Smav		pp->mediasize = zv->zv_volsize;
683265678Smav		pp->private = zv;
684241297Savg
685265678Smav		zv->zv_provider = pp;
686265678Smav		bioq_init(&zv->zv_queue);
687265678Smav		mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
688265678Smav	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
689297548Smav		struct make_dev_args args;
690297548Smav
691297548Smav		make_dev_args_init(&args);
692297548Smav		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
693297548Smav		args.mda_devsw = &zvol_cdevsw;
694297548Smav		args.mda_cr = NULL;
695297548Smav		args.mda_uid = UID_ROOT;
696297548Smav		args.mda_gid = GID_OPERATOR;
697297548Smav		args.mda_mode = 0640;
698297548Smav		args.mda_si_drv2 = zv;
699297548Smav		error = make_dev_s(&args, &zv->zv_dev,
700297548Smav		    "%s/%s", ZVOL_DRIVER, name);
701297547Smav		if (error != 0) {
702265678Smav			kmem_free(zv, sizeof(*zv));
703265678Smav			dmu_objset_disown(os, FTAG);
704277483Ssmh			mutex_exit(&zfsdev_state_lock);
705297547Smav			return (error);
706265678Smav		}
707297548Smav		zv->zv_dev->si_iosize_max = MAXPHYS;
708265678Smav	}
709265678Smav	LIST_INSERT_HEAD(&all_zvols, zv, zv_links);
710277483Ssmh#endif	/* illumos */
711219089Spjd
712219089Spjd	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
713168404Spjd	zv->zv_min_bs = DEV_BSHIFT;
714277483Ssmh#ifdef illumos
715277483Ssmh	zv->zv_minor = minor;
716277483Ssmh#endif
717168404Spjd	zv->zv_objset = os;
718219089Spjd	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
719219089Spjd		zv->zv_flags |= ZVOL_RDONLY;
720168404Spjd	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
721168404Spjd	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
722168404Spjd	    sizeof (rl_t), offsetof(rl_t, r_node));
723208047Smm	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
724208047Smm	    offsetof(zvol_extent_t, ze_node));
725168404Spjd	/* get and cache the blocksize */
726168404Spjd	error = dmu_object_info(os, ZVOL_OBJ, &doi);
727168404Spjd	ASSERT(error == 0);
728168404Spjd	zv->zv_volblocksize = doi.doi_data_block_size;
729168404Spjd
730219089Spjd	if (spa_writeable(dmu_objset_spa(os))) {
731219089Spjd		if (zil_replay_disable)
732219089Spjd			zil_destroy(dmu_objset_zil(os), B_FALSE);
733219089Spjd		else
734219089Spjd			zil_replay(os, zv, zvol_replay_vector);
735219089Spjd	}
736219089Spjd	dmu_objset_disown(os, FTAG);
737219089Spjd	zv->zv_objset = NULL;
738168404Spjd
739219089Spjd	zvol_minors++;
740168404Spjd
741277483Ssmh	mutex_exit(&zfsdev_state_lock);
742277483Ssmh#ifndef illumos
743265678Smav	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
744265678Smav		zvol_geom_run(zv);
745265678Smav		g_topology_unlock();
746265678Smav	}
747168404Spjd	PICKUP_GIANT();
748168404Spjd
749219089Spjd	ZFS_LOG(1, "ZVOL %s created.", name);
750277483Ssmh#endif
751219089Spjd
752219089Spjd	return (0);
753168404Spjd}
754168404Spjd
/*
 * Remove minor node for the specified volume.
 *
 * Tears down the device nodes (illumos) or the GEOM provider/cdev
 * (FreeBSD) and frees the in-core state.  Fails with EBUSY while the
 * volume is still open.  Caller must hold zfsdev_state_lock.
 */
static int
zvol_remove_zv(zvol_state_t *zv)
{
#ifdef illumos
	char nmbuf[20];
	minor_t minor = zv->zv_minor;
#endif

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
	if (zv->zv_total_opens != 0)
		return (SET_ERROR(EBUSY));

#ifdef illumos
	/* Remove both the raw (character) and the block minor nodes. */
	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
	ddi_remove_minor_node(zfs_dip, nmbuf);

	(void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
	ddi_remove_minor_node(zfs_dip, nmbuf);
#else
	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	LIST_REMOVE(zv, zv_links);
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		if (zv->zv_dev != NULL)
			destroy_dev(zv->zv_dev);
	}
#endif

	avl_destroy(&zv->zv_znode.z_range_avl);
	mutex_destroy(&zv->zv_znode.z_range_lock);

	kmem_free(zv, sizeof (zvol_state_t));
#ifdef illumos
	ddi_soft_state_free(zfsdev_state, minor);
#endif
	zvol_minors--;
	return (0);
}
800219089Spjd
801168404Spjdint
802168404Spjdzvol_remove_minor(const char *name)
803168404Spjd{
804168404Spjd	zvol_state_t *zv;
805219089Spjd	int rc;
806168404Spjd
807277483Ssmh	mutex_enter(&zfsdev_state_lock);
808168404Spjd	if ((zv = zvol_minor_lookup(name)) == NULL) {
809277483Ssmh		mutex_exit(&zfsdev_state_lock);
810249195Smm		return (SET_ERROR(ENXIO));
811168404Spjd	}
812219089Spjd	rc = zvol_remove_zv(zv);
813277483Ssmh	mutex_exit(&zfsdev_state_lock);
814219089Spjd	return (rc);
815219089Spjd}
816168404Spjd
/*
 * Set up state for the first open of a zvol: own the backing objset,
 * hold the bonus buffer, refresh the cached volume size, open the ZIL
 * and compute the effective read-only state from the readonly property,
 * snapshot-ness and pool writability.
 */
int
zvol_first_open(zvol_state_t *zv)
{
	objset_t *os;
	uint64_t volsize;
	int error;
	uint64_t readonly;

	/* lie and say we're read-only */
	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
	    zvol_tag, &os);
	if (error)
		return (error);

	zv->zv_objset = os;
	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error) {
		/* "size" should always exist; flag in debug, unwind anyway */
		ASSERT(error == 0);
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}

	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
	if (error) {
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}

	zvol_size_changed(zv, volsize);
	zv->zv_zilog = zil_open(os, zvol_get_data);

	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
	    NULL) == 0);
	if (readonly || dmu_objset_is_snapshot(os) ||
	    !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;
	else
		zv->zv_flags &= ~ZVOL_RDONLY;
	return (error);
}
857168404Spjd
/*
 * Undo zvol_first_open() once the last reference to the zvol is
 * dropped: close the ZIL, release the bonus hold, push out dirty
 * data, evict cached dbufs and disown the objset.
 */
void
zvol_last_close(zvol_state_t *zv)
{
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
	zv->zv_dbuf = NULL;

	/*
	 * Evict cached data.  If the dataset is dirty and writable,
	 * wait for it to sync first so nothing is lost on close.
	 */
	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
	    !(zv->zv_flags & ZVOL_RDONLY))
		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	dmu_objset_evict_dbufs(zv->zv_objset);

	dmu_objset_disown(zv->zv_objset, zvol_tag);
	zv->zv_objset = NULL;
}
878168404Spjd
879277483Ssmh#ifdef illumos
880168404Spjdint
881185029Spjdzvol_prealloc(zvol_state_t *zv)
882168404Spjd{
883185029Spjd	objset_t *os = zv->zv_objset;
884168404Spjd	dmu_tx_t *tx;
885185029Spjd	uint64_t refd, avail, usedobjs, availobjs;
886185029Spjd	uint64_t resid = zv->zv_volsize;
887185029Spjd	uint64_t off = 0;
888185029Spjd
889185029Spjd	/* Check the space usage before attempting to allocate the space */
890185029Spjd	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
891185029Spjd	if (avail < zv->zv_volsize)
892249195Smm		return (SET_ERROR(ENOSPC));
893185029Spjd
894185029Spjd	/* Free old extents if they exist */
895185029Spjd	zvol_free_extents(zv);
896185029Spjd
897185029Spjd	while (resid != 0) {
898185029Spjd		int error;
899276081Sdelphij		uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
900185029Spjd
901185029Spjd		tx = dmu_tx_create(os);
902185029Spjd		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
903185029Spjd		error = dmu_tx_assign(tx, TXG_WAIT);
904185029Spjd		if (error) {
905185029Spjd			dmu_tx_abort(tx);
906185029Spjd			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
907185029Spjd			return (error);
908185029Spjd		}
909219089Spjd		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
910185029Spjd		dmu_tx_commit(tx);
911185029Spjd		off += bytes;
912185029Spjd		resid -= bytes;
913185029Spjd	}
914185029Spjd	txg_wait_synced(dmu_objset_pool(os), 0);
915185029Spjd
916185029Spjd	return (0);
917185029Spjd}
918277483Ssmh#endif	/* illumos */
919185029Spjd
920248571Smmstatic int
921219089Spjdzvol_update_volsize(objset_t *os, uint64_t volsize)
922185029Spjd{
923185029Spjd	dmu_tx_t *tx;
924168404Spjd	int error;
925185029Spjd
926277483Ssmh	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
927185029Spjd
928219089Spjd	tx = dmu_tx_create(os);
929185029Spjd	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
930269002Sdelphij	dmu_tx_mark_netfree(tx);
931185029Spjd	error = dmu_tx_assign(tx, TXG_WAIT);
932185029Spjd	if (error) {
933185029Spjd		dmu_tx_abort(tx);
934185029Spjd		return (error);
935185029Spjd	}
936185029Spjd
937219089Spjd	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
938185029Spjd	    &volsize, tx);
939185029Spjd	dmu_tx_commit(tx);
940185029Spjd
941185029Spjd	if (error == 0)
942219089Spjd		error = dmu_free_long_range(os,
943185029Spjd		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
944219089Spjd	return (error);
945219089Spjd}
946185029Spjd
/*
 * Remove the minors of every zvol under the named pool/dataset
 * subtree (and, on FreeBSD, the dataset itself plus its snapshots).
 */
void
zvol_remove_minors(const char *name)
{
#ifdef illumos
	zvol_state_t *zv;
	char *namebuf;
	minor_t minor;

	/*
	 * Build the child prefix "name/".  namebuf is zero-filled, so
	 * strncpy() copying exactly strlen(name) bytes still leaves it
	 * NUL-terminated for the strcat() below.
	 */
	namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
	(void) strncpy(namebuf, name, strlen(name));
	(void) strcat(namebuf, "/");
	mutex_enter(&zfsdev_state_lock);
	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {

		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
		if (zv == NULL)
			continue;
		if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
			(void) zvol_remove_zv(zv);
	}
	kmem_free(namebuf, strlen(name) + 2);

	mutex_exit(&zfsdev_state_lock);
#else	/* !illumos */
	zvol_state_t *zv, *tzv;
	size_t namelen;

	namelen = strlen(name);

	DROP_GIANT();
	mutex_enter(&zfsdev_state_lock);

	/*
	 * Safe traversal: zvol_remove_zv() unlinks zv from all_zvols.
	 * Match the dataset itself, or any descendant ('/') or
	 * snapshot ('@') directly beneath the name.
	 */
	LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) {
		if (strcmp(zv->zv_name, name) == 0 ||
		    (strncmp(zv->zv_name, name, namelen) == 0 &&
		    strlen(zv->zv_name) > namelen && (zv->zv_name[namelen] == '/' ||
		    zv->zv_name[namelen] == '@'))) {
			(void) zvol_remove_zv(zv);
		}
	}

	mutex_exit(&zfsdev_state_lock);
	PICKUP_GIANT();
#endif	/* illumos */
}
992185029Spjd
/*
 * Apply an already-persisted size change to a live (attached) zvol:
 * update the in-core size, resize the dump area if the volume is
 * dumpified (ZVOL_DUMP), and on illumos post a LUN-expansion
 * sysevent.  On dump-resize failure the old size is restored both
 * on disk and in core.  Caller must hold zfsdev_state_lock.
 */
static int
zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
{
	uint64_t old_volsize = 0ULL;
	int error = 0;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));

	/*
	 * Reinitialize the dump area to the new size. If we
	 * failed to resize the dump area then restore it back to
	 * its original size.  We must set the new volsize prior
	 * to calling dumpvp_resize() to ensure that the devices'
	 * size(9P) is not visible by the dump subsystem.
	 */
	old_volsize = zv->zv_volsize;
	zvol_size_changed(zv, volsize);

#ifdef ZVOL_DUMP
	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		if ((error = zvol_dumpify(zv)) != 0 ||
		    (error = dumpvp_resize()) != 0) {
			int dumpify_error;

			/* Roll back the on-disk and in-core sizes. */
			(void) zvol_update_volsize(zv->zv_objset, old_volsize);
			zvol_size_changed(zv, old_volsize);
			dumpify_error = zvol_dumpify(zv);
			/* Prefer the re-dumpify error if rollback failed too. */
			error = dumpify_error ? dumpify_error : error;
		}
	}
#endif	/* ZVOL_DUMP */

#ifdef illumos
	/*
	 * Generate a LUN expansion event.
	 */
	if (error == 0) {
		sysevent_id_t eid;
		nvlist_t *attr;
		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
		    zv->zv_minor);

		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);

		nvlist_free(attr);
		kmem_free(physpath, MAXPATHLEN);
	}
#endif	/* illumos */
	return (error);
}
1049219089Spjd
/*
 * Set a new size for the named zvol.  Fails with EROFS when the
 * dataset's readonly property is set.  If no minor is attached (or
 * the attached minor has no objset), the objset is temporarily
 * owned for the duration of the update and disowned again at the
 * end; otherwise the live zvol's in-core state is resized as well.
 */
int
zvol_set_volsize(const char *name, uint64_t volsize)
{
	zvol_state_t *zv = NULL;
	objset_t *os;
	int error;
	dmu_object_info_t doi;
	uint64_t readonly;
	boolean_t owned = B_FALSE;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
	if (error != 0)
		return (error);
	if (readonly)
		return (SET_ERROR(EROFS));

	mutex_enter(&zfsdev_state_lock);
	zv = zvol_minor_lookup(name);

	if (zv == NULL || zv->zv_objset == NULL) {
		/* No live objset: own it ourselves for the update. */
		if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
		    FTAG, &os)) != 0) {
			mutex_exit(&zfsdev_state_lock);
			return (error);
		}
		owned = B_TRUE;
		if (zv != NULL)
			zv->zv_objset = os;
	} else {
		os = zv->zv_objset;
	}

	/* Validate the new size against the volume's block size. */
	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
	    (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
		goto out;

	error = zvol_update_volsize(os, volsize);

	if (error == 0 && zv != NULL)
		error = zvol_update_live_volsize(zv, volsize);
out:
	if (owned) {
		dmu_objset_disown(os, FTAG);
		if (zv != NULL)
			zv->zv_objset = NULL;
	}
	mutex_exit(&zfsdev_state_lock);
	return (error);
}
1100168404Spjd
/*
 * Open a zvol device node.  On the first open the backing objset is
 * attached via zvol_first_open(); later opens only bump the counts.
 * Enforces read-only (EROFS on FWRITE) and exclusive-open (FEXCL /
 * ZVOL_EXCL) semantics.  On an error after a successful first open,
 * the attachment is torn back down before returning.
 */
/*ARGSUSED*/
#ifdef illumos
int
zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
#else
static int
zvol_open(struct g_provider *pp, int flag, int count)
#endif
{
	zvol_state_t *zv;
	int err = 0;
#ifdef illumos

	mutex_enter(&zfsdev_state_lock);

	zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
	if (zv == NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(ENXIO));
	}

	if (zv->zv_total_opens == 0)
		err = zvol_first_open(zv);
	if (err) {
		mutex_exit(&zfsdev_state_lock);
		return (err);
	}
#else	/* !illumos */
	if (tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * if zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock
		 */
		return (EOPNOTSUPP);
	}

	mutex_enter(&zfsdev_state_lock);

	zv = pp->private;
	if (zv == NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(ENXIO));
	}

	if (zv->zv_total_opens == 0) {
		err = zvol_first_open(zv);
		if (err) {
			mutex_exit(&zfsdev_state_lock);
			return (err);
		}
		/* Publish the volume geometry to GEOM on first open. */
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}
#endif	/* illumos */
	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_total_opens != 0) {
			err = SET_ERROR(EBUSY);
			goto out;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

#ifdef illumos
	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
		zv->zv_open_count[otyp]++;
		zv->zv_total_opens++;
	}
	mutex_exit(&zfsdev_state_lock);
#else
	zv->zv_total_opens += count;
	mutex_exit(&zfsdev_state_lock);
#endif

	return (err);
out:
	/* Failed after a successful first open: detach again. */
	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);
#ifdef illumos
	mutex_exit(&zfsdev_state_lock);
#else
	mutex_exit(&zfsdev_state_lock);
#endif
	return (err);
}
1200219089Spjd
/*
 * Close a zvol device node: drop the open count(s) and, on the last
 * close, detach the backing objset via zvol_last_close().
 */
/*ARGSUSED*/
#ifdef illumos
int
zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	int error = 0;

	mutex_enter(&zfsdev_state_lock);

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL) {
		mutex_exit(&zfsdev_state_lock);
#else	/* !illumos */
static int
zvol_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t locked = B_FALSE;

	/* See comment in zvol_open(). */
	if (!MUTEX_HELD(&zfsdev_state_lock)) {
		mutex_enter(&zfsdev_state_lock);
		locked = B_TRUE;
	}

	zv = pp->private;
	if (zv == NULL) {
		if (locked)
			mutex_exit(&zfsdev_state_lock);
#endif	/* illumos */
		return (SET_ERROR(ENXIO));
	}

	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_total_opens == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
#ifdef illumos
	ASSERT(zv->zv_open_count[otyp] != 0);
#endif
	ASSERT(zv->zv_total_opens != 0);

	/*
	 * You may get multiple opens, but only one close.
	 */
#ifdef illumos
	zv->zv_open_count[otyp]--;
	zv->zv_total_opens--;
#else
	zv->zv_total_opens -= count;
#endif

	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);

#ifdef illumos
	mutex_exit(&zfsdev_state_lock);
#else
	if (locked)
		mutex_exit(&zfsdev_state_lock);
#endif
	return (error);
}
1272168404Spjd
/*
 * dmu_sync()/zvol_get_data() completion: release the dbuf hold and
 * the range lock taken in zvol_get_data(), record the synced block
 * in the ZIL on success, then free the zgd.  Order matters: the
 * dbuf must be released before the range lock is dropped.
 */
static void
zvol_get_done(zgd_t *zgd, int error)
{
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}
1286168404Spjd
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 * Called back from the ZIL at commit time.  With 'buf' non-NULL the
 * record is immediate: read the data straight into the log buffer.
 * With 'buf' NULL the record is indirect: hold the block's dbuf and
 * dmu_sync() it; zvol_get_done() releases everything when the sync
 * zio completes.  Returning 0 in the indirect case transfers
 * ownership of 'zgd' to the dmu_sync() callback.
 */
static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zvol_state_t *zv = arg;
	objset_t *os = zv->zv_objset;
	uint64_t object = ZVOL_OBJ;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;	/* length of user data */
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zv->zv_zilog;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
		    RL_READER);
		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's written out
		 * and its checksum is being calculated that no one can change
		 * the data. Contrarily to zfs_get_data we need not re-check
		 * blocksize after we get the lock because it cannot be changed.
		 */
		size = zv->zv_volblocksize;
		offset = P2ALIGN(offset, size);
		zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
		    RL_READER);
		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);
		if (error == 0) {
			/* If the block already has a bp, seed lr_blkptr. */
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zvol_get_done, zgd);

			/* Success: zvol_get_done() now owns the zgd. */
			if (error == 0)
				return (0);
		}
	}

	zvol_get_done(zgd, error);

	return (error);
}
1359219089Spjd
/*
 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
 *
 * We store data in the log buffers if it's small enough.
 * Otherwise we will later flush the data out via dmu_sync().
 */
ssize_t zvol_immediate_write_sz = 32768;

/*
 * Log one TX_WRITE itx per chunk of the write [off, off+resid).
 * 'sync' reflects whether the caller needs the write on stable
 * storage at zil_commit() time.  No-op during ZIL replay.
 */
static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
    boolean_t sync)
{
	uint32_t blocksize = zv->zv_volblocksize;
	zilog_t *zilog = zv->zv_zilog;
	itx_wr_state_t write_state;

	if (zil_replaying(zilog, tx))
		return;

	/*
	 * Choose the record flavor:
	 *   WR_INDIRECT  - data is flushed separately via dmu_sync()
	 *   WR_COPIED    - data is copied into the itx right now
	 *   WR_NEED_COPY - data is copied at ZIL commit time
	 */
	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
		write_state = WR_INDIRECT;
	else if (!spa_has_slogs(zilog->zl_spa) &&
	    resid >= blocksize && blocksize > zvol_immediate_write_sz)
		write_state = WR_INDIRECT;
	else if (sync)
		write_state = WR_COPIED;
	else
		write_state = WR_NEED_COPY;

	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		itx_wr_state_t wr_state = write_state;
		ssize_t len = resid;

		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
			wr_state = WR_NEED_COPY;
		else if (wr_state == WR_INDIRECT)
			/* Indirect chunks must not cross a block boundary. */
			len = MIN(blocksize - P2PHASE(off, blocksize), resid);

		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
		    (wr_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
			/* Copy failed: retry as a NEED_COPY record. */
			zil_itx_destroy(itx);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			wr_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = wr_state;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zv;

		if (!sync && (zv->zv_sync_cnt == 0))
			itx->itx_sync = B_FALSE;

		zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}
1429209962Smm
1430277483Ssmh#ifdef illumos
/*
 * Issue a raw read or write to the vdev(s) backing 'vd' for dump
 * I/O, bypassing the normal ZIO pipeline.  Mirror-like vdevs fan
 * out to their children (first successful child terminates a read);
 * raidz is delegated to vdev_raidz_physio().  In panic/dump context
 * ldi_dump() is used instead of regular LDI physio.
 */
static int
zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
    uint64_t size, boolean_t doread, boolean_t isdump)
{
	vdev_disk_t *dvd;
	int c;
	int numerrors = 0;

	if (vd->vdev_ops == &vdev_mirror_ops ||
	    vd->vdev_ops == &vdev_replacing_ops ||
	    vd->vdev_ops == &vdev_spare_ops) {
		for (c = 0; c < vd->vdev_children; c++) {
			int err = zvol_dumpio_vdev(vd->vdev_child[c],
			    addr, offset, origoffset, size, doread, isdump);
			if (err != 0) {
				numerrors++;
			} else if (doread) {
				/* One good copy is enough for a read. */
				break;
			}
		}
	}

	/* Interior non-raidz vdev: succeed unless every child failed. */
	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
		return (numerrors < vd->vdev_children ? 0 : EIO);

	if (doread && !vdev_readable(vd))
		return (SET_ERROR(EIO));
	else if (!doread && !vdev_writeable(vd))
		return (SET_ERROR(EIO));

	if (vd->vdev_ops == &vdev_raidz_ops) {
		return (vdev_raidz_physio(vd,
		    addr, size, offset, origoffset, doread, isdump));
	}

	/* Leaf vdev: account for the front label/boot area. */
	offset += VDEV_LABEL_START_SIZE;

	if (ddi_in_panic() || isdump) {
		ASSERT(!doread);
		if (doread)
			return (SET_ERROR(EIO));
		dvd = vd->vdev_tsd;
		ASSERT3P(dvd, !=, NULL);
		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
		    lbtodb(size)));
	} else {
		dvd = vd->vdev_tsd;
		ASSERT3P(dvd, !=, NULL);
		return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
		    offset, doread ? B_READ : B_WRITE));
	}
}
1483219089Spjd
1484219089Spjdstatic int
1485219089Spjdzvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1486219089Spjd    boolean_t doread, boolean_t isdump)
1487219089Spjd{
1488219089Spjd	vdev_t *vd;
1489219089Spjd	int error;
1490219089Spjd	zvol_extent_t *ze;
1491219089Spjd	spa_t *spa = dmu_objset_spa(zv->zv_objset);
1492219089Spjd
1493219089Spjd	/* Must be sector aligned, and not stradle a block boundary. */
1494219089Spjd	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1495219089Spjd	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1496249195Smm		return (SET_ERROR(EINVAL));
1497219089Spjd	}
1498219089Spjd	ASSERT(size <= zv->zv_volblocksize);
1499219089Spjd
1500219089Spjd	/* Locate the extent this belongs to */
1501219089Spjd	ze = list_head(&zv->zv_extents);
1502219089Spjd	while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
1503219089Spjd		offset -= ze->ze_nblks * zv->zv_volblocksize;
1504219089Spjd		ze = list_next(&zv->zv_extents, ze);
1505219089Spjd	}
1506219089Spjd
1507248571Smm	if (ze == NULL)
1508249195Smm		return (SET_ERROR(EINVAL));
1509248571Smm
1510219089Spjd	if (!ddi_in_panic())
1511219089Spjd		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1512219089Spjd
1513219089Spjd	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
1514219089Spjd	offset += DVA_GET_OFFSET(&ze->ze_dva);
1515255750Sdelphij	error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
1516255750Sdelphij	    size, doread, isdump);
1517219089Spjd
1518219089Spjd	if (!ddi_in_panic())
1519219089Spjd		spa_config_exit(spa, SCL_STATE, FTAG);
1520219089Spjd
1521219089Spjd	return (error);
1522219089Spjd}
1523219089Spjd
/*
 * Main zvol I/O entry point.  Validates the request, takes a range
 * lock over [off, off+resid), then services it in zvol_maxphys-sized
 * chunks through the DMU (or, on illumos, raw dump I/O when the
 * volume is dumpified).  FreeBSD additionally handles BIO_FLUSH and
 * BIO_DELETE here.  Synchronous writes are pushed to the ZIL via
 * zil_commit() after the range lock is dropped.
 */
#ifdef illumos
int
zvol_strategy(buf_t *bp)
{
	zfs_soft_state_t *zs = NULL;
#else	/* !illumos */
void
zvol_strategy(struct bio *bp)
{
#endif	/* illumos */
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	rl_t *rl;
	int error = 0;
#ifdef illumos
	boolean_t doread = bp->b_flags & B_READ;
#else
	boolean_t doread = 0;
#endif
	boolean_t is_dumpified;
	boolean_t sync;

#ifdef illumos
	if (getminor(bp->b_edev) == 0) {
		error = SET_ERROR(EINVAL);
	} else {
		zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
		if (zs == NULL)
			error = SET_ERROR(ENXIO);
		else if (zs->zss_type != ZSST_ZVOL)
			error = SET_ERROR(EINVAL);
	}

	if (error) {
		bioerror(bp, error);
		biodone(bp);
		return (0);
	}

	zv = zs->zss_data;

	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
		bioerror(bp, EROFS);
		biodone(bp);
		return (0);
	}

	off = ldbtob(bp->b_blkno);
#else	/* !illumos */
	/* GEOM requests carry the zv in bio_to; dev requests in si_drv2. */
	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
		error = SET_ERROR(EROFS);
		goto out;
	}

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		goto sync;
	case BIO_READ:
		doread = 1;
		/* FALLTHROUGH */
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		error = EOPNOTSUPP;
		goto out;
	}

	off = bp->bio_offset;
#endif	/* illumos */
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);

#ifdef illumos
	bp_mapin(bp);
	addr = bp->b_un.b_addr;
	resid = bp->b_bcount;

	if (resid > 0 && (off < 0 || off >= volsize)) {
		bioerror(bp, EIO);
		biodone(bp);
		return (0);
	}

	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
	sync = ((!(bp->b_flags & B_ASYNC) &&
	    !(zv->zv_flags & ZVOL_WCE)) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
	    !doread && !is_dumpified;
#else	/* !illumos */
	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && (off < 0 || off >= volsize)) {
		error = SET_ERROR(EIO);
		goto out;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
#endif	/* illumos */

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	rl = zfs_range_lock(&zv->zv_znode, off, resid,
	    doread ? RL_READER : RL_WRITER);

#ifndef illumos
	if (bp->bio_cmd == BIO_DELETE) {
		/* TRIM/UNMAP: log the truncate, then free the range. */
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
#endif
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
#ifdef illumos
		if (is_dumpified) {
			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
			error = zvol_dumpio(zv, addr, off, size,
			    doread, B_FALSE);
		} else if (doread) {
#else
		if (doread) {
#endif
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
#ifndef illumos
unlock:
#endif
	zfs_range_unlock(rl);

#ifdef illumos
	if ((bp->b_resid = resid) == bp->b_bcount)
		bioerror(bp, off > volsize ? EINVAL : error);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	biodone(bp);

	return (0);
#else	/* !illumos */
	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = EINVAL;

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
out:
	/* Complete through GEOM or the plain bio path, as appropriate. */
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
#endif	/* illumos */
}
1728219089Spjd
1729277483Ssmh#ifdef illumos
1730219089Spjd/*
1731219089Spjd * Set the buffer count to the zvol maximum transfer.
1732219089Spjd * Using our own routine instead of the default minphys()
1733219089Spjd * means that for larger writes we write bigger buffers on X86
1734219089Spjd * (128K instead of 56K) and flush the disk write cache less often
1735219089Spjd * (every zvol_maxphys - currently 1MB) instead of minphys (currently
1736219089Spjd * 56K on X86 and 128K on sparc).
1737219089Spjd */
1738219089Spjdvoid
1739219089Spjdzvol_minphys(struct buf *bp)
1740219089Spjd{
1741219089Spjd	if (bp->b_bcount > zvol_maxphys)
1742219089Spjd		bp->b_bcount = zvol_maxphys;
1743219089Spjd}
1744219089Spjd
1745219089Spjdint
1746219089Spjdzvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
1747219089Spjd{
1748219089Spjd	minor_t minor = getminor(dev);
1749219089Spjd	zvol_state_t *zv;
1750219089Spjd	int error = 0;
1751219089Spjd	uint64_t size;
1752219089Spjd	uint64_t boff;
1753219089Spjd	uint64_t resid;
1754219089Spjd
1755219089Spjd	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1756219089Spjd	if (zv == NULL)
1757249195Smm		return (SET_ERROR(ENXIO));
1758219089Spjd
1759248571Smm	if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
1760249195Smm		return (SET_ERROR(EINVAL));
1761248571Smm
1762219089Spjd	boff = ldbtob(blkno);
1763219089Spjd	resid = ldbtob(nblocks);
1764219089Spjd
1765219089Spjd	VERIFY3U(boff + resid, <=, zv->zv_volsize);
1766219089Spjd
1767219089Spjd	while (resid) {
1768219089Spjd		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
1769219089Spjd		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
1770219089Spjd		if (error)
1771219089Spjd			break;
1772219089Spjd		boff += size;
1773219089Spjd		addr += size;
1774219089Spjd		resid -= size;
1775219089Spjd	}
1776219089Spjd
1777168404Spjd	return (error);
1778168404Spjd}
1779168404Spjd
1780219089Spjd/*ARGSUSED*/
1781168404Spjdint
1782219089Spjdzvol_read(dev_t dev, uio_t *uio, cred_t *cr)
1783219089Spjd{
1784219089Spjd	minor_t minor = getminor(dev);
1785277483Ssmh#else	/* !illumos */
1786265678Smavint
1787265678Smavzvol_read(struct cdev *dev, struct uio *uio, int ioflag)
1788265678Smav{
1789277483Ssmh#endif	/* illumos */
1790219089Spjd	zvol_state_t *zv;
1791219089Spjd	uint64_t volsize;
1792219089Spjd	rl_t *rl;
1793219089Spjd	int error = 0;
1794219089Spjd
1795277483Ssmh#ifdef illumos
1796219089Spjd	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1797219089Spjd	if (zv == NULL)
1798249195Smm		return (SET_ERROR(ENXIO));
1799265678Smav#else
1800265678Smav	zv = dev->si_drv2;
1801265678Smav#endif
1802219089Spjd
1803219089Spjd	volsize = zv->zv_volsize;
1804277483Ssmh	/* uio_loffset == volsize isn't an error as its required for EOF processing. */
1805219089Spjd	if (uio->uio_resid > 0 &&
1806265678Smav	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
1807249195Smm		return (SET_ERROR(EIO));
1808219089Spjd
1809265678Smav#ifdef illumos
1810219089Spjd	if (zv->zv_flags & ZVOL_DUMPIFIED) {
1811219089Spjd		error = physio(zvol_strategy, NULL, dev, B_READ,
1812219089Spjd		    zvol_minphys, uio);
1813219089Spjd		return (error);
1814219089Spjd	}
1815265678Smav#endif
1816219089Spjd
1817219089Spjd	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1818219089Spjd	    RL_READER);
1819219089Spjd	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1820219089Spjd		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1821219089Spjd
1822219089Spjd		/* don't read past the end */
1823219089Spjd		if (bytes > volsize - uio->uio_loffset)
1824219089Spjd			bytes = volsize - uio->uio_loffset;
1825219089Spjd
1826277699Smav		error =  dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes);
1827219089Spjd		if (error) {
1828219089Spjd			/* convert checksum errors into IO errors */
1829219089Spjd			if (error == ECKSUM)
1830249195Smm				error = SET_ERROR(EIO);
1831219089Spjd			break;
1832219089Spjd		}
1833219089Spjd	}
1834219089Spjd	zfs_range_unlock(rl);
1835219089Spjd	return (error);
1836219089Spjd}
1837219089Spjd
1838277483Ssmh#ifdef illumos
1839219089Spjd/*ARGSUSED*/
1840219089Spjdint
1841219089Spjdzvol_write(dev_t dev, uio_t *uio, cred_t *cr)
1842219089Spjd{
1843219089Spjd	minor_t minor = getminor(dev);
1844277483Ssmh#else	/* !illumos */
1845265678Smavint
1846265678Smavzvol_write(struct cdev *dev, struct uio *uio, int ioflag)
1847265678Smav{
1848277483Ssmh#endif	/* illumos */
1849219089Spjd	zvol_state_t *zv;
1850219089Spjd	uint64_t volsize;
1851219089Spjd	rl_t *rl;
1852219089Spjd	int error = 0;
1853219089Spjd	boolean_t sync;
1854219089Spjd
1855277483Ssmh#ifdef illumos
1856219089Spjd	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1857219089Spjd	if (zv == NULL)
1858249195Smm		return (SET_ERROR(ENXIO));
1859265678Smav#else
1860265678Smav	zv = dev->si_drv2;
1861265678Smav#endif
1862219089Spjd
1863219089Spjd	volsize = zv->zv_volsize;
1864277483Ssmh	/* uio_loffset == volsize isn't an error as its required for EOF processing. */
1865219089Spjd	if (uio->uio_resid > 0 &&
1866265678Smav	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
1867249195Smm		return (SET_ERROR(EIO));
1868219089Spjd
1869265678Smav#ifdef illumos
1870219089Spjd	if (zv->zv_flags & ZVOL_DUMPIFIED) {
1871219089Spjd		error = physio(zvol_strategy, NULL, dev, B_WRITE,
1872219089Spjd		    zvol_minphys, uio);
1873219089Spjd		return (error);
1874219089Spjd	}
1875219089Spjd
1876219089Spjd	sync = !(zv->zv_flags & ZVOL_WCE) ||
1877268274Smav#else
1878272615Smav	sync = (ioflag & IO_SYNC) ||
1879268274Smav#endif
1880219089Spjd	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1881219089Spjd
1882219089Spjd	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1883219089Spjd	    RL_WRITER);
1884219089Spjd	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1885219089Spjd		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1886219089Spjd		uint64_t off = uio->uio_loffset;
1887219089Spjd		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1888219089Spjd
1889219089Spjd		if (bytes > volsize - off)	/* don't write past the end */
1890219089Spjd			bytes = volsize - off;
1891219089Spjd
1892219089Spjd		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
1893219089Spjd		error = dmu_tx_assign(tx, TXG_WAIT);
1894219089Spjd		if (error) {
1895219089Spjd			dmu_tx_abort(tx);
1896219089Spjd			break;
1897219089Spjd		}
1898219089Spjd		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
1899219089Spjd		if (error == 0)
1900219089Spjd			zvol_log_write(zv, tx, off, bytes, sync);
1901219089Spjd		dmu_tx_commit(tx);
1902219089Spjd
1903219089Spjd		if (error)
1904219089Spjd			break;
1905219089Spjd	}
1906219089Spjd	zfs_range_unlock(rl);
1907219089Spjd	if (sync)
1908219089Spjd		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1909219089Spjd	return (error);
1910219089Spjd}
1911219089Spjd
1912277483Ssmh#ifdef illumos
1913219089Spjdint
1914219089Spjdzvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
1915219089Spjd{
1916219089Spjd	struct uuid uuid = EFI_RESERVED;
1917219089Spjd	efi_gpe_t gpe = { 0 };
1918219089Spjd	uint32_t crc;
1919219089Spjd	dk_efi_t efi;
1920219089Spjd	int length;
1921219089Spjd	char *ptr;
1922219089Spjd
1923219089Spjd	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
1924249195Smm		return (SET_ERROR(EFAULT));
1925219089Spjd	ptr = (char *)(uintptr_t)efi.dki_data_64;
1926219089Spjd	length = efi.dki_length;
1927219089Spjd	/*
1928219089Spjd	 * Some clients may attempt to request a PMBR for the
1929219089Spjd	 * zvol.  Currently this interface will return EINVAL to
1930219089Spjd	 * such requests.  These requests could be supported by
1931219089Spjd	 * adding a check for lba == 0 and consing up an appropriate
1932219089Spjd	 * PMBR.
1933219089Spjd	 */
1934219089Spjd	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
1935249195Smm		return (SET_ERROR(EINVAL));
1936219089Spjd
1937219089Spjd	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
1938219089Spjd	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
1939219089Spjd	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
1940219089Spjd
1941219089Spjd	if (efi.dki_lba == 1) {
1942219089Spjd		efi_gpt_t gpt = { 0 };
1943219089Spjd
1944219089Spjd		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
1945219089Spjd		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
1946219089Spjd		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
1947219089Spjd		gpt.efi_gpt_MyLBA = LE_64(1ULL);
1948219089Spjd		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
1949219089Spjd		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
1950219089Spjd		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
1951219089Spjd		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
1952219089Spjd		gpt.efi_gpt_SizeOfPartitionEntry =
1953219089Spjd		    LE_32(sizeof (efi_gpe_t));
1954219089Spjd		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
1955219089Spjd		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
1956219089Spjd		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
1957219089Spjd		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
1958219089Spjd		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
1959219089Spjd		    flag))
1960249195Smm			return (SET_ERROR(EFAULT));
1961219089Spjd		ptr += sizeof (gpt);
1962219089Spjd		length -= sizeof (gpt);
1963219089Spjd	}
1964219089Spjd	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
1965219089Spjd	    length), flag))
1966249195Smm		return (SET_ERROR(EFAULT));
1967219089Spjd	return (0);
1968219089Spjd}
1969219089Spjd
1970219089Spjd/*
1971219089Spjd * BEGIN entry points to allow external callers access to the volume.
1972219089Spjd */
1973219089Spjd/*
1974219089Spjd * Return the volume parameters needed for access from an external caller.
1975219089Spjd * These values are invariant as long as the volume is held open.
1976219089Spjd */
1977219089Spjdint
1978219089Spjdzvol_get_volume_params(minor_t minor, uint64_t *blksize,
1979219089Spjd    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
1980219089Spjd    void **rl_hdl, void **bonus_hdl)
1981219089Spjd{
1982219089Spjd	zvol_state_t *zv;
1983219089Spjd
1984219089Spjd	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1985219089Spjd	if (zv == NULL)
1986249195Smm		return (SET_ERROR(ENXIO));
1987219089Spjd	if (zv->zv_flags & ZVOL_DUMPIFIED)
1988249195Smm		return (SET_ERROR(ENXIO));
1989219089Spjd
1990219089Spjd	ASSERT(blksize && max_xfer_len && minor_hdl &&
1991219089Spjd	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
1992219089Spjd
1993219089Spjd	*blksize = zv->zv_volblocksize;
1994219089Spjd	*max_xfer_len = (uint64_t)zvol_maxphys;
1995219089Spjd	*minor_hdl = zv;
1996219089Spjd	*objset_hdl = zv->zv_objset;
1997219089Spjd	*zil_hdl = zv->zv_zilog;
1998219089Spjd	*rl_hdl = &zv->zv_znode;
1999219089Spjd	*bonus_hdl = zv->zv_dbuf;
2000219089Spjd	return (0);
2001219089Spjd}
2002219089Spjd
2003219089Spjd/*
2004219089Spjd * Return the current volume size to an external caller.
2005219089Spjd * The size can change while the volume is open.
2006219089Spjd */
2007219089Spjduint64_t
2008219089Spjdzvol_get_volume_size(void *minor_hdl)
2009219089Spjd{
2010219089Spjd	zvol_state_t *zv = minor_hdl;
2011219089Spjd
2012219089Spjd	return (zv->zv_volsize);
2013219089Spjd}
2014219089Spjd
2015219089Spjd/*
2016219089Spjd * Return the current WCE setting to an external caller.
2017219089Spjd * The WCE setting can change while the volume is open.
2018219089Spjd */
2019219089Spjdint
2020219089Spjdzvol_get_volume_wce(void *minor_hdl)
2021219089Spjd{
2022219089Spjd	zvol_state_t *zv = minor_hdl;
2023219089Spjd
2024219089Spjd	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
2025219089Spjd}
2026219089Spjd
2027219089Spjd/*
2028219089Spjd * Entry point for external callers to zvol_log_write
2029219089Spjd */
2030219089Spjdvoid
2031219089Spjdzvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
2032219089Spjd    boolean_t sync)
2033219089Spjd{
2034219089Spjd	zvol_state_t *zv = minor_hdl;
2035219089Spjd
2036219089Spjd	zvol_log_write(zv, tx, off, resid, sync);
2037219089Spjd}
2038219089Spjd/*
2039219089Spjd * END entry points to allow external callers access to the volume.
2040219089Spjd */
2041277483Ssmh#endif	/* illumos */
2042219089Spjd
2043219089Spjd/*
2044264732Smav * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
2045264732Smav */
2046264732Smavstatic void
2047264732Smavzvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
2048264732Smav    boolean_t sync)
2049264732Smav{
2050264732Smav	itx_t *itx;
2051264732Smav	lr_truncate_t *lr;
2052264732Smav	zilog_t *zilog = zv->zv_zilog;
2053264732Smav
2054264732Smav	if (zil_replaying(zilog, tx))
2055264732Smav		return;
2056264732Smav
2057264732Smav	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
2058264732Smav	lr = (lr_truncate_t *)&itx->itx_lr;
2059264732Smav	lr->lr_foid = ZVOL_OBJ;
2060264732Smav	lr->lr_offset = off;
2061264732Smav	lr->lr_length = len;
2062264732Smav
2063308596Smav	itx->itx_sync = (sync || zv->zv_sync_cnt != 0);
2064264732Smav	zil_itx_assign(zilog, itx, tx);
2065264732Smav}
2066264732Smav
2067277483Ssmh#ifdef illumos
2068264732Smav/*
2069219089Spjd * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
2070264732Smav * Also a dirtbag dkio ioctl for unmap/free-block functionality.
2071219089Spjd */
2072219089Spjd/*ARGSUSED*/
2073219089Spjdint
2074219089Spjdzvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
2075219089Spjd{
2076219089Spjd	zvol_state_t *zv;
2077219089Spjd	struct dk_callback *dkc;
2078219089Spjd	int error = 0;
2079219089Spjd	rl_t *rl;
2080219089Spjd
2081277483Ssmh	mutex_enter(&zfsdev_state_lock);
2082219089Spjd
2083219089Spjd	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
2084219089Spjd
2085219089Spjd	if (zv == NULL) {
2086277483Ssmh		mutex_exit(&zfsdev_state_lock);
2087249195Smm		return (SET_ERROR(ENXIO));
2088219089Spjd	}
2089219089Spjd	ASSERT(zv->zv_total_opens > 0);
2090219089Spjd
2091219089Spjd	switch (cmd) {
2092219089Spjd
2093219089Spjd	case DKIOCINFO:
2094265677Smav	{
2095265677Smav		struct dk_cinfo dki;
2096265677Smav
2097219089Spjd		bzero(&dki, sizeof (dki));
2098219089Spjd		(void) strcpy(dki.dki_cname, "zvol");
2099219089Spjd		(void) strcpy(dki.dki_dname, "zvol");
2100219089Spjd		dki.dki_ctype = DKC_UNKNOWN;
2101219089Spjd		dki.dki_unit = getminor(dev);
2102276081Sdelphij		dki.dki_maxtransfer =
2103276081Sdelphij		    1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
2104277483Ssmh		mutex_exit(&zfsdev_state_lock);
2105219089Spjd		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
2106249195Smm			error = SET_ERROR(EFAULT);
2107219089Spjd		return (error);
2108265677Smav	}
2109219089Spjd
2110219089Spjd	case DKIOCGMEDIAINFO:
2111265677Smav	{
2112265677Smav		struct dk_minfo dkm;
2113265677Smav
2114219089Spjd		bzero(&dkm, sizeof (dkm));
2115219089Spjd		dkm.dki_lbsize = 1U << zv->zv_min_bs;
2116219089Spjd		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
2117219089Spjd		dkm.dki_media_type = DK_UNKNOWN;
2118277483Ssmh		mutex_exit(&zfsdev_state_lock);
2119219089Spjd		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
2120249195Smm			error = SET_ERROR(EFAULT);
2121219089Spjd		return (error);
2122265677Smav	}
2123219089Spjd
2124265677Smav	case DKIOCGMEDIAINFOEXT:
2125265677Smav	{
2126265677Smav		struct dk_minfo_ext dkmext;
2127265677Smav
2128265677Smav		bzero(&dkmext, sizeof (dkmext));
2129265677Smav		dkmext.dki_lbsize = 1U << zv->zv_min_bs;
2130265677Smav		dkmext.dki_pbsize = zv->zv_volblocksize;
2131265677Smav		dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
2132265677Smav		dkmext.dki_media_type = DK_UNKNOWN;
2133277483Ssmh		mutex_exit(&zfsdev_state_lock);
2134265677Smav		if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
2135265677Smav			error = SET_ERROR(EFAULT);
2136265677Smav		return (error);
2137265677Smav	}
2138265677Smav
2139219089Spjd	case DKIOCGETEFI:
2140265677Smav	{
2141265677Smav		uint64_t vs = zv->zv_volsize;
2142265677Smav		uint8_t bs = zv->zv_min_bs;
2143219089Spjd
2144277483Ssmh		mutex_exit(&zfsdev_state_lock);
2145265677Smav		error = zvol_getefi((void *)arg, flag, vs, bs);
2146265677Smav		return (error);
2147265677Smav	}
2148219089Spjd
2149219089Spjd	case DKIOCFLUSHWRITECACHE:
2150219089Spjd		dkc = (struct dk_callback *)arg;
2151277483Ssmh		mutex_exit(&zfsdev_state_lock);
2152219089Spjd		zil_commit(zv->zv_zilog, ZVOL_OBJ);
2153219089Spjd		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
2154219089Spjd			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
2155219089Spjd			error = 0;
2156219089Spjd		}
2157219089Spjd		return (error);
2158219089Spjd
2159219089Spjd	case DKIOCGETWCE:
2160265677Smav	{
2161265677Smav		int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
2162265677Smav		if (ddi_copyout(&wce, (void *)arg, sizeof (int),
2163265677Smav		    flag))
2164265677Smav			error = SET_ERROR(EFAULT);
2165265677Smav		break;
2166265677Smav	}
2167265677Smav	case DKIOCSETWCE:
2168265677Smav	{
2169265677Smav		int wce;
2170265677Smav		if (ddi_copyin((void *)arg, &wce, sizeof (int),
2171265677Smav		    flag)) {
2172265677Smav			error = SET_ERROR(EFAULT);
2173219089Spjd			break;
2174219089Spjd		}
2175265677Smav		if (wce) {
2176265677Smav			zv->zv_flags |= ZVOL_WCE;
2177277483Ssmh			mutex_exit(&zfsdev_state_lock);
2178265677Smav		} else {
2179265677Smav			zv->zv_flags &= ~ZVOL_WCE;
2180277483Ssmh			mutex_exit(&zfsdev_state_lock);
2181265677Smav			zil_commit(zv->zv_zilog, ZVOL_OBJ);
2182219089Spjd		}
2183265677Smav		return (0);
2184265677Smav	}
2185219089Spjd
2186219089Spjd	case DKIOCGGEOM:
2187219089Spjd	case DKIOCGVTOC:
2188219089Spjd		/*
2189219089Spjd		 * commands using these (like prtvtoc) expect ENOTSUP
2190219089Spjd		 * since we're emulating an EFI label
2191219089Spjd		 */
2192249195Smm		error = SET_ERROR(ENOTSUP);
2193219089Spjd		break;
2194219089Spjd
2195219089Spjd	case DKIOCDUMPINIT:
2196219089Spjd		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
2197219089Spjd		    RL_WRITER);
2198219089Spjd		error = zvol_dumpify(zv);
2199219089Spjd		zfs_range_unlock(rl);
2200219089Spjd		break;
2201219089Spjd
2202219089Spjd	case DKIOCDUMPFINI:
2203219089Spjd		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
2204219089Spjd			break;
2205219089Spjd		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
2206219089Spjd		    RL_WRITER);
2207219089Spjd		error = zvol_dump_fini(zv);
2208219089Spjd		zfs_range_unlock(rl);
2209219089Spjd		break;
2210219089Spjd
2211249195Smm	case DKIOCFREE:
2212249195Smm	{
2213249195Smm		dkioc_free_t df;
2214249195Smm		dmu_tx_t *tx;
2215249195Smm
2216273345Sdelphij		if (!zvol_unmap_enabled)
2217273345Sdelphij			break;
2218273345Sdelphij
2219249195Smm		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
2220249195Smm			error = SET_ERROR(EFAULT);
2221249195Smm			break;
2222249195Smm		}
2223249195Smm
2224249195Smm		/*
2225249195Smm		 * Apply Postel's Law to length-checking.  If they overshoot,
2226249195Smm		 * just blank out until the end, if there's a need to blank
2227249195Smm		 * out anything.
2228249195Smm		 */
2229249195Smm		if (df.df_start >= zv->zv_volsize)
2230249195Smm			break;	/* No need to do anything... */
2231249195Smm
2232277483Ssmh		mutex_exit(&zfsdev_state_lock);
2233277482Ssmh
2234249195Smm		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
2235249195Smm		    RL_WRITER);
2236249195Smm		tx = dmu_tx_create(zv->zv_objset);
2237269002Sdelphij		dmu_tx_mark_netfree(tx);
2238249195Smm		error = dmu_tx_assign(tx, TXG_WAIT);
2239249195Smm		if (error != 0) {
2240249195Smm			dmu_tx_abort(tx);
2241249195Smm		} else {
2242249195Smm			zvol_log_truncate(zv, tx, df.df_start,
2243249195Smm			    df.df_length, B_TRUE);
2244249195Smm			dmu_tx_commit(tx);
2245249195Smm			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
2246249195Smm			    df.df_start, df.df_length);
2247249195Smm		}
2248249195Smm
2249249195Smm		zfs_range_unlock(rl);
2250249195Smm
2251249195Smm		if (error == 0) {
2252249195Smm			/*
2253249195Smm			 * If the write-cache is disabled or 'sync' property
2254249195Smm			 * is set to 'always' then treat this as a synchronous
2255249195Smm			 * operation (i.e. commit to zil).
2256249195Smm			 */
2257249195Smm			if (!(zv->zv_flags & ZVOL_WCE) ||
2258249195Smm			    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
2259249195Smm				zil_commit(zv->zv_zilog, ZVOL_OBJ);
2260249195Smm
2261249195Smm			/*
2262249195Smm			 * If the caller really wants synchronous writes, and
2263249195Smm			 * can't wait for them, don't return until the write
2264249195Smm			 * is done.
2265249195Smm			 */
2266249195Smm			if (df.df_flags & DF_WAIT_SYNC) {
2267249195Smm				txg_wait_synced(
2268249195Smm				    dmu_objset_pool(zv->zv_objset), 0);
2269249195Smm			}
2270249195Smm		}
2271277482Ssmh		return (error);
2272249195Smm	}
2273249195Smm
2274219089Spjd	default:
2275249195Smm		error = SET_ERROR(ENOTTY);
2276219089Spjd		break;
2277219089Spjd
2278219089Spjd	}
2279277483Ssmh	mutex_exit(&zfsdev_state_lock);
2280219089Spjd	return (error);
2281219089Spjd}
2282277483Ssmh#endif	/* illumos */
2283219089Spjd
2284219089Spjdint
2285168404Spjdzvol_busy(void)
2286168404Spjd{
2287168404Spjd	return (zvol_minors != 0);
2288168404Spjd}
2289168404Spjd
2290168404Spjdvoid
2291168404Spjdzvol_init(void)
2292168404Spjd{
2293219089Spjd	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
2294219089Spjd	    1) == 0);
2295277483Ssmh#ifdef illumos
2296277483Ssmh	mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
2297277483Ssmh#else
2298168404Spjd	ZFS_LOG(1, "ZVOL Initialized.");
2299277483Ssmh#endif
2300168404Spjd}
2301168404Spjd
2302168404Spjdvoid
2303168404Spjdzvol_fini(void)
2304168404Spjd{
2305277483Ssmh#ifdef illumos
2306277483Ssmh	mutex_destroy(&zfsdev_state_lock);
2307277483Ssmh#endif
2308219089Spjd	ddi_soft_state_fini(&zfsdev_state);
2309168404Spjd	ZFS_LOG(1, "ZVOL Deinitialized.");
2310168404Spjd}
2311185029Spjd
2312277483Ssmh#ifdef illumos
2313255750Sdelphij/*ARGSUSED*/
2314185029Spjdstatic int
2315255750Sdelphijzfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
2316255750Sdelphij{
2317255750Sdelphij	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
2318255750Sdelphij
2319263390Sdelphij	if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
2320255750Sdelphij		return (1);
2321255750Sdelphij	return (0);
2322255750Sdelphij}
2323255750Sdelphij
2324255750Sdelphij/*ARGSUSED*/
2325255750Sdelphijstatic void
2326255750Sdelphijzfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
2327255750Sdelphij{
2328255750Sdelphij	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
2329255750Sdelphij
2330263390Sdelphij	spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
2331255750Sdelphij}
2332255750Sdelphij
2333255750Sdelphijstatic int
2334185029Spjdzvol_dump_init(zvol_state_t *zv, boolean_t resize)
2335185029Spjd{
2336185029Spjd	dmu_tx_t *tx;
2337255750Sdelphij	int error;
2338185029Spjd	objset_t *os = zv->zv_objset;
2339255750Sdelphij	spa_t *spa = dmu_objset_spa(os);
2340255750Sdelphij	vdev_t *vd = spa->spa_root_vdev;
2341185029Spjd	nvlist_t *nv = NULL;
2342255750Sdelphij	uint64_t version = spa_version(spa);
2343290746Smav	uint64_t checksum, compress, refresrv, vbs, dedup;
2344185029Spjd
2345277483Ssmh	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
2346255750Sdelphij	ASSERT(vd->vdev_ops == &vdev_root_ops);
2347255750Sdelphij
2348219089Spjd	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
2349219089Spjd	    DMU_OBJECT_END);
2350290746Smav	if (error != 0)
2351290746Smav		return (error);
2352219089Spjd	/* wait for dmu_free_long_range to actually free the blocks */
2353219089Spjd	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2354185029Spjd
2355255750Sdelphij	/*
2356255750Sdelphij	 * If the pool on which the dump device is being initialized has more
2357255750Sdelphij	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
2358255750Sdelphij	 * enabled.  If so, bump that feature's counter to indicate that the
2359255750Sdelphij	 * feature is active. We also check the vdev type to handle the
2360255750Sdelphij	 * following case:
2361255750Sdelphij	 *   # zpool create test raidz disk1 disk2 disk3
2362255750Sdelphij	 *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
2363255750Sdelphij	 *   the raidz vdev itself has 3 children.
2364255750Sdelphij	 */
2365255750Sdelphij	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
2366255750Sdelphij		if (!spa_feature_is_enabled(spa,
2367263390Sdelphij		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
2368255750Sdelphij			return (SET_ERROR(ENOTSUP));
2369255750Sdelphij		(void) dsl_sync_task(spa_name(spa),
2370255750Sdelphij		    zfs_mvdev_dump_feature_check,
2371269006Sdelphij		    zfs_mvdev_dump_activate_feature_sync, NULL,
2372269006Sdelphij		    2, ZFS_SPACE_CHECK_RESERVED);
2373255750Sdelphij	}
2374255750Sdelphij
2375290746Smav	if (!resize) {
2376290746Smav		error = dsl_prop_get_integer(zv->zv_name,
2377290746Smav		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
2378290746Smav		if (error == 0) {
2379290746Smav			error = dsl_prop_get_integer(zv->zv_name,
2380290746Smav			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
2381290746Smav			    NULL);
2382290746Smav		}
2383290746Smav		if (error == 0) {
2384290746Smav			error = dsl_prop_get_integer(zv->zv_name,
2385290746Smav			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
2386290746Smav			    &refresrv, NULL);
2387290746Smav		}
2388290746Smav		if (error == 0) {
2389290746Smav			error = dsl_prop_get_integer(zv->zv_name,
2390290746Smav			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
2391290746Smav			    NULL);
2392290746Smav		}
2393290746Smav		if (version >= SPA_VERSION_DEDUP && error == 0) {
2394290746Smav			error = dsl_prop_get_integer(zv->zv_name,
2395290746Smav			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
2396290746Smav		}
2397290746Smav	}
2398290746Smav	if (error != 0)
2399290746Smav		return (error);
2400290746Smav
2401185029Spjd	tx = dmu_tx_create(os);
2402185029Spjd	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2403219089Spjd	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2404185029Spjd	error = dmu_tx_assign(tx, TXG_WAIT);
2405290746Smav	if (error != 0) {
2406185029Spjd		dmu_tx_abort(tx);
2407185029Spjd		return (error);
2408185029Spjd	}
2409185029Spjd
2410185029Spjd	/*
2411185029Spjd	 * If we are resizing the dump device then we only need to
2412185029Spjd	 * update the refreservation to match the newly updated
2413185029Spjd	 * zvolsize. Otherwise, we save off the original state of the
2414185029Spjd	 * zvol so that we can restore them if the zvol is ever undumpified.
2415185029Spjd	 */
2416185029Spjd	if (resize) {
2417185029Spjd		error = zap_update(os, ZVOL_ZAP_OBJ,
2418185029Spjd		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
2419185029Spjd		    &zv->zv_volsize, tx);
2420185029Spjd	} else {
2421290746Smav		error = zap_update(os, ZVOL_ZAP_OBJ,
2422185029Spjd		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
2423185029Spjd		    &compress, tx);
2424290746Smav		if (error == 0) {
2425290746Smav			error = zap_update(os, ZVOL_ZAP_OBJ,
2426290746Smav			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
2427290746Smav			    &checksum, tx);
2428290746Smav		}
2429290746Smav		if (error == 0) {
2430290746Smav			error = zap_update(os, ZVOL_ZAP_OBJ,
2431290746Smav			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
2432290746Smav			    &refresrv, tx);
2433290746Smav		}
2434290746Smav		if (error == 0) {
2435290746Smav			error = zap_update(os, ZVOL_ZAP_OBJ,
2436290746Smav			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
2437290746Smav			    &vbs, tx);
2438290746Smav		}
2439290746Smav		if (error == 0) {
2440290746Smav			error = dmu_object_set_blocksize(
2441290746Smav			    os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
2442290746Smav		}
2443290746Smav		if (version >= SPA_VERSION_DEDUP && error == 0) {
2444290746Smav			error = zap_update(os, ZVOL_ZAP_OBJ,
2445219089Spjd			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
2446219089Spjd			    &dedup, tx);
2447219089Spjd		}
2448219089Spjd		if (error == 0)
2449276081Sdelphij			zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
2450185029Spjd	}
2451185029Spjd	dmu_tx_commit(tx);
2452185029Spjd
2453185029Spjd	/*
2454185029Spjd	 * We only need update the zvol's property if we are initializing
2455185029Spjd	 * the dump area for the first time.
2456185029Spjd	 */
2457290746Smav	if (error == 0 && !resize) {
2458290746Smav		/*
2459290746Smav		 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
2460290746Smav		 * function.  Otherwise, use the old default -- OFF.
2461290746Smav		 */
2462290746Smav		checksum = spa_feature_is_active(spa,
2463290746Smav		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
2464290746Smav		    ZIO_CHECKSUM_OFF;
2465290746Smav
2466185029Spjd		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2467185029Spjd		VERIFY(nvlist_add_uint64(nv,
2468185029Spjd		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
2469185029Spjd		VERIFY(nvlist_add_uint64(nv,
2470185029Spjd		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
2471185029Spjd		    ZIO_COMPRESS_OFF) == 0);
2472185029Spjd		VERIFY(nvlist_add_uint64(nv,
2473185029Spjd		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
2474255750Sdelphij		    checksum) == 0);
2475219089Spjd		if (version >= SPA_VERSION_DEDUP) {
2476219089Spjd			VERIFY(nvlist_add_uint64(nv,
2477219089Spjd			    zfs_prop_to_name(ZFS_PROP_DEDUP),
2478219089Spjd			    ZIO_CHECKSUM_OFF) == 0);
2479219089Spjd		}
2480185029Spjd
2481219089Spjd		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2482219089Spjd		    nv, NULL);
2483185029Spjd		nvlist_free(nv);
2484185029Spjd	}
2485185029Spjd
2486185029Spjd	/* Allocate the space for the dump */
2487290746Smav	if (error == 0)
2488290746Smav		error = zvol_prealloc(zv);
2489185029Spjd	return (error);
2490185029Spjd}
2491185029Spjd
2492185029Spjdstatic int
2493185029Spjdzvol_dumpify(zvol_state_t *zv)
2494185029Spjd{
2495185029Spjd	int error = 0;
2496185029Spjd	uint64_t dumpsize = 0;
2497185029Spjd	dmu_tx_t *tx;
2498185029Spjd	objset_t *os = zv->zv_objset;
2499185029Spjd
2500219089Spjd	if (zv->zv_flags & ZVOL_RDONLY)
2501249195Smm		return (SET_ERROR(EROFS));
2502185029Spjd
2503185029Spjd	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
2504185029Spjd	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
2505248571Smm		boolean_t resize = (dumpsize > 0);
2506185029Spjd
2507185029Spjd		if ((error = zvol_dump_init(zv, resize)) != 0) {
2508185029Spjd			(void) zvol_dump_fini(zv);
2509185029Spjd			return (error);
2510185029Spjd		}
2511185029Spjd	}
2512185029Spjd
2513185029Spjd	/*
2514185029Spjd	 * Build up our lba mapping.
2515185029Spjd	 */
2516185029Spjd	error = zvol_get_lbas(zv);
2517185029Spjd	if (error) {
2518185029Spjd		(void) zvol_dump_fini(zv);
2519185029Spjd		return (error);
2520185029Spjd	}
2521185029Spjd
2522185029Spjd	tx = dmu_tx_create(os);
2523185029Spjd	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2524185029Spjd	error = dmu_tx_assign(tx, TXG_WAIT);
2525185029Spjd	if (error) {
2526185029Spjd		dmu_tx_abort(tx);
2527185029Spjd		(void) zvol_dump_fini(zv);
2528185029Spjd		return (error);
2529185029Spjd	}
2530185029Spjd
2531185029Spjd	zv->zv_flags |= ZVOL_DUMPIFIED;
2532185029Spjd	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
2533185029Spjd	    &zv->zv_volsize, tx);
2534185029Spjd	dmu_tx_commit(tx);
2535185029Spjd
2536185029Spjd	if (error) {
2537185029Spjd		(void) zvol_dump_fini(zv);
2538185029Spjd		return (error);
2539185029Spjd	}
2540185029Spjd
2541185029Spjd	txg_wait_synced(dmu_objset_pool(os), 0);
2542185029Spjd	return (0);
2543185029Spjd}
2544185029Spjd
2545185029Spjdstatic int
2546185029Spjdzvol_dump_fini(zvol_state_t *zv)
2547185029Spjd{
2548185029Spjd	dmu_tx_t *tx;
2549185029Spjd	objset_t *os = zv->zv_objset;
2550185029Spjd	nvlist_t *nv;
2551185029Spjd	int error = 0;
2552219089Spjd	uint64_t checksum, compress, refresrv, vbs, dedup;
2553219089Spjd	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
2554185029Spjd
2555185029Spjd	/*
2556185029Spjd	 * Attempt to restore the zvol back to its pre-dumpified state.
2557185029Spjd	 * This is a best-effort attempt as it's possible that not all
2558185029Spjd	 * of these properties were initialized during the dumpify process
2559185029Spjd	 * (i.e. error during zvol_dump_init).
2560185029Spjd	 */
2561185029Spjd
2562185029Spjd	tx = dmu_tx_create(os);
2563185029Spjd	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2564185029Spjd	error = dmu_tx_assign(tx, TXG_WAIT);
2565185029Spjd	if (error) {
2566185029Spjd		dmu_tx_abort(tx);
2567185029Spjd		return (error);
2568185029Spjd	}
2569185029Spjd	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
2570185029Spjd	dmu_tx_commit(tx);
2571185029Spjd
2572185029Spjd	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2573185029Spjd	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
2574185029Spjd	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2575185029Spjd	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
2576185029Spjd	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2577185029Spjd	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
2578208047Smm	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2579208047Smm	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
2580185029Spjd
2581185029Spjd	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2582185029Spjd	(void) nvlist_add_uint64(nv,
2583185029Spjd	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
2584185029Spjd	(void) nvlist_add_uint64(nv,
2585185029Spjd	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
2586185029Spjd	(void) nvlist_add_uint64(nv,
2587185029Spjd	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
2588219089Spjd	if (version >= SPA_VERSION_DEDUP &&
2589219089Spjd	    zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2590219089Spjd	    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
2591219089Spjd		(void) nvlist_add_uint64(nv,
2592219089Spjd		    zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
2593219089Spjd	}
2594219089Spjd	(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2595219089Spjd	    nv, NULL);
2596185029Spjd	nvlist_free(nv);
2597185029Spjd
2598185029Spjd	zvol_free_extents(zv);
2599185029Spjd	zv->zv_flags &= ~ZVOL_DUMPIFIED;
2600185029Spjd	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
2601219089Spjd	/* wait for dmu_free_long_range to actually free the blocks */
2602219089Spjd	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2603219089Spjd	tx = dmu_tx_create(os);
2604219089Spjd	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2605219089Spjd	error = dmu_tx_assign(tx, TXG_WAIT);
2606219089Spjd	if (error) {
2607219089Spjd		dmu_tx_abort(tx);
2608219089Spjd		return (error);
2609219089Spjd	}
2610219089Spjd	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
2611219089Spjd		zv->zv_volblocksize = vbs;
2612219089Spjd	dmu_tx_commit(tx);
2613185029Spjd
2614185029Spjd	return (0);
2615185029Spjd}
2616277483Ssmh#else	/* !illumos */
2617219089Spjd
2618219089Spjdstatic void
2619219089Spjdzvol_geom_run(zvol_state_t *zv)
2620219089Spjd{
2621219089Spjd	struct g_provider *pp;
2622219089Spjd
2623219089Spjd	pp = zv->zv_provider;
2624219089Spjd	g_error_provider(pp, 0);
2625219089Spjd
2626219089Spjd	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
2627219089Spjd	    "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
2628219089Spjd}
2629219089Spjd
2630219089Spjdstatic void
2631219089Spjdzvol_geom_destroy(zvol_state_t *zv)
2632219089Spjd{
2633219089Spjd	struct g_provider *pp;
2634219089Spjd
2635219089Spjd	g_topology_assert();
2636219089Spjd
2637219089Spjd	mtx_lock(&zv->zv_queue_mtx);
2638219089Spjd	zv->zv_state = 1;
2639219089Spjd	wakeup_one(&zv->zv_queue);
2640219089Spjd	while (zv->zv_state != 2)
2641219089Spjd		msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
2642219089Spjd	mtx_destroy(&zv->zv_queue_mtx);
2643219089Spjd
2644219089Spjd	pp = zv->zv_provider;
2645219089Spjd	zv->zv_provider = NULL;
2646219089Spjd	pp->private = NULL;
2647219089Spjd	g_wither_geom(pp->geom, ENXIO);
2648219089Spjd}
2649219089Spjd
2650219089Spjdstatic int
2651219089Spjdzvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
2652219089Spjd{
2653219089Spjd	int count, error, flags;
2654219089Spjd
2655219089Spjd	g_topology_assert();
2656219089Spjd
2657219089Spjd	/*
2658219089Spjd	 * To make it easier we expect either open or close, but not both
2659219089Spjd	 * at the same time.
2660219089Spjd	 */
2661219089Spjd	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
2662219089Spjd	    (acr <= 0 && acw <= 0 && ace <= 0),
2663219089Spjd	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
2664219089Spjd	    pp->name, acr, acw, ace));
2665219089Spjd
2666219089Spjd	if (pp->private == NULL) {
2667219089Spjd		if (acr <= 0 && acw <= 0 && ace <= 0)
2668219089Spjd			return (0);
2669219089Spjd		return (pp->error);
2670219089Spjd	}
2671219089Spjd
2672219089Spjd	/*
2673219089Spjd	 * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0,
2674219089Spjd	 * because GEOM already handles that and handles it a bit differently.
2675219089Spjd	 * GEOM allows for multiple read/exclusive consumers and ZFS allows
2676219089Spjd	 * only one exclusive consumer, no matter if it is reader or writer.
2677219089Spjd	 * I like better the way GEOM works so I'll leave it for GEOM to
2678219089Spjd	 * decide what to do.
2679219089Spjd	 */
2680219089Spjd
2681219089Spjd	count = acr + acw + ace;
2682219089Spjd	if (count == 0)
2683219089Spjd		return (0);
2684219089Spjd
2685219089Spjd	flags = 0;
2686219089Spjd	if (acr != 0 || ace != 0)
2687219089Spjd		flags |= FREAD;
2688219089Spjd	if (acw != 0)
2689219089Spjd		flags |= FWRITE;
2690219089Spjd
2691219089Spjd	g_topology_unlock();
2692219089Spjd	if (count > 0)
2693219089Spjd		error = zvol_open(pp, flags, count);
2694219089Spjd	else
2695219089Spjd		error = zvol_close(pp, flags, -count);
2696219089Spjd	g_topology_lock();
2697219089Spjd	return (error);
2698219089Spjd}
2699219089Spjd
2700219089Spjdstatic void
2701219089Spjdzvol_geom_start(struct bio *bp)
2702219089Spjd{
2703219089Spjd	zvol_state_t *zv;
2704219089Spjd	boolean_t first;
2705219089Spjd
2706260385Sscottl	zv = bp->bio_to->private;
2707260385Sscottl	ASSERT(zv != NULL);
2708219089Spjd	switch (bp->bio_cmd) {
2709260385Sscottl	case BIO_FLUSH:
2710260385Sscottl		if (!THREAD_CAN_SLEEP())
2711260385Sscottl			goto enqueue;
2712260385Sscottl		zil_commit(zv->zv_zilog, ZVOL_OBJ);
2713260385Sscottl		g_io_deliver(bp, 0);
2714260385Sscottl		break;
2715219089Spjd	case BIO_READ:
2716219089Spjd	case BIO_WRITE:
2717264732Smav	case BIO_DELETE:
2718260385Sscottl		if (!THREAD_CAN_SLEEP())
2719260385Sscottl			goto enqueue;
2720260385Sscottl		zvol_strategy(bp);
2721219089Spjd		break;
2722274732Smav	case BIO_GETATTR: {
2723274732Smav		spa_t *spa = dmu_objset_spa(zv->zv_objset);
2724274732Smav		uint64_t refd, avail, usedobjs, availobjs, val;
2725274732Smav
2726264733Smav		if (g_handleattr_int(bp, "GEOM::candelete", 1))
2727264733Smav			return;
2728274732Smav		if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
2729274732Smav			dmu_objset_space(zv->zv_objset, &refd, &avail,
2730274732Smav			    &usedobjs, &availobjs);
2731274732Smav			if (g_handleattr_off_t(bp, "blocksavail",
2732274732Smav			    avail / DEV_BSIZE))
2733274732Smav				return;
2734274732Smav		} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
2735274732Smav			dmu_objset_space(zv->zv_objset, &refd, &avail,
2736274732Smav			    &usedobjs, &availobjs);
2737274732Smav			if (g_handleattr_off_t(bp, "blocksused",
2738274732Smav			    refd / DEV_BSIZE))
2739274732Smav				return;
2740274732Smav		} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
2741274732Smav			avail = metaslab_class_get_space(spa_normal_class(spa));
2742274732Smav			avail -= metaslab_class_get_alloc(spa_normal_class(spa));
2743274732Smav			if (g_handleattr_off_t(bp, "poolblocksavail",
2744274732Smav			    avail / DEV_BSIZE))
2745274732Smav				return;
2746274732Smav		} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
2747274732Smav			refd = metaslab_class_get_alloc(spa_normal_class(spa));
2748274732Smav			if (g_handleattr_off_t(bp, "poolblocksused",
2749274732Smav			    refd / DEV_BSIZE))
2750274732Smav				return;
2751274732Smav		}
2752264733Smav		/* FALLTHROUGH */
2753274732Smav	}
2754219089Spjd	default:
2755219089Spjd		g_io_deliver(bp, EOPNOTSUPP);
2756219089Spjd		break;
2757219089Spjd	}
2758260385Sscottl	return;
2759260385Sscottl
2760260385Sscottlenqueue:
2761260385Sscottl	mtx_lock(&zv->zv_queue_mtx);
2762260385Sscottl	first = (bioq_first(&zv->zv_queue) == NULL);
2763260385Sscottl	bioq_insert_tail(&zv->zv_queue, bp);
2764260385Sscottl	mtx_unlock(&zv->zv_queue_mtx);
2765260385Sscottl	if (first)
2766260385Sscottl		wakeup_one(&zv->zv_queue);
2767219089Spjd}
2768219089Spjd
2769219089Spjdstatic void
2770219089Spjdzvol_geom_worker(void *arg)
2771219089Spjd{
2772219089Spjd	zvol_state_t *zv;
2773219089Spjd	struct bio *bp;
2774219089Spjd
2775219089Spjd	thread_lock(curthread);
2776219089Spjd	sched_prio(curthread, PRIBIO);
2777219089Spjd	thread_unlock(curthread);
2778219089Spjd
2779219089Spjd	zv = arg;
2780219089Spjd	for (;;) {
2781219089Spjd		mtx_lock(&zv->zv_queue_mtx);
2782219089Spjd		bp = bioq_takefirst(&zv->zv_queue);
2783219089Spjd		if (bp == NULL) {
2784219089Spjd			if (zv->zv_state == 1) {
2785219089Spjd				zv->zv_state = 2;
2786219089Spjd				wakeup(&zv->zv_state);
2787219089Spjd				mtx_unlock(&zv->zv_queue_mtx);
2788219089Spjd				kthread_exit();
2789219089Spjd			}
2790219089Spjd			msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
2791219089Spjd			    "zvol:io", 0);
2792219089Spjd			continue;
2793219089Spjd		}
2794219089Spjd		mtx_unlock(&zv->zv_queue_mtx);
2795219089Spjd		switch (bp->bio_cmd) {
2796219089Spjd		case BIO_FLUSH:
2797219089Spjd			zil_commit(zv->zv_zilog, ZVOL_OBJ);
2798219089Spjd			g_io_deliver(bp, 0);
2799219089Spjd			break;
2800219089Spjd		case BIO_READ:
2801219089Spjd		case BIO_WRITE:
2802288520Smav		case BIO_DELETE:
2803219089Spjd			zvol_strategy(bp);
2804219089Spjd			break;
2805288520Smav		default:
2806288520Smav			g_io_deliver(bp, EOPNOTSUPP);
2807288520Smav			break;
2808219089Spjd		}
2809219089Spjd	}
2810219089Spjd}
2811219089Spjd
2812219089Spjdextern boolean_t dataset_name_hidden(const char *name);
2813219089Spjd
2814219089Spjdstatic int
2815219089Spjdzvol_create_snapshots(objset_t *os, const char *name)
2816219089Spjd{
2817219089Spjd	uint64_t cookie, obj;
2818219089Spjd	char *sname;
2819219089Spjd	int error, len;
2820219089Spjd
2821219089Spjd	cookie = obj = 0;
2822219089Spjd	sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2823219089Spjd
2824248571Smm#if 0
2825219089Spjd	(void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
2826219089Spjd	    DS_FIND_SNAPSHOTS);
2827248571Smm#endif
2828219089Spjd
2829219089Spjd	for (;;) {
2830219089Spjd		len = snprintf(sname, MAXPATHLEN, "%s@", name);
2831219089Spjd		if (len >= MAXPATHLEN) {
2832219089Spjd			dmu_objset_rele(os, FTAG);
2833219089Spjd			error = ENAMETOOLONG;
2834219089Spjd			break;
2835219089Spjd		}
2836219089Spjd
2837248976Smm		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
2838219089Spjd		error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
2839219089Spjd		    sname + len, &obj, &cookie, NULL);
2840248976Smm		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
2841219089Spjd		if (error != 0) {
2842219089Spjd			if (error == ENOENT)
2843219089Spjd				error = 0;
2844219089Spjd			break;
2845219089Spjd		}
2846219089Spjd
2847297546Smav		error = zvol_create_minor(sname);
2848297546Smav		if (error != 0 && error != EEXIST) {
2849219089Spjd			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
2850219089Spjd			    sname, error);
2851219089Spjd			break;
2852219089Spjd		}
2853219089Spjd	}
2854219089Spjd
2855219089Spjd	kmem_free(sname, MAXPATHLEN);
2856219089Spjd	return (error);
2857219089Spjd}
2858219089Spjd
2859219089Spjdint
2860219089Spjdzvol_create_minors(const char *name)
2861219089Spjd{
2862219089Spjd	uint64_t cookie;
2863219089Spjd	objset_t *os;
2864219089Spjd	char *osname, *p;
2865219089Spjd	int error, len;
2866219089Spjd
2867219089Spjd	if (dataset_name_hidden(name))
2868219089Spjd		return (0);
2869219089Spjd
2870219089Spjd	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
2871219089Spjd		printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
2872219089Spjd		    name, error);
2873219089Spjd		return (error);
2874219089Spjd	}
2875219089Spjd	if (dmu_objset_type(os) == DMU_OST_ZVOL) {
2876248571Smm		dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
2877248571Smm		dsl_pool_rele(dmu_objset_pool(os), FTAG);
2878272883Ssmh		error = zvol_create_minor(name);
2879272883Ssmh		if (error == 0 || error == EEXIST) {
2880219089Spjd			error = zvol_create_snapshots(os, name);
2881272883Ssmh		} else {
2882219089Spjd			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
2883219089Spjd			    name, error);
2884219089Spjd		}
2885248571Smm		dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
2886248571Smm		dsl_dataset_rele(os->os_dsl_dataset, FTAG);
2887219089Spjd		return (error);
2888219089Spjd	}
2889219089Spjd	if (dmu_objset_type(os) != DMU_OST_ZFS) {
2890219089Spjd		dmu_objset_rele(os, FTAG);
2891219089Spjd		return (0);
2892219089Spjd	}
2893219089Spjd
2894219089Spjd	osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2895219089Spjd	if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
2896219089Spjd		dmu_objset_rele(os, FTAG);
2897219089Spjd		kmem_free(osname, MAXPATHLEN);
2898219089Spjd		return (ENOENT);
2899219089Spjd	}
2900219089Spjd	p = osname + strlen(osname);
2901219089Spjd	len = MAXPATHLEN - (p - osname);
2902219089Spjd
2903248571Smm#if 0
2904224855Smm	/* Prefetch the datasets. */
2905224855Smm	cookie = 0;
2906224855Smm	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
2907224855Smm		if (!dataset_name_hidden(osname))
2908224855Smm			(void) dmu_objset_prefetch(osname, NULL);
2909219089Spjd	}
2910248571Smm#endif
2911219089Spjd
2912219089Spjd	cookie = 0;
2913219089Spjd	while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
2914219089Spjd	    &cookie) == 0) {
2915219089Spjd		dmu_objset_rele(os, FTAG);
2916219089Spjd		(void)zvol_create_minors(osname);
2917219089Spjd		if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
2918219089Spjd			printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
2919219089Spjd			    name, error);
2920219089Spjd			return (error);
2921219089Spjd		}
2922219089Spjd	}
2923219089Spjd
2924219089Spjd	dmu_objset_rele(os, FTAG);
2925219089Spjd	kmem_free(osname, MAXPATHLEN);
2926219089Spjd	return (0);
2927219089Spjd}
2928219317Spjd
2929219317Spjdstatic void
2930265678Smavzvol_rename_minor(zvol_state_t *zv, const char *newname)
2931219317Spjd{
2932265678Smav	struct g_geom *gp;
2933219317Spjd	struct g_provider *pp;
2934265678Smav	struct cdev *dev;
2935219317Spjd
2936277483Ssmh	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
2937219317Spjd
2938265678Smav	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
2939265678Smav		g_topology_lock();
2940265678Smav		pp = zv->zv_provider;
2941265678Smav		ASSERT(pp != NULL);
2942265678Smav		gp = pp->geom;
2943265678Smav		ASSERT(gp != NULL);
2944219317Spjd
2945265678Smav		zv->zv_provider = NULL;
2946265678Smav		g_wither_provider(pp, ENXIO);
2947219317Spjd
2948265678Smav		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
2949265678Smav		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
2950265678Smav		pp->sectorsize = DEV_BSIZE;
2951265678Smav		pp->mediasize = zv->zv_volsize;
2952265678Smav		pp->private = zv;
2953265678Smav		zv->zv_provider = pp;
2954265678Smav		g_error_provider(pp, 0);
2955265678Smav		g_topology_unlock();
2956265678Smav	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
2957297548Smav		struct make_dev_args args;
2958297548Smav
2959308448Smav		if ((dev = zv->zv_dev) != NULL) {
2960308448Smav			zv->zv_dev = NULL;
2961308448Smav			destroy_dev(dev);
2962308448Smav			if (zv->zv_total_opens > 0) {
2963308448Smav				zv->zv_flags &= ~ZVOL_EXCL;
2964308448Smav				zv->zv_total_opens = 0;
2965308448Smav				zvol_last_close(zv);
2966308448Smav			}
2967297549Smav		}
2968265678Smav
2969297548Smav		make_dev_args_init(&args);
2970297548Smav		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
2971297548Smav		args.mda_devsw = &zvol_cdevsw;
2972297548Smav		args.mda_cr = NULL;
2973297548Smav		args.mda_uid = UID_ROOT;
2974297548Smav		args.mda_gid = GID_OPERATOR;
2975297548Smav		args.mda_mode = 0640;
2976297548Smav		args.mda_si_drv2 = zv;
2977297548Smav		if (make_dev_s(&args, &zv->zv_dev,
2978297548Smav		    "%s/%s", ZVOL_DRIVER, newname) == 0)
2979297548Smav			zv->zv_dev->si_iosize_max = MAXPHYS;
2980265678Smav	}
2981219317Spjd	strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
2982219317Spjd}
2983219317Spjd
2984219317Spjdvoid
2985219317Spjdzvol_rename_minors(const char *oldname, const char *newname)
2986219317Spjd{
2987219317Spjd	char name[MAXPATHLEN];
2988219317Spjd	struct g_provider *pp;
2989219317Spjd	struct g_geom *gp;
2990219317Spjd	size_t oldnamelen, newnamelen;
2991219317Spjd	zvol_state_t *zv;
2992219317Spjd	char *namebuf;
2993272883Ssmh	boolean_t locked = B_FALSE;
2994219317Spjd
2995219317Spjd	oldnamelen = strlen(oldname);
2996219317Spjd	newnamelen = strlen(newname);
2997219317Spjd
2998219317Spjd	DROP_GIANT();
2999272883Ssmh	/* See comment in zvol_open(). */
3000277483Ssmh	if (!MUTEX_HELD(&zfsdev_state_lock)) {
3001277483Ssmh		mutex_enter(&zfsdev_state_lock);
3002272883Ssmh		locked = B_TRUE;
3003272883Ssmh	}
3004219317Spjd
3005265678Smav	LIST_FOREACH(zv, &all_zvols, zv_links) {
3006219317Spjd		if (strcmp(zv->zv_name, oldname) == 0) {
3007265678Smav			zvol_rename_minor(zv, newname);
3008219317Spjd		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
3009219317Spjd		    (zv->zv_name[oldnamelen] == '/' ||
3010219317Spjd		     zv->zv_name[oldnamelen] == '@')) {
3011219317Spjd			snprintf(name, sizeof(name), "%s%c%s", newname,
3012219317Spjd			    zv->zv_name[oldnamelen],
3013219317Spjd			    zv->zv_name + oldnamelen + 1);
3014265678Smav			zvol_rename_minor(zv, name);
3015219317Spjd		}
3016219317Spjd	}
3017219317Spjd
3018272883Ssmh	if (locked)
3019277483Ssmh		mutex_exit(&zfsdev_state_lock);
3020219317Spjd	PICKUP_GIANT();
3021219317Spjd}
3022265678Smav
3023265678Smavstatic int
3024265678Smavzvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td)
3025265678Smav{
3026297548Smav	zvol_state_t *zv = dev->si_drv2;
3027265678Smav	int err = 0;
3028265678Smav
3029277483Ssmh	mutex_enter(&zfsdev_state_lock);
3030265678Smav	if (zv->zv_total_opens == 0)
3031265678Smav		err = zvol_first_open(zv);
3032265678Smav	if (err) {
3033277483Ssmh		mutex_exit(&zfsdev_state_lock);
3034265678Smav		return (err);
3035265678Smav	}
3036265678Smav	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
3037265678Smav		err = SET_ERROR(EROFS);
3038265678Smav		goto out;
3039265678Smav	}
3040265678Smav	if (zv->zv_flags & ZVOL_EXCL) {
3041265678Smav		err = SET_ERROR(EBUSY);
3042265678Smav		goto out;
3043265678Smav	}
3044265678Smav#ifdef FEXCL
3045265678Smav	if (flags & FEXCL) {
3046265678Smav		if (zv->zv_total_opens != 0) {
3047265678Smav			err = SET_ERROR(EBUSY);
3048265678Smav			goto out;
3049265678Smav		}
3050265678Smav		zv->zv_flags |= ZVOL_EXCL;
3051265678Smav	}
3052265678Smav#endif
3053265678Smav
3054265678Smav	zv->zv_total_opens++;
3055308596Smav	if (flags & (FSYNC | FDSYNC)) {
3056308596Smav		zv->zv_sync_cnt++;
3057308596Smav		if (zv->zv_sync_cnt == 1)
3058308596Smav			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
3059308596Smav	}
3060277483Ssmh	mutex_exit(&zfsdev_state_lock);
3061265678Smav	return (err);
3062265678Smavout:
3063265678Smav	if (zv->zv_total_opens == 0)
3064265678Smav		zvol_last_close(zv);
3065277483Ssmh	mutex_exit(&zfsdev_state_lock);
3066265678Smav	return (err);
3067265678Smav}
3068265678Smav
3069265678Smavstatic int
3070265678Smavzvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td)
3071265678Smav{
3072297548Smav	zvol_state_t *zv = dev->si_drv2;
3073265678Smav
3074277483Ssmh	mutex_enter(&zfsdev_state_lock);
3075265678Smav	if (zv->zv_flags & ZVOL_EXCL) {
3076265678Smav		ASSERT(zv->zv_total_opens == 1);
3077265678Smav		zv->zv_flags &= ~ZVOL_EXCL;
3078265678Smav	}
3079265678Smav
3080265678Smav	/*
3081265678Smav	 * If the open count is zero, this is a spurious close.
3082265678Smav	 * That indicates a bug in the kernel / DDI framework.
3083265678Smav	 */
3084265678Smav	ASSERT(zv->zv_total_opens != 0);
3085265678Smav
3086265678Smav	/*
3087265678Smav	 * You may get multiple opens, but only one close.
3088265678Smav	 */
3089265678Smav	zv->zv_total_opens--;
3090308596Smav	if (flags & (FSYNC | FDSYNC))
3091308596Smav		zv->zv_sync_cnt--;
3092265678Smav
3093265678Smav	if (zv->zv_total_opens == 0)
3094265678Smav		zvol_last_close(zv);
3095265678Smav
3096277483Ssmh	mutex_exit(&zfsdev_state_lock);
3097265678Smav	return (0);
3098265678Smav}
3099265678Smav
3100265678Smavstatic int
3101265678Smavzvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
3102265678Smav{
3103265678Smav	zvol_state_t *zv;
3104265678Smav	rl_t *rl;
3105308594Smav	off_t offset, length;
3106265678Smav	int i, error;
3107308594Smav	boolean_t sync;
3108265678Smav
3109265678Smav	zv = dev->si_drv2;
3110265678Smav
3111265678Smav	error = 0;
3112265678Smav	KASSERT(zv->zv_total_opens > 0,
3113265678Smav	    ("Device with zero access count in zvol_d_ioctl"));
3114265678Smav
3115265678Smav	i = IOCPARM_LEN(cmd);
3116265678Smav	switch (cmd) {
3117265678Smav	case DIOCGSECTORSIZE:
3118265678Smav		*(u_int *)data = DEV_BSIZE;
3119265678Smav		break;
3120265678Smav	case DIOCGMEDIASIZE:
3121265678Smav		*(off_t *)data = zv->zv_volsize;
3122265678Smav		break;
3123265678Smav	case DIOCGFLUSH:
3124265678Smav		zil_commit(zv->zv_zilog, ZVOL_OBJ);
3125265678Smav		break;
3126265678Smav	case DIOCGDELETE:
3127273345Sdelphij		if (!zvol_unmap_enabled)
3128273345Sdelphij			break;
3129273345Sdelphij
3130265678Smav		offset = ((off_t *)data)[0];
3131265678Smav		length = ((off_t *)data)[1];
3132265678Smav		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
3133265678Smav		    offset < 0 || offset >= zv->zv_volsize ||
3134265678Smav		    length <= 0) {
3135265678Smav			printf("%s: offset=%jd length=%jd\n", __func__, offset,
3136265678Smav			    length);
3137265678Smav			error = EINVAL;
3138265678Smav			break;
3139265678Smav		}
3140265678Smav
3141265678Smav		rl = zfs_range_lock(&zv->zv_znode, offset, length, RL_WRITER);
3142265678Smav		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
3143265678Smav		error = dmu_tx_assign(tx, TXG_WAIT);
3144265678Smav		if (error != 0) {
3145308594Smav			sync = FALSE;
3146265678Smav			dmu_tx_abort(tx);
3147265678Smav		} else {
3148308594Smav			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
3149308594Smav			zvol_log_truncate(zv, tx, offset, length, sync);
3150265678Smav			dmu_tx_commit(tx);
3151265678Smav			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
3152265678Smav			    offset, length);
3153265678Smav		}
3154265678Smav		zfs_range_unlock(rl);
3155308594Smav		if (sync)
3156265678Smav			zil_commit(zv->zv_zilog, ZVOL_OBJ);
3157265678Smav		break;
3158265678Smav	case DIOCGSTRIPESIZE:
3159265678Smav		*(off_t *)data = zv->zv_volblocksize;
3160265678Smav		break;
3161265678Smav	case DIOCGSTRIPEOFFSET:
3162265678Smav		*(off_t *)data = 0;
3163265678Smav		break;
3164274732Smav	case DIOCGATTR: {
3165274732Smav		spa_t *spa = dmu_objset_spa(zv->zv_objset);
3166274732Smav		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
3167274732Smav		uint64_t refd, avail, usedobjs, availobjs;
3168274732Smav
3169280753Smav		if (strcmp(arg->name, "GEOM::candelete") == 0)
3170280753Smav			arg->value.i = 1;
3171280753Smav		else if (strcmp(arg->name, "blocksavail") == 0) {
3172274732Smav			dmu_objset_space(zv->zv_objset, &refd, &avail,
3173274732Smav			    &usedobjs, &availobjs);
3174274732Smav			arg->value.off = avail / DEV_BSIZE;
3175274732Smav		} else if (strcmp(arg->name, "blocksused") == 0) {
3176274732Smav			dmu_objset_space(zv->zv_objset, &refd, &avail,
3177274732Smav			    &usedobjs, &availobjs);
3178274732Smav			arg->value.off = refd / DEV_BSIZE;
3179274732Smav		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
3180274732Smav			avail = metaslab_class_get_space(spa_normal_class(spa));
3181274732Smav			avail -= metaslab_class_get_alloc(spa_normal_class(spa));
3182274732Smav			arg->value.off = avail / DEV_BSIZE;
3183274732Smav		} else if (strcmp(arg->name, "poolblocksused") == 0) {
3184274732Smav			refd = metaslab_class_get_alloc(spa_normal_class(spa));
3185274732Smav			arg->value.off = refd / DEV_BSIZE;
3186274732Smav		} else
3187274732Smav			error = ENOIOCTL;
3188274732Smav		break;
3189274732Smav	}
3190275892Smav	case FIOSEEKHOLE:
3191275892Smav	case FIOSEEKDATA: {
3192275892Smav		off_t *off = (off_t *)data;
3193275892Smav		uint64_t noff;
3194275892Smav		boolean_t hole;
3195275892Smav
3196275892Smav		hole = (cmd == FIOSEEKHOLE);
3197275892Smav		noff = *off;
3198275892Smav		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
3199275892Smav		*off = noff;
3200275892Smav		break;
3201275892Smav	}
3202265678Smav	default:
3203265678Smav		error = ENOIOCTL;
3204265678Smav	}
3205265678Smav
3206265678Smav	return (error);
3207265678Smav}
3208277483Ssmh#endif	/* illumos */
3209