1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23219089Spjd *
24219089Spjd * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
25168404Spjd * All rights reserved.
26249195Smm * Copyright (c) 2013 by Delphix. All rights reserved.
27255750Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved.
28168404Spjd */
29168404Spjd
30219089Spjd/* Portions Copyright 2010 Robert Milkowski */
31226724Smm/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
32219089Spjd
33168404Spjd/*
34168404Spjd * ZFS volume emulation driver.
35168404Spjd *
36168404Spjd * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
37168404Spjd * Volumes are accessed through the symbolic links named:
38168404Spjd *
39168404Spjd * /dev/zvol/dsk/<pool_name>/<dataset_name>
40168404Spjd * /dev/zvol/rdsk/<pool_name>/<dataset_name>
41168404Spjd *
42219089Spjd * These links are created by the /dev filesystem (sdev_zvolops.c).
43168404Spjd * Volumes are persistent through reboot.  No user command needs to be
44168404Spjd * run before opening and using a device.
45219089Spjd *
46219089Spjd * FreeBSD notes.
47219089Spjd * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
48219089Spjd * in the system.
49168404Spjd */
50168404Spjd
51168962Spjd#include <sys/types.h>
52168404Spjd#include <sys/param.h>
53168404Spjd#include <sys/kernel.h>
54168404Spjd#include <sys/errno.h>
55168404Spjd#include <sys/uio.h>
56168404Spjd#include <sys/bio.h>
57168962Spjd#include <sys/buf.h>
58168404Spjd#include <sys/kmem.h>
59168404Spjd#include <sys/conf.h>
60168404Spjd#include <sys/cmn_err.h>
61168404Spjd#include <sys/stat.h>
62168404Spjd#include <sys/zap.h>
63168404Spjd#include <sys/spa.h>
64255750Sdelphij#include <sys/spa_impl.h>
65168404Spjd#include <sys/zio.h>
66185029Spjd#include <sys/dmu_traverse.h>
67185029Spjd#include <sys/dnode.h>
68185029Spjd#include <sys/dsl_dataset.h>
69168404Spjd#include <sys/dsl_prop.h>
70168962Spjd#include <sys/dkio.h>
71168404Spjd#include <sys/byteorder.h>
72168962Spjd#include <sys/sunddi.h>
73168404Spjd#include <sys/dirent.h>
74168962Spjd#include <sys/policy.h>
75168404Spjd#include <sys/fs/zfs.h>
76168404Spjd#include <sys/zfs_ioctl.h>
77168404Spjd#include <sys/zil.h>
78168404Spjd#include <sys/refcount.h>
79168404Spjd#include <sys/zfs_znode.h>
80168404Spjd#include <sys/zfs_rlock.h>
81185029Spjd#include <sys/vdev_impl.h>
82255750Sdelphij#include <sys/vdev_raidz.h>
83185029Spjd#include <sys/zvol.h>
84209962Smm#include <sys/zil_impl.h>
85243524Smm#include <sys/dbuf.h>
86255750Sdelphij#include <sys/dmu_tx.h>
87255750Sdelphij#include <sys/zfeature.h>
88255750Sdelphij#include <sys/zio_checksum.h>
89255750Sdelphij
90168404Spjd#include <geom/geom.h>
91168404Spjd
92168404Spjd#include "zfs_namecheck.h"
93168404Spjd
94168404Spjdstruct g_class zfs_zvol_class = {
95168404Spjd	.name = "ZFS::ZVOL",
96168404Spjd	.version = G_VERSION,
97168404Spjd};
98168404Spjd
99168404SpjdDECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
100168404Spjd
101219089Spjdvoid *zfsdev_state;
102219089Spjdstatic char *zvol_tag = "zvol_tag";
103219089Spjd
104219089Spjd#define	ZVOL_DUMPSIZE		"dumpsize"
105219089Spjd
106185029Spjd/*
107224791Spjd * The spa_namespace_lock protects the zfsdev_state structure from being
108224791Spjd * modified while it's being used, e.g. an open that comes in before a
109224791Spjd * create finishes.  It also protects temporary opens of the dataset so that,
110185029Spjd * e.g., an open doesn't get a spurious EBUSY.
111185029Spjd */
112168404Spjdstatic uint32_t zvol_minors;
113168404Spjd
114185029Spjdtypedef struct zvol_extent {
115208047Smm	list_node_t	ze_node;
116185029Spjd	dva_t		ze_dva;		/* dva associated with this extent */
117208047Smm	uint64_t	ze_nblks;	/* number of blocks in extent */
118185029Spjd} zvol_extent_t;
119185029Spjd
120168404Spjd/*
121168404Spjd * The in-core state of each volume.
122168404Spjd */
123168404Spjdtypedef struct zvol_state {
124168404Spjd	char		zv_name[MAXPATHLEN]; /* pool/dd name */
125168404Spjd	uint64_t	zv_volsize;	/* amount of space we advertise */
126168404Spjd	uint64_t	zv_volblocksize; /* volume block size */
127168404Spjd	struct g_provider *zv_provider;	/* GEOM provider */
128168404Spjd	uint8_t		zv_min_bs;	/* minimum addressable block shift */
129219089Spjd	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
130168404Spjd	objset_t	*zv_objset;	/* objset handle */
131168404Spjd	uint32_t	zv_total_opens;	/* total open count */
132168404Spjd	zilog_t		*zv_zilog;	/* ZIL handle */
133208047Smm	list_t		zv_extents;	/* List of extents for dump */
134168404Spjd	znode_t		zv_znode;	/* for range locking */
135219089Spjd	dmu_buf_t	*zv_dbuf;	/* bonus handle */
136168404Spjd	int		zv_state;
137168404Spjd	struct bio_queue_head zv_queue;
138168404Spjd	struct mtx	zv_queue_mtx;	/* zv_queue mutex */
139168404Spjd} zvol_state_t;
140168404Spjd
141168404Spjd/*
142185029Spjd * zvol specific flags
143185029Spjd */
144185029Spjd#define	ZVOL_RDONLY	0x1
145185029Spjd#define	ZVOL_DUMPIFIED	0x2
146185029Spjd#define	ZVOL_EXCL	0x4
147219089Spjd#define	ZVOL_WCE	0x8
148185029Spjd
149185029Spjd/*
150168404Spjd * zvol maximum transfer in one DMU tx.
151168404Spjd */
152168404Spjdint zvol_maxphys = DMU_MAX_ACCESS/2;
153168404Spjd
154219089Spjdextern int zfs_set_prop_nvlist(const char *, zprop_source_t,
155248571Smm    nvlist_t *, nvlist_t *);
156219089Spjdstatic int zvol_remove_zv(zvol_state_t *);
157168404Spjdstatic int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
158185029Spjdstatic int zvol_dumpify(zvol_state_t *zv);
159185029Spjdstatic int zvol_dump_fini(zvol_state_t *zv);
160185029Spjdstatic int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
161168404Spjd
162219089Spjdstatic zvol_state_t *zvol_geom_create(const char *name);
163219089Spjdstatic void zvol_geom_run(zvol_state_t *zv);
164219089Spjdstatic void zvol_geom_destroy(zvol_state_t *zv);
165219089Spjdstatic int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
166219089Spjdstatic void zvol_geom_start(struct bio *bp);
167219089Spjdstatic void zvol_geom_worker(void *arg);
168219089Spjd
169185029Spjdstatic void
170219089Spjdzvol_size_changed(zvol_state_t *zv)
171185029Spjd{
172219089Spjd#ifdef sun
173219089Spjd	dev_t dev = makedevice(maj, min);
174219089Spjd
175219089Spjd	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
176219089Spjd	    "Size", volsize) == DDI_SUCCESS);
177219089Spjd	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
178219089Spjd	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
179219089Spjd
180219089Spjd	/* Notify specfs to invalidate the cached size */
181219089Spjd	spec_size_invalidate(dev, VBLK);
182219089Spjd	spec_size_invalidate(dev, VCHR);
183219089Spjd#else	/* !sun */
184196927Spjd	struct g_provider *pp;
185185029Spjd
186196927Spjd	pp = zv->zv_provider;
187196927Spjd	if (pp == NULL)
188196927Spjd		return;
189238656Strasz	g_topology_lock();
190238656Strasz	g_resize_provider(pp, zv->zv_volsize);
191238656Strasz	g_topology_unlock();
192219089Spjd#endif	/* !sun */
193185029Spjd}
194185029Spjd
195168404Spjdint
196168404Spjdzvol_check_volsize(uint64_t volsize, uint64_t blocksize)
197168404Spjd{
198168404Spjd	if (volsize == 0)
199249195Smm		return (SET_ERROR(EINVAL));
200168404Spjd
201168404Spjd	if (volsize % blocksize != 0)
202249195Smm		return (SET_ERROR(EINVAL));
203168404Spjd
204168404Spjd#ifdef _ILP32
205168404Spjd	if (volsize - 1 > SPEC_MAXOFFSET_T)
206249195Smm		return (SET_ERROR(EOVERFLOW));
207168404Spjd#endif
208168404Spjd	return (0);
209168404Spjd}
210168404Spjd
211168404Spjdint
212168404Spjdzvol_check_volblocksize(uint64_t volblocksize)
213168404Spjd{
214168404Spjd	if (volblocksize < SPA_MINBLOCKSIZE ||
215168404Spjd	    volblocksize > SPA_MAXBLOCKSIZE ||
216168404Spjd	    !ISP2(volblocksize))
217249195Smm		return (SET_ERROR(EDOM));
218168404Spjd
219168404Spjd	return (0);
220168404Spjd}
221168404Spjd
222168404Spjdint
223168404Spjdzvol_get_stats(objset_t *os, nvlist_t *nv)
224168404Spjd{
225168404Spjd	int error;
226168404Spjd	dmu_object_info_t doi;
227168404Spjd	uint64_t val;
228168404Spjd
229168404Spjd	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
230168404Spjd	if (error)
231168404Spjd		return (error);
232168404Spjd
233168404Spjd	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
234168404Spjd
235168404Spjd	error = dmu_object_info(os, ZVOL_OBJ, &doi);
236168404Spjd
237168404Spjd	if (error == 0) {
238168404Spjd		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
239168404Spjd		    doi.doi_data_block_size);
240168404Spjd	}
241168404Spjd
242168404Spjd	return (error);
243168404Spjd}
244168404Spjd
245168404Spjdstatic zvol_state_t *
246168404Spjdzvol_minor_lookup(const char *name)
247168404Spjd{
248168404Spjd	struct g_provider *pp;
249168404Spjd	struct g_geom *gp;
250219089Spjd	zvol_state_t *zv = NULL;
251168404Spjd
252224791Spjd	ASSERT(MUTEX_HELD(&spa_namespace_lock));
253168404Spjd
254219089Spjd	g_topology_lock();
255168404Spjd	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
256219089Spjd		pp = LIST_FIRST(&gp->provider);
257219089Spjd		if (pp == NULL)
258219089Spjd			continue;
259219089Spjd		zv = pp->private;
260219089Spjd		if (zv == NULL)
261219089Spjd			continue;
262219089Spjd		if (strcmp(zv->zv_name, name) == 0)
263200126Spjd			break;
264168404Spjd	}
265219089Spjd	g_topology_unlock();
266168404Spjd
267219089Spjd	return (gp != NULL ? zv : NULL);
268168404Spjd}
269168404Spjd
270185029Spjd/* extent mapping arg */
271185029Spjdstruct maparg {
272208047Smm	zvol_state_t	*ma_zv;
273208047Smm	uint64_t	ma_blks;
274185029Spjd};
275185029Spjd
276185029Spjd/*ARGSUSED*/
277185029Spjdstatic int
278246666Smmzvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
279219089Spjd    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
280185029Spjd{
281208047Smm	struct maparg *ma = arg;
282208047Smm	zvol_extent_t *ze;
283208047Smm	int bs = ma->ma_zv->zv_volblocksize;
284185029Spjd
285208047Smm	if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
286208047Smm		return (0);
287185029Spjd
288208047Smm	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
289208047Smm	ma->ma_blks++;
290185029Spjd
291208047Smm	/* Abort immediately if we have encountered gang blocks */
292208047Smm	if (BP_IS_GANG(bp))
293249195Smm		return (SET_ERROR(EFRAGS));
294185029Spjd
295208047Smm	/*
296208047Smm	 * See if the block is at the end of the previous extent.
297208047Smm	 */
298208047Smm	ze = list_tail(&ma->ma_zv->zv_extents);
299208047Smm	if (ze &&
300208047Smm	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
301208047Smm	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
302208047Smm	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
303208047Smm		ze->ze_nblks++;
304208047Smm		return (0);
305185029Spjd	}
306185029Spjd
307208047Smm	dprintf_bp(bp, "%s", "next blkptr:");
308185029Spjd
309208047Smm	/* start a new extent */
310208047Smm	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
311208047Smm	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
312208047Smm	ze->ze_nblks = 1;
313208047Smm	list_insert_tail(&ma->ma_zv->zv_extents, ze);
314208047Smm	return (0);
315208047Smm}
316185029Spjd
317208047Smmstatic void
318208047Smmzvol_free_extents(zvol_state_t *zv)
319208047Smm{
320208047Smm	zvol_extent_t *ze;
321185029Spjd
322208047Smm	while (ze = list_head(&zv->zv_extents)) {
323208047Smm		list_remove(&zv->zv_extents, ze);
324208047Smm		kmem_free(ze, sizeof (zvol_extent_t));
325185029Spjd	}
326208047Smm}
327185029Spjd
328208047Smmstatic int
329208047Smmzvol_get_lbas(zvol_state_t *zv)
330208047Smm{
331219089Spjd	objset_t *os = zv->zv_objset;
332208047Smm	struct maparg	ma;
333208047Smm	int		err;
334185029Spjd
335208047Smm	ma.ma_zv = zv;
336208047Smm	ma.ma_blks = 0;
337208047Smm	zvol_free_extents(zv);
338208047Smm
339219089Spjd	/* commit any in-flight changes before traversing the dataset */
340219089Spjd	txg_wait_synced(dmu_objset_pool(os), 0);
341219089Spjd	err = traverse_dataset(dmu_objset_ds(os), 0,
342208047Smm	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
343208047Smm	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
344208047Smm		zvol_free_extents(zv);
345208047Smm		return (err ? err : EIO);
346185029Spjd	}
347185029Spjd
348185029Spjd	return (0);
349185029Spjd}
350185029Spjd
351185029Spjd/* ARGSUSED */
352185029Spjdvoid
353185029Spjdzvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
354185029Spjd{
355185029Spjd	zfs_creat_t *zct = arg;
356185029Spjd	nvlist_t *nvprops = zct->zct_props;
357168404Spjd	int error;
358168404Spjd	uint64_t volblocksize, volsize;
359168404Spjd
360185029Spjd	VERIFY(nvlist_lookup_uint64(nvprops,
361168404Spjd	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
362185029Spjd	if (nvlist_lookup_uint64(nvprops,
363168404Spjd	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
364168404Spjd		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
365168404Spjd
366168404Spjd	/*
367185029Spjd	 * These properties must be removed from the list so the generic
368168404Spjd	 * property setting step won't apply to them.
369168404Spjd	 */
370185029Spjd	VERIFY(nvlist_remove_all(nvprops,
371168404Spjd	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
372185029Spjd	(void) nvlist_remove_all(nvprops,
373168404Spjd	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
374168404Spjd
375168404Spjd	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
376168404Spjd	    DMU_OT_NONE, 0, tx);
377168404Spjd	ASSERT(error == 0);
378168404Spjd
379168404Spjd	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
380168404Spjd	    DMU_OT_NONE, 0, tx);
381168404Spjd	ASSERT(error == 0);
382168404Spjd
383168404Spjd	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
384168404Spjd	ASSERT(error == 0);
385168404Spjd}
386168404Spjd
387168404Spjd/*
388168404Spjd * Replay a TX_WRITE ZIL transaction that didn't get committed
389168404Spjd * after a system failure
390168404Spjd */
391168404Spjdstatic int
392168404Spjdzvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
393168404Spjd{
394168404Spjd	objset_t *os = zv->zv_objset;
395168404Spjd	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
396219089Spjd	uint64_t offset, length;
397168404Spjd	dmu_tx_t *tx;
398168404Spjd	int error;
399168404Spjd
400168404Spjd	if (byteswap)
401168404Spjd		byteswap_uint64_array(lr, sizeof (*lr));
402168404Spjd
403219089Spjd	offset = lr->lr_offset;
404219089Spjd	length = lr->lr_length;
405209962Smm
406219089Spjd	/* If it's a dmu_sync() block, write the whole block */
407219089Spjd	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
408219089Spjd		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
409219089Spjd		if (length < blocksize) {
410219089Spjd			offset -= offset % blocksize;
411219089Spjd			length = blocksize;
412219089Spjd		}
413219089Spjd	}
414219089Spjd
415168404Spjd	tx = dmu_tx_create(os);
416219089Spjd	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
417209962Smm	error = dmu_tx_assign(tx, TXG_WAIT);
418168404Spjd	if (error) {
419168404Spjd		dmu_tx_abort(tx);
420168404Spjd	} else {
421219089Spjd		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
422168404Spjd		dmu_tx_commit(tx);
423168404Spjd	}
424168404Spjd
425168404Spjd	return (error);
426168404Spjd}
427168404Spjd
428168404Spjd/* ARGSUSED */
429168404Spjdstatic int
430168404Spjdzvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
431168404Spjd{
432249195Smm	return (SET_ERROR(ENOTSUP));
433168404Spjd}
434168404Spjd
435168404Spjd/*
436168404Spjd * Callback vectors for replaying records.
437168404Spjd * Only TX_WRITE is needed for zvol.
438168404Spjd */
439168404Spjdzil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
440168404Spjd	zvol_replay_err,	/* 0 no such transaction type */
441168404Spjd	zvol_replay_err,	/* TX_CREATE */
442168404Spjd	zvol_replay_err,	/* TX_MKDIR */
443168404Spjd	zvol_replay_err,	/* TX_MKXATTR */
444168404Spjd	zvol_replay_err,	/* TX_SYMLINK */
445168404Spjd	zvol_replay_err,	/* TX_REMOVE */
446168404Spjd	zvol_replay_err,	/* TX_RMDIR */
447168404Spjd	zvol_replay_err,	/* TX_LINK */
448168404Spjd	zvol_replay_err,	/* TX_RENAME */
449168404Spjd	zvol_replay_write,	/* TX_WRITE */
450168404Spjd	zvol_replay_err,	/* TX_TRUNCATE */
451168404Spjd	zvol_replay_err,	/* TX_SETATTR */
452168404Spjd	zvol_replay_err,	/* TX_ACL */
453209962Smm	zvol_replay_err,	/* TX_CREATE_ACL */
454209962Smm	zvol_replay_err,	/* TX_CREATE_ATTR */
455209962Smm	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
456209962Smm	zvol_replay_err,	/* TX_MKDIR_ACL */
457209962Smm	zvol_replay_err,	/* TX_MKDIR_ATTR */
458209962Smm	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
459209962Smm	zvol_replay_err,	/* TX_WRITE2 */
460168404Spjd};
461168404Spjd
462219089Spjd#ifdef sun
463219089Spjdint
464219089Spjdzvol_name2minor(const char *name, minor_t *minor)
465219089Spjd{
466219089Spjd	zvol_state_t *zv;
467219089Spjd
468224791Spjd	mutex_enter(&spa_namespace_lock);
469219089Spjd	zv = zvol_minor_lookup(name);
470219089Spjd	if (minor && zv)
471219089Spjd		*minor = zv->zv_minor;
472224791Spjd	mutex_exit(&spa_namespace_lock);
473219089Spjd	return (zv ? 0 : -1);
474219089Spjd}
475219089Spjd#endif	/* sun */
476219089Spjd
477168404Spjd/*
478185029Spjd * Create a minor node (plus a whole lot more) for the specified volume.
479185029Spjd */
480185029Spjdint
481219089Spjdzvol_create_minor(const char *name)
482185029Spjd{
483219089Spjd	zfs_soft_state_t *zs;
484168404Spjd	zvol_state_t *zv;
485168404Spjd	objset_t *os;
486168404Spjd	dmu_object_info_t doi;
487241297Savg	uint64_t volsize;
488168404Spjd	int error;
489168404Spjd
490219089Spjd	ZFS_LOG(1, "Creating ZVOL %s...", name);
491168404Spjd
492224791Spjd	mutex_enter(&spa_namespace_lock);
493219089Spjd
494219089Spjd	if (zvol_minor_lookup(name) != NULL) {
495224791Spjd		mutex_exit(&spa_namespace_lock);
496249195Smm		return (SET_ERROR(EEXIST));
497168404Spjd	}
498168404Spjd
499219089Spjd	/* lie and say we're read-only */
500219089Spjd	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
501168404Spjd
502168404Spjd	if (error) {
503224791Spjd		mutex_exit(&spa_namespace_lock);
504219089Spjd		return (error);
505168404Spjd	}
506168404Spjd
507219089Spjd#ifdef sun
508219089Spjd	if ((minor = zfsdev_minor_alloc()) == 0) {
509219089Spjd		dmu_objset_disown(os, FTAG);
510224791Spjd		mutex_exit(&spa_namespace_lock);
511249195Smm		return (SET_ERROR(ENXIO));
512219089Spjd	}
513168404Spjd
514219089Spjd	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
515219089Spjd		dmu_objset_disown(os, FTAG);
516224791Spjd		mutex_exit(&spa_namespace_lock);
517249195Smm		return (SET_ERROR(EAGAIN));
518219089Spjd	}
519219089Spjd	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
520219089Spjd	    (char *)name);
521219089Spjd
522219089Spjd	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
523219089Spjd
524219089Spjd	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
525219089Spjd	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
526219089Spjd		ddi_soft_state_free(zfsdev_state, minor);
527219089Spjd		dmu_objset_disown(os, FTAG);
528224791Spjd		mutex_exit(&spa_namespace_lock);
529249195Smm		return (SET_ERROR(EAGAIN));
530219089Spjd	}
531219089Spjd
532219089Spjd	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
533219089Spjd
534219089Spjd	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
535219089Spjd	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
536219089Spjd		ddi_remove_minor_node(zfs_dip, chrbuf);
537219089Spjd		ddi_soft_state_free(zfsdev_state, minor);
538219089Spjd		dmu_objset_disown(os, FTAG);
539224791Spjd		mutex_exit(&spa_namespace_lock);
540249195Smm		return (SET_ERROR(EAGAIN));
541219089Spjd	}
542219089Spjd
543219089Spjd	zs = ddi_get_soft_state(zfsdev_state, minor);
544219089Spjd	zs->zss_type = ZSST_ZVOL;
545219089Spjd	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
546219089Spjd#else	/* !sun */
547219089Spjd
548241297Savg	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
549241297Savg	if (error) {
550241297Savg		ASSERT(error == 0);
551241297Savg		dmu_objset_disown(os, zvol_tag);
552241297Savg		mutex_exit(&spa_namespace_lock);
553241297Savg		return (error);
554241297Savg	}
555241297Savg
556219089Spjd	DROP_GIANT();
557219089Spjd	g_topology_lock();
558219089Spjd	zv = zvol_geom_create(name);
559241297Savg	zv->zv_volsize = volsize;
560241297Savg	zv->zv_provider->mediasize = zv->zv_volsize;
561241297Savg
562219089Spjd#endif	/* !sun */
563219089Spjd
564219089Spjd	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
565168404Spjd	zv->zv_min_bs = DEV_BSHIFT;
566168404Spjd	zv->zv_objset = os;
567219089Spjd	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
568219089Spjd		zv->zv_flags |= ZVOL_RDONLY;
569168404Spjd	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
570168404Spjd	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
571168404Spjd	    sizeof (rl_t), offsetof(rl_t, r_node));
572208047Smm	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
573208047Smm	    offsetof(zvol_extent_t, ze_node));
574168404Spjd	/* get and cache the blocksize */
575168404Spjd	error = dmu_object_info(os, ZVOL_OBJ, &doi);
576168404Spjd	ASSERT(error == 0);
577168404Spjd	zv->zv_volblocksize = doi.doi_data_block_size;
578168404Spjd
579219089Spjd	if (spa_writeable(dmu_objset_spa(os))) {
580219089Spjd		if (zil_replay_disable)
581219089Spjd			zil_destroy(dmu_objset_zil(os), B_FALSE);
582219089Spjd		else
583219089Spjd			zil_replay(os, zv, zvol_replay_vector);
584219089Spjd	}
585219089Spjd	dmu_objset_disown(os, FTAG);
586219089Spjd	zv->zv_objset = NULL;
587168404Spjd
588219089Spjd	zvol_minors++;
589168404Spjd
590224791Spjd	mutex_exit(&spa_namespace_lock);
591168404Spjd
592219089Spjd	zvol_geom_run(zv);
593168404Spjd
594168404Spjd	g_topology_unlock();
595168404Spjd	PICKUP_GIANT();
596168404Spjd
597219089Spjd	ZFS_LOG(1, "ZVOL %s created.", name);
598219089Spjd
599219089Spjd	return (0);
600168404Spjd}
601168404Spjd
602168404Spjd/*
603168404Spjd * Remove minor node for the specified volume.
604168404Spjd */
605219089Spjdstatic int
606219089Spjdzvol_remove_zv(zvol_state_t *zv)
607219089Spjd{
608219089Spjd#ifdef sun
609219089Spjd	minor_t minor = zv->zv_minor;
610219089Spjd#endif
611219089Spjd
612224791Spjd	ASSERT(MUTEX_HELD(&spa_namespace_lock));
613219089Spjd	if (zv->zv_total_opens != 0)
614249195Smm		return (SET_ERROR(EBUSY));
615219089Spjd
616219089Spjd	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
617219089Spjd
618219089Spjd#ifdef sun
619219089Spjd	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
620219089Spjd	ddi_remove_minor_node(zfs_dip, nmbuf);
621219089Spjd#endif	/* sun */
622219089Spjd
623219089Spjd	avl_destroy(&zv->zv_znode.z_range_avl);
624219089Spjd	mutex_destroy(&zv->zv_znode.z_range_lock);
625219089Spjd
626219089Spjd	zvol_geom_destroy(zv);
627219089Spjd
628219089Spjd	zvol_minors--;
629219089Spjd	return (0);
630219089Spjd}
631219089Spjd
632168404Spjdint
633168404Spjdzvol_remove_minor(const char *name)
634168404Spjd{
635168404Spjd	zvol_state_t *zv;
636219089Spjd	int rc;
637168404Spjd
638224791Spjd	mutex_enter(&spa_namespace_lock);
639168404Spjd	if ((zv = zvol_minor_lookup(name)) == NULL) {
640224791Spjd		mutex_exit(&spa_namespace_lock);
641249195Smm		return (SET_ERROR(ENXIO));
642168404Spjd	}
643219089Spjd	g_topology_lock();
644219089Spjd	rc = zvol_remove_zv(zv);
645219089Spjd	g_topology_unlock();
646224791Spjd	mutex_exit(&spa_namespace_lock);
647219089Spjd	return (rc);
648219089Spjd}
649168404Spjd
650219089Spjdint
651219089Spjdzvol_first_open(zvol_state_t *zv)
652219089Spjd{
653219089Spjd	objset_t *os;
654219089Spjd	uint64_t volsize;
655219089Spjd	int error;
656219089Spjd	uint64_t readonly;
657168404Spjd
658219089Spjd	/* lie and say we're read-only */
659219089Spjd	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
660219089Spjd	    zvol_tag, &os);
661219089Spjd	if (error)
662219089Spjd		return (error);
663168404Spjd
664219089Spjd	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
665219089Spjd	if (error) {
666219089Spjd		ASSERT(error == 0);
667219089Spjd		dmu_objset_disown(os, zvol_tag);
668219089Spjd		return (error);
669219089Spjd	}
670219089Spjd	zv->zv_objset = os;
671219089Spjd	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
672219089Spjd	if (error) {
673219089Spjd		dmu_objset_disown(os, zvol_tag);
674219089Spjd		return (error);
675219089Spjd	}
676219089Spjd	zv->zv_volsize = volsize;
677219089Spjd	zv->zv_zilog = zil_open(os, zvol_get_data);
678219089Spjd	zvol_size_changed(zv);
679168404Spjd
680219089Spjd	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
681219089Spjd	    NULL) == 0);
682219089Spjd	if (readonly || dmu_objset_is_snapshot(os) ||
683219089Spjd	    !spa_writeable(dmu_objset_spa(os)))
684219089Spjd		zv->zv_flags |= ZVOL_RDONLY;
685219089Spjd	else
686219089Spjd		zv->zv_flags &= ~ZVOL_RDONLY;
687219089Spjd	return (error);
688219089Spjd}
689168404Spjd
690219089Spjdvoid
691219089Spjdzvol_last_close(zvol_state_t *zv)
692219089Spjd{
693168404Spjd	zil_close(zv->zv_zilog);
694168404Spjd	zv->zv_zilog = NULL;
695239774Smm
696219089Spjd	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
697219089Spjd	zv->zv_dbuf = NULL;
698239774Smm
699239774Smm	/*
700239774Smm	 * Evict cached data
701239774Smm	 */
702239774Smm	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
703239774Smm	    !(zv->zv_flags & ZVOL_RDONLY))
704239774Smm		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
705248571Smm	dmu_objset_evict_dbufs(zv->zv_objset);
706239774Smm
707219089Spjd	dmu_objset_disown(zv->zv_objset, zvol_tag);
708168404Spjd	zv->zv_objset = NULL;
709168404Spjd}
710168404Spjd
711219089Spjd#ifdef sun
712168404Spjdint
713185029Spjdzvol_prealloc(zvol_state_t *zv)
714168404Spjd{
715185029Spjd	objset_t *os = zv->zv_objset;
716168404Spjd	dmu_tx_t *tx;
717185029Spjd	uint64_t refd, avail, usedobjs, availobjs;
718185029Spjd	uint64_t resid = zv->zv_volsize;
719185029Spjd	uint64_t off = 0;
720185029Spjd
721185029Spjd	/* Check the space usage before attempting to allocate the space */
722185029Spjd	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
723185029Spjd	if (avail < zv->zv_volsize)
724249195Smm		return (SET_ERROR(ENOSPC));
725185029Spjd
726185029Spjd	/* Free old extents if they exist */
727185029Spjd	zvol_free_extents(zv);
728185029Spjd
729185029Spjd	while (resid != 0) {
730185029Spjd		int error;
731185029Spjd		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
732185029Spjd
733185029Spjd		tx = dmu_tx_create(os);
734185029Spjd		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
735185029Spjd		error = dmu_tx_assign(tx, TXG_WAIT);
736185029Spjd		if (error) {
737185029Spjd			dmu_tx_abort(tx);
738185029Spjd			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
739185029Spjd			return (error);
740185029Spjd		}
741219089Spjd		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
742185029Spjd		dmu_tx_commit(tx);
743185029Spjd		off += bytes;
744185029Spjd		resid -= bytes;
745185029Spjd	}
746185029Spjd	txg_wait_synced(dmu_objset_pool(os), 0);
747185029Spjd
748185029Spjd	return (0);
749185029Spjd}
750219089Spjd#endif	/* sun */
751185029Spjd
752248571Smmstatic int
753219089Spjdzvol_update_volsize(objset_t *os, uint64_t volsize)
754185029Spjd{
755185029Spjd	dmu_tx_t *tx;
756168404Spjd	int error;
757185029Spjd
758224791Spjd	ASSERT(MUTEX_HELD(&spa_namespace_lock));
759185029Spjd
760219089Spjd	tx = dmu_tx_create(os);
761185029Spjd	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
762185029Spjd	error = dmu_tx_assign(tx, TXG_WAIT);
763185029Spjd	if (error) {
764185029Spjd		dmu_tx_abort(tx);
765185029Spjd		return (error);
766185029Spjd	}
767185029Spjd
768219089Spjd	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
769185029Spjd	    &volsize, tx);
770185029Spjd	dmu_tx_commit(tx);
771185029Spjd
772185029Spjd	if (error == 0)
773219089Spjd		error = dmu_free_long_range(os,
774185029Spjd		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
775219089Spjd	return (error);
776219089Spjd}
777185029Spjd
778219089Spjdvoid
779219089Spjdzvol_remove_minors(const char *name)
780219089Spjd{
781219089Spjd	struct g_geom *gp, *gptmp;
782219316Spjd	struct g_provider *pp;
783219089Spjd	zvol_state_t *zv;
784219316Spjd	size_t namelen;
785219089Spjd
786219316Spjd	namelen = strlen(name);
787219316Spjd
788219089Spjd	DROP_GIANT();
789224791Spjd	mutex_enter(&spa_namespace_lock);
790219089Spjd	g_topology_lock();
791219089Spjd
792219089Spjd	LIST_FOREACH_SAFE(gp, &zfs_zvol_class.geom, geom, gptmp) {
793219089Spjd		pp = LIST_FIRST(&gp->provider);
794219089Spjd		if (pp == NULL)
795219089Spjd			continue;
796219089Spjd		zv = pp->private;
797219089Spjd		if (zv == NULL)
798219089Spjd			continue;
799219316Spjd		if (strcmp(zv->zv_name, name) == 0 ||
800219316Spjd		    (strncmp(zv->zv_name, name, namelen) == 0 &&
801219316Spjd		     zv->zv_name[namelen] == '/')) {
802219089Spjd			(void) zvol_remove_zv(zv);
803219316Spjd		}
804185029Spjd	}
805219089Spjd
806219089Spjd	g_topology_unlock();
807224791Spjd	mutex_exit(&spa_namespace_lock);
808219089Spjd	PICKUP_GIANT();
809185029Spjd}
810185029Spjd
811185029Spjdint
812185029Spjdzvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
813185029Spjd{
814219089Spjd	zvol_state_t *zv = NULL;
815219089Spjd	objset_t *os;
816185029Spjd	int error;
817168404Spjd	dmu_object_info_t doi;
818185029Spjd	uint64_t old_volsize = 0ULL;
819219089Spjd	uint64_t readonly;
820168404Spjd
821224791Spjd	mutex_enter(&spa_namespace_lock);
822219089Spjd	zv = zvol_minor_lookup(name);
823219089Spjd	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
824224791Spjd		mutex_exit(&spa_namespace_lock);
825219089Spjd		return (error);
826168404Spjd	}
827168404Spjd
828219089Spjd	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
829168404Spjd	    (error = zvol_check_volsize(volsize,
830185029Spjd	    doi.doi_data_block_size)) != 0)
831185029Spjd		goto out;
832168404Spjd
833219089Spjd	VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
834219089Spjd	    NULL) == 0);
835219089Spjd	if (readonly) {
836168404Spjd		error = EROFS;
837185029Spjd		goto out;
838168404Spjd	}
839168404Spjd
840219089Spjd	error = zvol_update_volsize(os, volsize);
841185029Spjd	/*
842185029Spjd	 * Reinitialize the dump area to the new size. If we
843219089Spjd	 * failed to resize the dump area then restore it back to
844219089Spjd	 * its original size.
845185029Spjd	 */
846219089Spjd	if (zv && error == 0) {
847219089Spjd#ifdef ZVOL_DUMP
848219089Spjd		if (zv->zv_flags & ZVOL_DUMPIFIED) {
849219089Spjd			old_volsize = zv->zv_volsize;
850219089Spjd			zv->zv_volsize = volsize;
851219089Spjd			if ((error = zvol_dumpify(zv)) != 0 ||
852219089Spjd			    (error = dumpvp_resize()) != 0) {
853219089Spjd				(void) zvol_update_volsize(os, old_volsize);
854219089Spjd				zv->zv_volsize = old_volsize;
855219089Spjd				error = zvol_dumpify(zv);
856219089Spjd			}
857185029Spjd		}
858219089Spjd#endif	/* ZVOL_DUMP */
859219089Spjd		if (error == 0) {
860219089Spjd			zv->zv_volsize = volsize;
861219089Spjd			zvol_size_changed(zv);
862219089Spjd		}
863168404Spjd	}
864168404Spjd
865219089Spjd#ifdef sun
866219089Spjd	/*
867219089Spjd	 * Generate a LUN expansion event.
868219089Spjd	 */
869219089Spjd	if (zv && error == 0) {
870219089Spjd		sysevent_id_t eid;
871219089Spjd		nvlist_t *attr;
872219089Spjd		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
873219089Spjd
874219089Spjd		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
875219089Spjd		    zv->zv_minor);
876219089Spjd
877219089Spjd		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
878219089Spjd		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
879219089Spjd
880219089Spjd		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
881219089Spjd		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
882219089Spjd
883219089Spjd		nvlist_free(attr);
884219089Spjd		kmem_free(physpath, MAXPATHLEN);
885219089Spjd	}
886219089Spjd#endif	/* sun */
887219089Spjd
888185029Spjdout:
889219089Spjd	dmu_objset_rele(os, FTAG);
890168404Spjd
891224791Spjd	mutex_exit(&spa_namespace_lock);
892168404Spjd
893168404Spjd	return (error);
894168404Spjd}
895168404Spjd
/*
 * GEOM provider open entry point.  'count' opens are granted
 * atomically: either all succeed or none do.  'flag' carries the
 * FWRITE/FEXCL open flags.  Returns 0 or an errno.
 */
/*ARGSUSED*/
static int
zvol_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t locked = B_FALSE;

	/*
	 * Protect against recursively entering spa_namespace_lock
	 * when spa_open() is used for a pool on a (local) ZVOL(s).
	 * This is needed since we replaced upstream zfsdev_state_lock
	 * with spa_namespace_lock in the ZVOL code.
	 * We are using the same trick as spa_open().
	 * Note that calls in zvol_first_open which need to resolve
	 * pool name to a spa object will enter spa_open()
	 * recursively, but that function already has all the
	 * necessary protection.
	 */
	if (!MUTEX_HELD(&spa_namespace_lock)) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	/*
	 * NOTE(review): pp->private presumably points at the zvol state,
	 * set when the provider was created — confirm against creation path.
	 */
	zv = pp->private;
	if (zv == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}

	/* First opener performs one-time setup via zvol_first_open(). */
	if (zv->zv_total_opens == 0)
		err = zvol_first_open(zv);
	if (err) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (err);
	}
	/* Writable open of a read-only volume is refused. */
	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out;
	}
	/* A prior exclusive open blocks all further opens. */
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		/* Exclusive open requires that nobody else has it open. */
		if (zv->zv_total_opens != 0) {
			err = SET_ERROR(EBUSY);
			goto out;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_total_opens += count;
	if (locked)
		mutex_exit(&spa_namespace_lock);

	return (err);
out:
	/*
	 * Failed after zvol_first_open(): if we were the only (would-be)
	 * opener, tear the first-open state back down.
	 */
	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);
	if (locked)
		mutex_exit(&spa_namespace_lock);
	return (err);
}
964219089Spjd
/*
 * GEOM provider close entry point.  Drops 'count' references; the
 * last close tears down the per-volume state via zvol_last_close().
 * Returns 0 or an errno.
 */
/*ARGSUSED*/
static int
zvol_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t locked = B_FALSE;

	/* See comment in zvol_open(). */
	if (!MUTEX_HELD(&spa_namespace_lock)) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	zv = pp->private;
	if (zv == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}

	/* An exclusive open is necessarily the sole open; clear it now. */
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_total_opens == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_total_opens != 0);

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_total_opens -= count;

	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);

	if (locked)
		mutex_exit(&spa_namespace_lock);
	return (error);
}
1009168404Spjd
1010219089Spjdstatic void
1011219089Spjdzvol_get_done(zgd_t *zgd, int error)
1012168404Spjd{
1013219089Spjd	if (zgd->zgd_db)
1014219089Spjd		dmu_buf_rele(zgd->zgd_db, zgd);
1015168404Spjd
1016219089Spjd	zfs_range_unlock(zgd->zgd_rl);
1017219089Spjd
1018219089Spjd	if (error == 0 && zgd->zgd_bp)
1019219089Spjd		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1020219089Spjd
1021168404Spjd	kmem_free(zgd, sizeof (zgd_t));
1022168404Spjd}
1023168404Spjd
1024168404Spjd/*
1025168404Spjd * Get data to generate a TX_WRITE intent log record.
1026168404Spjd */
static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zvol_state_t *zv = arg;
	objset_t *os = zv->zv_objset;
	uint64_t object = ZVOL_OBJ;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;	/* length of user data */
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/* zgd is freed by zvol_get_done(), which runs on every exit path. */
	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zv->zv_zilog;
	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) {	/* immediate write */
		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
	} else {
		/*
		 * Indirect write: widen the range to a full volume
		 * block and dmu_sync() it out.
		 */
		size = zv->zv_volblocksize;
		offset = P2ALIGN(offset, size);
		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);
		if (error == 0) {
			/* Seed lr_blkptr with the block's current bp. */
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zvol_get_done, zgd);

			/*
			 * On success dmu_sync() takes ownership of zgd
			 * and will invoke zvol_get_done() itself, so we
			 * must not call it here.
			 */
			if (error == 0)
				return (0);
		}
	}

	zvol_get_done(zgd, error);

	return (error);
}
1087219089Spjd
1088219089Spjd/*
1089219089Spjd * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
1090219089Spjd *
1091219089Spjd * We store data in the log buffers if it's small enough.
1092219089Spjd * Otherwise we will later flush the data out via dmu_sync().
1093219089Spjd */
/*
 * Tunable: writes of whole blocks no larger than this are stored
 * directly in the log record; larger ones go indirect via dmu_sync()
 * (ignored when logbias=throughput — see zvol_log_write()).
 */
ssize_t zvol_immediate_write_sz = 32768;
1095219089Spjd
/*
 * Log a write of 'resid' bytes at 'off' to the ZIL inside transaction
 * 'tx'.  Each itx covers at most ZIL_MAX_LOG_DATA bytes (or one volume
 * block for indirect writes), so large writes produce multiple itxs.
 * 'sync' requests a synchronous semantic for the records.
 */
static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
    boolean_t sync)
{
	uint32_t blocksize = zv->zv_volblocksize;
	zilog_t *zilog = zv->zv_zilog;
	boolean_t slogging;
	ssize_t immediate_write_sz;

	/* Nothing to log while replaying the ZIL. */
	if (zil_replaying(zilog, tx))
		return;

	/* logbias=throughput disables immediate writes entirely. */
	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
	    ? 0 : zvol_immediate_write_sz;

	slogging = spa_has_slogs(zilog->zl_spa) &&
	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		ssize_t len;
		itx_wr_state_t write_state;

		/*
		 * Unlike zfs_log_write() we can be called with
		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
		 */
		if (blocksize > immediate_write_sz && !slogging &&
		    resid >= blocksize && off % blocksize == 0) {
			write_state = WR_INDIRECT; /* uses dmu_sync */
			len = blocksize;
		} else if (sync) {
			/* Copy the data into the log record itself. */
			write_state = WR_COPIED;
			len = MIN(ZIL_MAX_LOG_DATA, resid);
		} else {
			/* Defer the copy until the itx is committed. */
			write_state = WR_NEED_COPY;
			len = MIN(ZIL_MAX_LOG_DATA, resid);
		}

		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
		    (write_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		/*
		 * If the in-record copy fails, fall back to a smaller
		 * itx and let the commit path read the data later.
		 */
		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			write_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = write_state;
		if (write_state == WR_NEED_COPY)
			itx->itx_sod += len;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zv;
		itx->itx_sync = sync;

		zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}
1165209962Smm
1166219089Spjd#ifdef sun
/*
 * Perform raw dump I/O of 'size' bytes at 'offset' against vdev 'vd',
 * recursing through mirror/replacing/spare children.  For reads the
 * first successful child wins; for writes all children are attempted
 * and the call fails only if every child fails.  'isdump' selects the
 * panic-time ldi_dump() path (illumos-only code, under #ifdef sun).
 */
static int
zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
    uint64_t size, boolean_t doread, boolean_t isdump)
{
	vdev_disk_t *dvd;
	int c;
	int numerrors = 0;

	if (vd->vdev_ops == &vdev_mirror_ops ||
	    vd->vdev_ops == &vdev_replacing_ops ||
	    vd->vdev_ops == &vdev_spare_ops) {
		for (c = 0; c < vd->vdev_children; c++) {
			int err = zvol_dumpio_vdev(vd->vdev_child[c],
			    addr, offset, origoffset, size, doread, isdump);
			if (err != 0) {
				numerrors++;
			} else if (doread) {
				/* One good copy is enough for a read. */
				break;
			}
		}
	}

	/* Interior (non-raidz) vdev: succeed if any child succeeded. */
	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
		return (numerrors < vd->vdev_children ? 0 : EIO);

	if (doread && !vdev_readable(vd))
		return (SET_ERROR(EIO));
	else if (!doread && !vdev_writeable(vd))
		return (SET_ERROR(EIO));

	/* raidz computes its own child offsets from the original offset. */
	if (vd->vdev_ops == &vdev_raidz_ops) {
		return (vdev_raidz_physio(vd,
		    addr, size, offset, origoffset, doread, isdump));
	}

	/* Leaf disk: skip past the front vdev labels. */
	offset += VDEV_LABEL_START_SIZE;

	if (ddi_in_panic() || isdump) {
		ASSERT(!doread);
		if (doread)
			return (SET_ERROR(EIO));
		dvd = vd->vdev_tsd;
		ASSERT3P(dvd, !=, NULL);
		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
		    lbtodb(size)));
	} else {
		dvd = vd->vdev_tsd;
		ASSERT3P(dvd, !=, NULL);
		return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
		    offset, doread ? B_READ : B_WRITE));
	}
}
1219219089Spjd
1220219089Spjdstatic int
1221219089Spjdzvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1222219089Spjd    boolean_t doread, boolean_t isdump)
1223219089Spjd{
1224219089Spjd	vdev_t *vd;
1225219089Spjd	int error;
1226219089Spjd	zvol_extent_t *ze;
1227219089Spjd	spa_t *spa = dmu_objset_spa(zv->zv_objset);
1228219089Spjd
1229219089Spjd	/* Must be sector aligned, and not stradle a block boundary. */
1230219089Spjd	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1231219089Spjd	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1232249195Smm		return (SET_ERROR(EINVAL));
1233219089Spjd	}
1234219089Spjd	ASSERT(size <= zv->zv_volblocksize);
1235219089Spjd
1236219089Spjd	/* Locate the extent this belongs to */
1237219089Spjd	ze = list_head(&zv->zv_extents);
1238219089Spjd	while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
1239219089Spjd		offset -= ze->ze_nblks * zv->zv_volblocksize;
1240219089Spjd		ze = list_next(&zv->zv_extents, ze);
1241219089Spjd	}
1242219089Spjd
1243248571Smm	if (ze == NULL)
1244249195Smm		return (SET_ERROR(EINVAL));
1245248571Smm
1246219089Spjd	if (!ddi_in_panic())
1247219089Spjd		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1248219089Spjd
1249219089Spjd	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
1250219089Spjd	offset += DVA_GET_OFFSET(&ze->ze_dva);
1251255750Sdelphij	error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
1252255750Sdelphij	    size, doread, isdump);
1253219089Spjd
1254219089Spjd	if (!ddi_in_panic())
1255219089Spjd		spa_config_exit(spa, SCL_STATE, FTAG);
1256219089Spjd
1257219089Spjd	return (error);
1258219089Spjd}
1259219089Spjd#endif	/* sun */
1260219089Spjd
/*
 * GEOM strategy routine: service a BIO_READ/BIO_WRITE request against
 * the zvol's backing object.  The whole request range is range-locked,
 * the transfer is chunked at zvol_maxphys, and synchronous writes are
 * committed to the ZIL before delivery.  Completion is always reported
 * via g_io_deliver(); the function itself returns 0.
 */
int
zvol_strategy(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	rl_t *rl;
	int error = 0;
	boolean_t doread = (bp->bio_cmd == BIO_READ);
	boolean_t is_dumpified;
	boolean_t sync;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return (0);
	}

	/* Only reads are allowed on a read-only volume. */
	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
		g_io_deliver(bp, EROFS);
		return (0);
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	/* Reject transfers that start outside the volume. */
	if (resid > 0 && (off < 0 || off >= volsize)) {
		g_io_deliver(bp, EIO);
		return (0);
	}

#ifdef illumos
	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
#else
	/* FreeBSD has no dumpified zvols. */
	is_dumpified = B_FALSE;
#endif
        sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	rl = zfs_range_lock(&zv->zv_znode, off, resid,
	    doread ? RL_READER : RL_WRITER);

	/* Transfer in chunks of at most zvol_maxphys bytes. */
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
#ifdef illumos
		if (is_dumpified) {
			/* Dump I/O may not cross a volume-block boundary. */
			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
			error = zvol_dumpio(zv, addr, off, size,
			    doread, B_FALSE);
		} else if (doread) {
#else
		if (doread) {
#endif
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
	zfs_range_unlock(rl);

	/* Report a partial transfer and the reason it stopped short. */
	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length)
		bp->bio_error = (off > volsize ? EINVAL : error);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	g_io_deliver(bp, 0);

	return (0);
}
1361219089Spjd
1362219089Spjd#ifdef sun
1363219089Spjd/*
1364219089Spjd * Set the buffer count to the zvol maximum transfer.
1365219089Spjd * Using our own routine instead of the default minphys()
1366219089Spjd * means that for larger writes we write bigger buffers on X86
1367219089Spjd * (128K instead of 56K) and flush the disk write cache less often
1368219089Spjd * (every zvol_maxphys - currently 1MB) instead of minphys (currently
1369219089Spjd * 56K on X86 and 128K on sparc).
1370219089Spjd */
1371219089Spjdvoid
1372219089Spjdzvol_minphys(struct buf *bp)
1373219089Spjd{
1374219089Spjd	if (bp->b_bcount > zvol_maxphys)
1375219089Spjd		bp->b_bcount = zvol_maxphys;
1376219089Spjd}
1377219089Spjd
/*
 * Kernel crash-dump entry point: write 'nblocks' disk blocks starting
 * at 'blkno' from 'addr' to a dumpified zvol.  Each zvol_dumpio() call
 * is limited so it never crosses a volume-block boundary.
 */
int
zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	int error = 0;
	uint64_t size;
	uint64_t boff;
	uint64_t resid;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	/* Only volumes prepared with DKIOCDUMPINIT can take dumps. */
	if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
		return (SET_ERROR(EINVAL));

	boff = ldbtob(blkno);
	resid = ldbtob(nblocks);

	VERIFY3U(boff + resid, <=, zv->zv_volsize);

	while (resid) {
		/* Stop each chunk at the next volume-block boundary. */
		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
		if (error)
			break;
		boff += size;
		addr += size;
		resid -= size;
	}

	return (error);
}
1412168404Spjd
/*
 * Character-device read entry point: copy data described by 'uio'
 * out of the zvol.  Dumpified volumes bypass the DMU via physio();
 * otherwise the range is read-locked and transferred in chunks of
 * at most DMU_MAX_ACCESS/2 bytes.
 */
/*ARGSUSED*/
int
zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	volsize = zv->zv_volsize;
	/* Reject reads starting outside the volume. */
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
		return (SET_ERROR(EIO));

	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_READ,
		    zvol_minphys, uio);
		return (error);
	}

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_READER);
	/* dmu_read_uio() advances uio_loffset / shrinks uio_resid. */
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio->uio_loffset)
			bytes = volsize - uio->uio_loffset;

		error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_range_unlock(rl);
	return (error);
}
1458219089Spjd
/*
 * Character-device write entry point: copy data described by 'uio'
 * into the zvol.  Dumpified volumes bypass the DMU via physio();
 * otherwise the range is write-locked, each chunk is written inside
 * its own transaction, and the ZIL is committed at the end when the
 * write must be synchronous (WCE off or sync=always).
 */
/*ARGSUSED*/
int
zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;
	boolean_t sync;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	volsize = zv->zv_volsize;
	/* Reject writes starting outside the volume. */
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
		return (SET_ERROR(EIO));

	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_WRITE,
		    zvol_minphys, uio);
		return (error);
	}

	/* Synchronous unless the write cache is enabled. */
	sync = !(zv->zv_flags & ZVOL_WCE) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_WRITER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			/* An unassigned tx must be aborted, not committed. */
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_range_unlock(rl);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	return (error);
}
1517219089Spjd
/*
 * Fabricate an EFI label for a zvol of 'vs' bytes with a block shift
 * of 'bs' and copy the requested pieces out to the user's dk_efi_t at
 * 'arg'.  LBA 1 yields the GPT header (plus the single partition
 * entry, space permitting); LBA 2 yields just the partition entry.
 * Returns 0, EFAULT on copyin/copyout failure, or EINVAL for
 * unsupported LBAs (e.g. a PMBR request at LBA 0).
 */
int
zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
{
	struct uuid uuid = EFI_RESERVED;
	efi_gpe_t gpe = { 0 };
	uint32_t crc;
	dk_efi_t efi;
	int length;
	char *ptr;

	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
		return (SET_ERROR(EFAULT));
	ptr = (char *)(uintptr_t)efi.dki_data_64;
	length = efi.dki_length;
	/*
	 * Some clients may attempt to request a PMBR for the
	 * zvol.  Currently this interface will return EINVAL to
	 * such requests.  These requests could be supported by
	 * adding a check for lba == 0 and consing up an appropriate
	 * PMBR.
	 */
	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
		return (SET_ERROR(EINVAL));

	/* One partition spanning LBA 34 through the last usable LBA. */
	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);

	if (efi.dki_lba == 1) {
		efi_gpt_t gpt = { 0 };

		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
		gpt.efi_gpt_MyLBA = LE_64(1ULL);
		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
		gpt.efi_gpt_SizeOfPartitionEntry =
		    LE_32(sizeof (efi_gpe_t));
		/* Entry-array CRC must be computed before the header CRC. */
		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
		    flag))
			return (SET_ERROR(EFAULT));
		ptr += sizeof (gpt);
		length -= sizeof (gpt);
	}
	/* Copy out the partition entry if any room remains. */
	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
	    length), flag))
		return (SET_ERROR(EFAULT));
	return (0);
}
1574219089Spjd
1575219089Spjd/*
1576219089Spjd * BEGIN entry points to allow external callers access to the volume.
1577219089Spjd */
1578219089Spjd/*
1579219089Spjd * Return the volume parameters needed for access from an external caller.
1580219089Spjd * These values are invariant as long as the volume is held open.
1581219089Spjd */
1582219089Spjdint
1583219089Spjdzvol_get_volume_params(minor_t minor, uint64_t *blksize,
1584219089Spjd    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
1585219089Spjd    void **rl_hdl, void **bonus_hdl)
1586219089Spjd{
1587219089Spjd	zvol_state_t *zv;
1588219089Spjd
1589219089Spjd	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1590219089Spjd	if (zv == NULL)
1591249195Smm		return (SET_ERROR(ENXIO));
1592219089Spjd	if (zv->zv_flags & ZVOL_DUMPIFIED)
1593249195Smm		return (SET_ERROR(ENXIO));
1594219089Spjd
1595219089Spjd	ASSERT(blksize && max_xfer_len && minor_hdl &&
1596219089Spjd	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
1597219089Spjd
1598219089Spjd	*blksize = zv->zv_volblocksize;
1599219089Spjd	*max_xfer_len = (uint64_t)zvol_maxphys;
1600219089Spjd	*minor_hdl = zv;
1601219089Spjd	*objset_hdl = zv->zv_objset;
1602219089Spjd	*zil_hdl = zv->zv_zilog;
1603219089Spjd	*rl_hdl = &zv->zv_znode;
1604219089Spjd	*bonus_hdl = zv->zv_dbuf;
1605219089Spjd	return (0);
1606219089Spjd}
1607219089Spjd
1608219089Spjd/*
1609219089Spjd * Return the current volume size to an external caller.
1610219089Spjd * The size can change while the volume is open.
1611219089Spjd */
1612219089Spjduint64_t
1613219089Spjdzvol_get_volume_size(void *minor_hdl)
1614219089Spjd{
1615219089Spjd	zvol_state_t *zv = minor_hdl;
1616219089Spjd
1617219089Spjd	return (zv->zv_volsize);
1618219089Spjd}
1619219089Spjd
1620219089Spjd/*
1621219089Spjd * Return the current WCE setting to an external caller.
1622219089Spjd * The WCE setting can change while the volume is open.
1623219089Spjd */
1624219089Spjdint
1625219089Spjdzvol_get_volume_wce(void *minor_hdl)
1626219089Spjd{
1627219089Spjd	zvol_state_t *zv = minor_hdl;
1628219089Spjd
1629219089Spjd	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
1630219089Spjd}
1631219089Spjd
1632219089Spjd/*
1633219089Spjd * Entry point for external callers to zvol_log_write
1634219089Spjd */
1635219089Spjdvoid
1636219089Spjdzvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
1637219089Spjd    boolean_t sync)
1638219089Spjd{
1639219089Spjd	zvol_state_t *zv = minor_hdl;
1640219089Spjd
1641219089Spjd	zvol_log_write(zv, tx, off, resid, sync);
1642219089Spjd}
1643219089Spjd/*
1644219089Spjd * END entry points to allow external callers access to the volume.
1645219089Spjd */
1646219089Spjd
1647219089Spjd/*
1648219089Spjd * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
1649219089Spjd */
1650219089Spjd/*ARGSUSED*/
1651219089Spjdint
zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{
	/*
	 * Disk-device ioctl entry point for zvol minors.
	 *
	 * Locking contract: spa_namespace_lock is taken up front to pin the
	 * minor's soft state.  Cases that return directly must drop the lock
	 * themselves first; cases that "break" fall through to the common
	 * mutex_exit() at the bottom of the switch.
	 */
	zvol_state_t *zv;
	struct dk_cinfo dki;
	struct dk_minfo dkm;
	struct dk_callback *dkc;
	int error = 0;
	rl_t *rl;

	mutex_enter(&spa_namespace_lock);

	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);

	if (zv == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}
	ASSERT(zv->zv_total_opens > 0);

	switch (cmd) {

	case DKIOCINFO:
		/* Controller info: report a synthetic "zvol" controller. */
		bzero(&dki, sizeof (dki));
		(void) strcpy(dki.dki_cname, "zvol");
		(void) strcpy(dki.dki_dname, "zvol");
		dki.dki_ctype = DKC_UNKNOWN;
		dki.dki_unit = getminor(dev);
		/* Max transfer in zv_min_bs units, capped at one SPA block. */
		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
		mutex_exit(&spa_namespace_lock);
		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
			error = SET_ERROR(EFAULT);
		return (error);

	case DKIOCGMEDIAINFO:
		/* Media info: logical block size and capacity in blocks. */
		bzero(&dkm, sizeof (dkm));
		dkm.dki_lbsize = 1U << zv->zv_min_bs;
		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
		dkm.dki_media_type = DK_UNKNOWN;
		mutex_exit(&spa_namespace_lock);
		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
			error = SET_ERROR(EFAULT);
		return (error);

	case DKIOCGETEFI:
		{
			/* Snapshot size/shift before dropping the lock. */
			uint64_t vs = zv->zv_volsize;
			uint8_t bs = zv->zv_min_bs;

			mutex_exit(&spa_namespace_lock);
			error = zvol_getefi((void *)arg, flag, vs, bs);
			return (error);
		}

	case DKIOCFLUSHWRITECACHE:
		dkc = (struct dk_callback *)arg;
		mutex_exit(&spa_namespace_lock);
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
		/*
		 * Only trust the callback pointer for in-kernel callers
		 * (FKIOCTL); for userland callers arg is a user address.
		 */
		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
			error = 0;
		}
		return (error);

	case DKIOCGETWCE:
		{
			/* Report whether the emulated write cache is on. */
			int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
			if (ddi_copyout(&wce, (void *)arg, sizeof (int),
			    flag))
				error = SET_ERROR(EFAULT);
			break;
		}
	case DKIOCSETWCE:
		{
			int wce;
			if (ddi_copyin((void *)arg, &wce, sizeof (int),
			    flag)) {
				error = SET_ERROR(EFAULT);
				break;
			}
			if (wce) {
				zv->zv_flags |= ZVOL_WCE;
				mutex_exit(&spa_namespace_lock);
			} else {
				/*
				 * Disabling the cache makes writes sync;
				 * flush anything already buffered in the ZIL.
				 */
				zv->zv_flags &= ~ZVOL_WCE;
				mutex_exit(&spa_namespace_lock);
				zil_commit(zv->zv_zilog, ZVOL_OBJ);
			}
			return (0);
		}

	case DKIOCGGEOM:
	case DKIOCGVTOC:
		/*
		 * commands using these (like prtvtoc) expect ENOTSUP
		 * since we're emulating an EFI label
		 */
		error = SET_ERROR(ENOTSUP);
		break;

	case DKIOCDUMPINIT:
		/* Serialize against all volume I/O while dumpifying. */
		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
		    RL_WRITER);
		error = zvol_dumpify(zv);
		zfs_range_unlock(rl);
		break;

	case DKIOCDUMPFINI:
		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
			break;
		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
		    RL_WRITER);
		error = zvol_dump_fini(zv);
		zfs_range_unlock(rl);
		break;

	case DKIOCFREE:
	{
		/* TRIM/UNMAP: punch a hole in the backing DMU object. */
		dkioc_free_t df;
		dmu_tx_t *tx;

		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
			error = SET_ERROR(EFAULT);
			break;
		}

		/*
		 * Apply Postel's Law to length-checking.  If they overshoot,
		 * just blank out until the end, if there's a need to blank
		 * out anything.
		 */
		if (df.df_start >= zv->zv_volsize)
			break;	/* No need to do anything... */
		if (df.df_start + df.df_length > zv->zv_volsize)
			df.df_length = DMU_OBJECT_END;

		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
		    RL_WRITER);
		tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			/* Log the truncate first, then free the range. */
			zvol_log_truncate(zv, tx, df.df_start,
			    df.df_length, B_TRUE);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    df.df_start, df.df_length);
		}

		zfs_range_unlock(rl);

		if (error == 0) {
			/*
			 * If the write-cache is disabled or 'sync' property
			 * is set to 'always' then treat this as a synchronous
			 * operation (i.e. commit to zil).
			 */
			if (!(zv->zv_flags & ZVOL_WCE) ||
			    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
				zil_commit(zv->zv_zilog, ZVOL_OBJ);

			/*
			 * If the caller really wants synchronous writes, and
			 * can't wait for them, don't return until the write
			 * is done.
			 */
			if (df.df_flags & DF_WAIT_SYNC) {
				txg_wait_synced(
				    dmu_objset_pool(zv->zv_objset), 0);
			}
		}
		break;
	}

	default:
		error = SET_ERROR(ENOTTY);
		break;

	}
	mutex_exit(&spa_namespace_lock);
	return (error);
}
1834219089Spjd#endif	/* sun */
1835219089Spjd
1836219089Spjdint
1837168404Spjdzvol_busy(void)
1838168404Spjd{
1839168404Spjd	return (zvol_minors != 0);
1840168404Spjd}
1841168404Spjd
1842168404Spjdvoid
1843168404Spjdzvol_init(void)
1844168404Spjd{
1845219089Spjd	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
1846219089Spjd	    1) == 0);
1847168404Spjd	ZFS_LOG(1, "ZVOL Initialized.");
1848168404Spjd}
1849168404Spjd
void
zvol_fini(void)
{
	/* Tear down the minor soft-state table; inverse of zvol_init(). */
	ddi_soft_state_fini(&zfsdev_state);
	ZFS_LOG(1, "ZVOL Deinitialized.");
}
1856185029Spjd
1857219089Spjd#ifdef sun
1858255750Sdelphij/*ARGSUSED*/
1859185029Spjdstatic int
1860255750Sdelphijzfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
1861255750Sdelphij{
1862255750Sdelphij	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1863255750Sdelphij
1864255750Sdelphij	if (spa_feature_is_active(spa,
1865255750Sdelphij	    &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP]))
1866255750Sdelphij		return (1);
1867255750Sdelphij	return (0);
1868255750Sdelphij}
1869255750Sdelphij
1870255750Sdelphij/*ARGSUSED*/
1871255750Sdelphijstatic void
1872255750Sdelphijzfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
1873255750Sdelphij{
1874255750Sdelphij	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1875255750Sdelphij
1876255750Sdelphij	spa_feature_incr(spa,
1877255750Sdelphij	    &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP], tx);
1878255750Sdelphij}
1879255750Sdelphij
/*
 * Prepare a zvol's backing object for use as a crash-dump device.
 *
 * If 'resize' is set only the refreservation is brought in line with the
 * new volsize.  Otherwise the zvol's current checksum/compression/
 * refreservation/volblocksize (and dedup, on new-enough pools) settings
 * are stashed in the ZAP so zvol_dump_fini() can restore them, and
 * dump-friendly settings are applied.  Caller holds spa_namespace_lock.
 */
static int
zvol_dump_init(zvol_state_t *zv, boolean_t resize)
{
	dmu_tx_t *tx;
	int error;
	objset_t *os = zv->zv_objset;
	spa_t *spa = dmu_objset_spa(os);
	vdev_t *vd = spa->spa_root_vdev;
	nvlist_t *nv = NULL;
	uint64_t version = spa_version(spa);
	enum zio_checksum checksum;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(vd->vdev_ops == &vdev_root_ops);

	/* Start from an empty object; any existing data is discarded. */
	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
	    DMU_OBJECT_END);
	/* wait for dmu_free_long_range to actually free the blocks */
	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

	/*
	 * If the pool on which the dump device is being initialized has more
	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
	 * enabled.  If so, bump that feature's counter to indicate that the
	 * feature is active. We also check the vdev type to handle the
	 * following case:
	 *   # zpool create test raidz disk1 disk2 disk3
	 *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
	 *   the raidz vdev itself has 3 children.
	 */
	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
		if (!spa_feature_is_enabled(spa,
		    &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP]))
			return (SET_ERROR(ENOTSUP));
		(void) dsl_sync_task(spa_name(spa),
		    zfs_mvdev_dump_feature_check,
		    zfs_mvdev_dump_activate_feature_sync, NULL, 2);
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	/*
	 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
	 * function.  Otherwise, use the old default -- OFF.
	 */
	checksum = spa_feature_is_active(spa,
	    &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP]) ?
	    ZIO_CHECKSUM_NOPARITY : ZIO_CHECKSUM_OFF;

	/*
	 * If we are resizing the dump device then we only need to
	 * update the refreservation to match the newly updated
	 * zvolsize. Otherwise, we save off the original state of the
	 * zvol so that we can restore them if the zvol is ever undumpified.
	 */
	if (resize) {
		error = zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &zv->zv_volsize, tx);
	} else {
		/*
		 * NB: this 'checksum' deliberately shadows the outer enum;
		 * it holds the *saved* property value, not the new setting.
		 */
		uint64_t checksum, compress, refresrv, vbs, dedup;

		/*
		 * Error-chained: the first failing call wins and every
		 * subsequent one is skipped.
		 */
		error = dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
		if (version >= SPA_VERSION_DEDUP) {
			error = error ? error :
			    dsl_prop_get_integer(zv->zv_name,
			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
		}

		/* Stash the saved values in the zvol's ZAP object. */
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
		    &compress, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &refresrv, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
		    &vbs, tx);
		/* Dump devices use the largest possible block size. */
		error = error ? error : dmu_object_set_blocksize(
		    os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
		if (version >= SPA_VERSION_DEDUP) {
			error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
			    &dedup, tx);
		}
		if (error == 0)
			zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
	}
	dmu_tx_commit(tx);

	/*
	 * We only need update the zvol's property if we are initializing
	 * the dump area for the first time.
	 */
	if (!resize) {
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
		    ZIO_COMPRESS_OFF) == 0);
		/* 'checksum' here is the outer enum chosen above. */
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
		    checksum) == 0);
		if (version >= SPA_VERSION_DEDUP) {
			VERIFY(nvlist_add_uint64(nv,
			    zfs_prop_to_name(ZFS_PROP_DEDUP),
			    ZIO_CHECKSUM_OFF) == 0);
		}

		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
		    nv, NULL);
		nvlist_free(nv);

		if (error)
			return (error);
	}

	/* Allocate the space for the dump */
	error = zvol_prealloc(zv);
	return (error);
}
2018185029Spjd
2019185029Spjdstatic int
2020185029Spjdzvol_dumpify(zvol_state_t *zv)
2021185029Spjd{
2022185029Spjd	int error = 0;
2023185029Spjd	uint64_t dumpsize = 0;
2024185029Spjd	dmu_tx_t *tx;
2025185029Spjd	objset_t *os = zv->zv_objset;
2026185029Spjd
2027219089Spjd	if (zv->zv_flags & ZVOL_RDONLY)
2028249195Smm		return (SET_ERROR(EROFS));
2029185029Spjd
2030185029Spjd	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
2031185029Spjd	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
2032248571Smm		boolean_t resize = (dumpsize > 0);
2033185029Spjd
2034185029Spjd		if ((error = zvol_dump_init(zv, resize)) != 0) {
2035185029Spjd			(void) zvol_dump_fini(zv);
2036185029Spjd			return (error);
2037185029Spjd		}
2038185029Spjd	}
2039185029Spjd
2040185029Spjd	/*
2041185029Spjd	 * Build up our lba mapping.
2042185029Spjd	 */
2043185029Spjd	error = zvol_get_lbas(zv);
2044185029Spjd	if (error) {
2045185029Spjd		(void) zvol_dump_fini(zv);
2046185029Spjd		return (error);
2047185029Spjd	}
2048185029Spjd
2049185029Spjd	tx = dmu_tx_create(os);
2050185029Spjd	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2051185029Spjd	error = dmu_tx_assign(tx, TXG_WAIT);
2052185029Spjd	if (error) {
2053185029Spjd		dmu_tx_abort(tx);
2054185029Spjd		(void) zvol_dump_fini(zv);
2055185029Spjd		return (error);
2056185029Spjd	}
2057185029Spjd
2058185029Spjd	zv->zv_flags |= ZVOL_DUMPIFIED;
2059185029Spjd	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
2060185029Spjd	    &zv->zv_volsize, tx);
2061185029Spjd	dmu_tx_commit(tx);
2062185029Spjd
2063185029Spjd	if (error) {
2064185029Spjd		(void) zvol_dump_fini(zv);
2065185029Spjd		return (error);
2066185029Spjd	}
2067185029Spjd
2068185029Spjd	txg_wait_synced(dmu_objset_pool(os), 0);
2069185029Spjd	return (0);
2070185029Spjd}
2071185029Spjd
/*
 * Undo zvol_dumpify(): remove the recorded dump size, restore the
 * checksum/compression/refreservation/volblocksize (and dedup) properties
 * saved in the ZAP, free the preallocated blocks, and restore the original
 * block size.
 */
static int
zvol_dump_fini(zvol_state_t *zv)
{
	dmu_tx_t *tx;
	objset_t *os = zv->zv_objset;
	nvlist_t *nv;
	int error = 0;
	uint64_t checksum, compress, refresrv, vbs, dedup;
	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));

	/*
	 * Attempt to restore the zvol back to its pre-dumpified state.
	 * This is a best-effort attempt as it's possible that not all
	 * of these properties were initialized during the dumpify process
	 * (i.e. error during zvol_dump_init).
	 */

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}
	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
	dmu_tx_commit(tx);

	/*
	 * NOTE(review): these lookups are deliberately unchecked (best
	 * effort), but if the VOLBLOCKSIZE entry is missing, 'vbs' is used
	 * uninitialized by dmu_object_set_blocksize() below — confirm the
	 * ZAP entries are guaranteed present here or pre-initialize vbs.
	 */
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);

	/* Push the saved property values back onto the dataset. */
	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
	if (version >= SPA_VERSION_DEDUP &&
	    zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
		(void) nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
	}
	(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
	    nv, NULL);
	nvlist_free(nv);

	zvol_free_extents(zv);
	zv->zv_flags &= ~ZVOL_DUMPIFIED;
	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
	/* wait for dmu_free_long_range to actually free the blocks */
	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}
	/* Restore the saved volblocksize (dumpify forced SPA_MAXBLOCKSIZE). */
	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
		zv->zv_volblocksize = vbs;
	dmu_tx_commit(tx);

	return (0);
}
2143219089Spjd#endif	/* sun */
2144219089Spjd
2145219089Spjdstatic zvol_state_t *
2146219089Spjdzvol_geom_create(const char *name)
2147219089Spjd{
2148219089Spjd	struct g_provider *pp;
2149219089Spjd	struct g_geom *gp;
2150219089Spjd	zvol_state_t *zv;
2151219089Spjd
2152219089Spjd	gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
2153219089Spjd	gp->start = zvol_geom_start;
2154219089Spjd	gp->access = zvol_geom_access;
2155219089Spjd	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
2156219089Spjd	pp->sectorsize = DEV_BSIZE;
2157219089Spjd
2158219089Spjd	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
2159219089Spjd	zv->zv_provider = pp;
2160219089Spjd	zv->zv_state = 0;
2161219089Spjd	bioq_init(&zv->zv_queue);
2162219089Spjd	mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
2163219089Spjd
2164219089Spjd	pp->private = zv;
2165219089Spjd
2166219089Spjd	return (zv);
2167219089Spjd}
2168219089Spjd
2169219089Spjdstatic void
2170219089Spjdzvol_geom_run(zvol_state_t *zv)
2171219089Spjd{
2172219089Spjd	struct g_provider *pp;
2173219089Spjd
2174219089Spjd	pp = zv->zv_provider;
2175219089Spjd	g_error_provider(pp, 0);
2176219089Spjd
2177219089Spjd	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
2178219089Spjd	    "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
2179219089Spjd}
2180219089Spjd
/*
 * Tear down a zvol's GEOM state: stop the worker thread, detach the
 * provider, and free the zvol_state_t.  Called with the GEOM topology
 * lock held.
 */
static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct g_provider *pp;

	g_topology_assert();

	/*
	 * Handshake with zvol_geom_worker(): zv_state 1 asks it to exit,
	 * the worker acknowledges by setting 2 before kthread_exit().
	 */
	mtx_lock(&zv->zv_queue_mtx);
	zv->zv_state = 1;
	wakeup_one(&zv->zv_queue);
	while (zv->zv_state != 2)
		msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
	mtx_destroy(&zv->zv_queue_mtx);

	/* Detach from GEOM and retire the whole geom with its provider. */
	pp = zv->zv_provider;
	zv->zv_provider = NULL;
	pp->private = NULL;
	g_wither_geom(pp->geom, ENXIO);

	kmem_free(zv, sizeof(*zv));
}
2202219089Spjd
2203219089Spjdstatic int
2204219089Spjdzvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
2205219089Spjd{
2206219089Spjd	int count, error, flags;
2207219089Spjd
2208219089Spjd	g_topology_assert();
2209219089Spjd
2210219089Spjd	/*
2211219089Spjd	 * To make it easier we expect either open or close, but not both
2212219089Spjd	 * at the same time.
2213219089Spjd	 */
2214219089Spjd	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
2215219089Spjd	    (acr <= 0 && acw <= 0 && ace <= 0),
2216219089Spjd	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
2217219089Spjd	    pp->name, acr, acw, ace));
2218219089Spjd
2219219089Spjd	if (pp->private == NULL) {
2220219089Spjd		if (acr <= 0 && acw <= 0 && ace <= 0)
2221219089Spjd			return (0);
2222219089Spjd		return (pp->error);
2223219089Spjd	}
2224219089Spjd
2225219089Spjd	/*
2226219089Spjd	 * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0,
2227219089Spjd	 * because GEOM already handles that and handles it a bit differently.
2228219089Spjd	 * GEOM allows for multiple read/exclusive consumers and ZFS allows
2229219089Spjd	 * only one exclusive consumer, no matter if it is reader or writer.
2230219089Spjd	 * I like better the way GEOM works so I'll leave it for GEOM to
2231219089Spjd	 * decide what to do.
2232219089Spjd	 */
2233219089Spjd
2234219089Spjd	count = acr + acw + ace;
2235219089Spjd	if (count == 0)
2236219089Spjd		return (0);
2237219089Spjd
2238219089Spjd	flags = 0;
2239219089Spjd	if (acr != 0 || ace != 0)
2240219089Spjd		flags |= FREAD;
2241219089Spjd	if (acw != 0)
2242219089Spjd		flags |= FWRITE;
2243219089Spjd
2244219089Spjd	g_topology_unlock();
2245219089Spjd	if (count > 0)
2246219089Spjd		error = zvol_open(pp, flags, count);
2247219089Spjd	else
2248219089Spjd		error = zvol_close(pp, flags, -count);
2249219089Spjd	g_topology_lock();
2250219089Spjd	return (error);
2251219089Spjd}
2252219089Spjd
2253219089Spjdstatic void
2254219089Spjdzvol_geom_start(struct bio *bp)
2255219089Spjd{
2256219089Spjd	zvol_state_t *zv;
2257219089Spjd	boolean_t first;
2258219089Spjd
2259219089Spjd	switch (bp->bio_cmd) {
2260219089Spjd	case BIO_READ:
2261219089Spjd	case BIO_WRITE:
2262219089Spjd	case BIO_FLUSH:
2263219089Spjd		zv = bp->bio_to->private;
2264219089Spjd		ASSERT(zv != NULL);
2265219089Spjd		mtx_lock(&zv->zv_queue_mtx);
2266219089Spjd		first = (bioq_first(&zv->zv_queue) == NULL);
2267219089Spjd		bioq_insert_tail(&zv->zv_queue, bp);
2268219089Spjd		mtx_unlock(&zv->zv_queue_mtx);
2269219089Spjd		if (first)
2270219089Spjd			wakeup_one(&zv->zv_queue);
2271219089Spjd		break;
2272219089Spjd	case BIO_GETATTR:
2273219089Spjd	case BIO_DELETE:
2274219089Spjd	default:
2275219089Spjd		g_io_deliver(bp, EOPNOTSUPP);
2276219089Spjd		break;
2277219089Spjd	}
2278219089Spjd}
2279219089Spjd
/*
 * Per-zvol worker thread: drains the bio queue, committing the ZIL for
 * flushes and handing reads/writes to zvol_strategy().  Exits when
 * zvol_geom_destroy() sets zv_state to 1.
 */
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv;
	struct bio *bp;

	/* Run at block-I/O priority. */
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	zv = arg;
	for (;;) {
		mtx_lock(&zv->zv_queue_mtx);
		bp = bioq_takefirst(&zv->zv_queue);
		if (bp == NULL) {
			/* zv_state 1: zvol_geom_destroy() asked us to exit. */
			if (zv->zv_state == 1) {
				zv->zv_state = 2;
				wakeup(&zv->zv_state);
				mtx_unlock(&zv->zv_queue_mtx);
				kthread_exit();
			}
			/* PDROP: msleep releases zv_queue_mtx on return. */
			msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
			    "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zv->zv_queue_mtx);
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
			g_io_deliver(bp, 0);
			break;
		case BIO_READ:
		case BIO_WRITE:
			/* zvol_strategy() delivers the bio itself. */
			zvol_strategy(bp);
			break;
		}
	}
}
2318219089Spjd
2319219089Spjdextern boolean_t dataset_name_hidden(const char *name);
2320219089Spjd
/*
 * Create minor device nodes for every snapshot of the given zvol dataset.
 * 'os' must be held by the caller (zvol_create_minors) for the duration.
 * Returns 0 once the snapshot list is exhausted, or the first error.
 */
static int
zvol_create_snapshots(objset_t *os, const char *name)
{
	uint64_t cookie, obj;
	char *sname;
	int error, len;

	cookie = obj = 0;
	sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

#if 0
	(void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
	    DS_FIND_SNAPSHOTS);
#endif

	for (;;) {
		len = snprintf(sname, MAXPATHLEN, "%s@", name);
		if (len >= MAXPATHLEN) {
			/*
			 * NOTE(review): releasing 'os' here while the caller
			 * also releases the dataset on return looks like a
			 * double release — verify hold ownership on this
			 * path.  Also ENAMETOOLONG is not wrapped in
			 * SET_ERROR() unlike other errors in this file.
			 */
			dmu_objset_rele(os, FTAG);
			error = ENAMETOOLONG;
			break;
		}

		/* Snapshot iteration requires the pool config lock. */
		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
		error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
		    sname + len, &obj, &cookie, NULL);
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		if (error != 0) {
			/* ENOENT marks the end of the snapshot list. */
			if (error == ENOENT)
				error = 0;
			break;
		}

		if ((error = zvol_create_minor(sname)) != 0) {
			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
			    sname, error);
			break;
		}
	}

	kmem_free(sname, MAXPATHLEN);
	return (error);
}
2364219089Spjd
2365219089Spjdint
2366219089Spjdzvol_create_minors(const char *name)
2367219089Spjd{
2368219089Spjd	uint64_t cookie;
2369219089Spjd	objset_t *os;
2370219089Spjd	char *osname, *p;
2371219089Spjd	int error, len;
2372219089Spjd
2373219089Spjd	if (dataset_name_hidden(name))
2374219089Spjd		return (0);
2375219089Spjd
2376219089Spjd	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
2377219089Spjd		printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
2378219089Spjd		    name, error);
2379219089Spjd		return (error);
2380219089Spjd	}
2381219089Spjd	if (dmu_objset_type(os) == DMU_OST_ZVOL) {
2382248571Smm		dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
2383248571Smm		dsl_pool_rele(dmu_objset_pool(os), FTAG);
2384219089Spjd		if ((error = zvol_create_minor(name)) == 0)
2385219089Spjd			error = zvol_create_snapshots(os, name);
2386219089Spjd		else {
2387219089Spjd			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
2388219089Spjd			    name, error);
2389219089Spjd		}
2390248571Smm		dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
2391248571Smm		dsl_dataset_rele(os->os_dsl_dataset, FTAG);
2392219089Spjd		return (error);
2393219089Spjd	}
2394219089Spjd	if (dmu_objset_type(os) != DMU_OST_ZFS) {
2395219089Spjd		dmu_objset_rele(os, FTAG);
2396219089Spjd		return (0);
2397219089Spjd	}
2398219089Spjd
2399219089Spjd	osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2400219089Spjd	if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
2401219089Spjd		dmu_objset_rele(os, FTAG);
2402219089Spjd		kmem_free(osname, MAXPATHLEN);
2403219089Spjd		return (ENOENT);
2404219089Spjd	}
2405219089Spjd	p = osname + strlen(osname);
2406219089Spjd	len = MAXPATHLEN - (p - osname);
2407219089Spjd
2408248571Smm#if 0
2409224855Smm	/* Prefetch the datasets. */
2410224855Smm	cookie = 0;
2411224855Smm	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
2412224855Smm		if (!dataset_name_hidden(osname))
2413224855Smm			(void) dmu_objset_prefetch(osname, NULL);
2414219089Spjd	}
2415248571Smm#endif
2416219089Spjd
2417219089Spjd	cookie = 0;
2418219089Spjd	while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
2419219089Spjd	    &cookie) == 0) {
2420219089Spjd		dmu_objset_rele(os, FTAG);
2421219089Spjd		(void)zvol_create_minors(osname);
2422219089Spjd		if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
2423219089Spjd			printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
2424219089Spjd			    name, error);
2425219089Spjd			return (error);
2426219089Spjd		}
2427219089Spjd	}
2428219089Spjd
2429219089Spjd	dmu_objset_rele(os, FTAG);
2430219089Spjd	kmem_free(osname, MAXPATHLEN);
2431219089Spjd	return (0);
2432219089Spjd}
2433219317Spjd
2434219317Spjdstatic void
2435219317Spjdzvol_rename_minor(struct g_geom *gp, const char *newname)
2436219317Spjd{
2437219317Spjd	struct g_provider *pp;
2438219317Spjd	zvol_state_t *zv;
2439219317Spjd
2440224791Spjd	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2441219317Spjd	g_topology_assert();
2442219317Spjd
2443219317Spjd	pp = LIST_FIRST(&gp->provider);
2444219317Spjd	ASSERT(pp != NULL);
2445219317Spjd	zv = pp->private;
2446219317Spjd	ASSERT(zv != NULL);
2447219317Spjd
2448219317Spjd	zv->zv_provider = NULL;
2449219317Spjd	g_wither_provider(pp, ENXIO);
2450219317Spjd
2451219317Spjd	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
2452219317Spjd	pp->sectorsize = DEV_BSIZE;
2453219317Spjd	pp->mediasize = zv->zv_volsize;
2454219317Spjd	pp->private = zv;
2455219317Spjd	zv->zv_provider = pp;
2456219317Spjd	strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
2457219317Spjd	g_error_provider(pp, 0);
2458219317Spjd}
2459219317Spjd
2460219317Spjdvoid
2461219317Spjdzvol_rename_minors(const char *oldname, const char *newname)
2462219317Spjd{
2463219317Spjd	char name[MAXPATHLEN];
2464219317Spjd	struct g_provider *pp;
2465219317Spjd	struct g_geom *gp;
2466219317Spjd	size_t oldnamelen, newnamelen;
2467219317Spjd	zvol_state_t *zv;
2468219317Spjd	char *namebuf;
2469219317Spjd
2470219317Spjd	oldnamelen = strlen(oldname);
2471219317Spjd	newnamelen = strlen(newname);
2472219317Spjd
2473219317Spjd	DROP_GIANT();
2474224791Spjd	mutex_enter(&spa_namespace_lock);
2475219317Spjd	g_topology_lock();
2476219317Spjd
2477219317Spjd	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
2478219317Spjd		pp = LIST_FIRST(&gp->provider);
2479219317Spjd		if (pp == NULL)
2480219317Spjd			continue;
2481219317Spjd		zv = pp->private;
2482219317Spjd		if (zv == NULL)
2483219317Spjd			continue;
2484219317Spjd		if (strcmp(zv->zv_name, oldname) == 0) {
2485219317Spjd			zvol_rename_minor(gp, newname);
2486219317Spjd		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
2487219317Spjd		    (zv->zv_name[oldnamelen] == '/' ||
2488219317Spjd		     zv->zv_name[oldnamelen] == '@')) {
2489219317Spjd			snprintf(name, sizeof(name), "%s%c%s", newname,
2490219317Spjd			    zv->zv_name[oldnamelen],
2491219317Spjd			    zv->zv_name + oldnamelen + 1);
2492219317Spjd			zvol_rename_minor(gp, name);
2493219317Spjd		}
2494219317Spjd	}
2495219317Spjd
2496219317Spjd	g_topology_unlock();
2497224791Spjd	mutex_exit(&spa_namespace_lock);
2498219317Spjd	PICKUP_GIANT();
2499219317Spjd}
2500