1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26
27#include <sys/debug.h>
28#include <sys/types.h>
29#include <sys/file.h>
30#include <sys/errno.h>
31#include <sys/uio.h>
32#include <sys/open.h>
33#include <sys/cred.h>
34#include <sys/kmem.h>
35#include <sys/conf.h>
36#include <sys/cmn_err.h>
37#include <sys/modctl.h>
38#include <sys/disp.h>
39#include <sys/atomic.h>
40#include <sys/filio.h>
41#include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
42#include <sys/kstat.h>
43
44#include <sys/ddi.h>
45#include <sys/devops.h>
46#include <sys/sunddi.h>
47#include <sys/esunddi.h>
48#include <sys/priv_names.h>
49
50#include <sys/fssnap.h>
51#include <sys/fssnap_if.h>
52
53/*
54 * This module implements the file system snapshot code, which provides a
55 * point-in-time image of a file system for the purposes of online backup.
56 * There are essentially two parts to this project: the driver half and the
57 * file system half.  The driver half is a pseudo device driver called
58 * "fssnap" that represents the snapshot.  Each snapshot is assigned a
59 * number that corresponds to the minor number of the device, and a control
60 * device with a high minor number is used to initiate snapshot creation and
61 * deletion.  For all practical purposes the driver half acts like a
62 * read-only disk device whose contents are exactly the same as the master
63 * file system at the time the snapshot was created.
64 *
65 * The file system half provides interfaces necessary for performing the
66 * file system dependent operations required to create and delete snapshots
67 * and a special driver strategy routine that must always be used by the file
68 * system for snapshots to work correctly.
69 *
70 * When a snapshot is to be created, the user utility will send an ioctl to
71 * the control device of the driver half specifying the file system to be
72 * snapshotted, the file descriptor of a backing-store file which is used to
73 * hold old data before it is overwritten, and other snapshot parameters.
74 * This ioctl is passed on to the file system specified in the original
75 * ioctl request.  The file system is expected to be able to flush
76 * everything out to make the file system consistent and lock it to ensure
77 * no changes occur while the snapshot is being created.  It then calls
78 * fssnap_create() to create state for a new snapshot, from which an opaque
79 * handle is returned with the snapshot locked.  Next, the file system must
80 * populate the "candidate bitmap", which tells the snapshot code which
81 * "chunks" should be considered for copy-on-write (a chunk is the unit of
82 * granularity used for copy-on-write, which is independent of the device
83 * and file system block sizes).  This is typically done by scanning the
84 * file system allocation bitmaps to determine which chunks contain
85 * allocated blocks in the file system at the time the snapshot was created.
86 * If a chunk has no allocated blocks, it does not need to be copied before
87 * being written to.  Once the candidate bitmap is populated with
88 * fssnap_set_candidate(), the file system calls fssnap_create_done() to
89 * complete the snapshot creation and unlock the snapshot.  The file system
90 * may now be unlocked and modifications to it resumed.
91 *
92 * Once a snapshot is created, the file system must perform all writes
93 * through a special strategy routine, fssnap_strategy().  This strategy
94 * routine determines whether the chunks contained by the write must be
95 * copied before being overwritten by consulting the candidate bitmap
96 * described above, and the "hastrans bitmap" which tells it whether the chunk
97 * has been copied already or not.  If the chunk is a candidate but has not
98 * been copied, it reads the old data in and adds it to a queue.  The
 * old data can then be overwritten with the new data.  For each old
 * chunk read in, an asynchronous task is dispatched that writes the old
 * data to the backing file specified at snapshot creation time.  The
102 * backing file is a sparse file the same size as the file system that
103 * contains the old data at the offset that data originally had in the
104 * file system.  If the queue containing in-memory chunks gets too large,
105 * writes to the file system may be throttled by a semaphore until the
106 * task queues have a chance to push some of the chunks to the backing file.
107 *
108 * With the candidate bitmap, the hastrans bitmap, the data on the master
109 * file system, and the old data in memory and in the backing file, the
110 * snapshot pseudo-driver can piece together the original file system
111 * information to satisfy read requests.  If the requested chunk is not a
112 * candidate, it returns a zeroed buffer.  If the chunk is a candidate but
113 * has not been copied it reads it from the master file system.  If it is a
114 * candidate and has been copied, it either copies the data from the
115 * in-memory queue or it reads it in from the backing file.  The result is
116 * a replication of the original file system that can be backed up, mounted,
117 * or manipulated by other file system utilities that work on a read-only
118 * device.
119 *
120 * This module is divided into three roughly logical sections:
121 *
122 *     - The snapshot driver, which is a character/block driver
123 *       representing the snapshot itself.  These routines are
124 *       prefixed with "snap_".
125 *
126 *     - The library routines that are defined in fssnap_if.h that
127 *       are used by file systems that use this snapshot implementation.
128 *       These functions are prefixed with "fssnap_" and are called through
129 *       a function vector from the file system.
130 *
131 *     - The helper routines used by the snapshot driver and the fssnap
132 *       library routines for managing the translation table and other
133 *       useful functions.  These routines are all static and are
134 *       prefixed with either "fssnap_" or "transtbl_" if they
135 *       are specifically used for translation table activities.
136 */
137
/*
 * Driver-wide state.
 *
 * fssnap_dip is set in snap_attach() and cleared in snap_detach().
 * "snapshot" is the head of the snapshot id list (linked through sid_next);
 * snap_detach() walks and frees it.  snap_ctl is the snapshot id used to
 * track opens and closes of the control device; it is not linked into the
 * snapshot list.  snapshot_mutex is held in snap_detach() and snap_close()
 * while num_snapshots is examined or changed.  snapname is used as the
 * module name of the highwater kstat created in _init().
 */
static dev_info_t		*fssnap_dip = NULL;
static struct snapshot_id	*snapshot = NULL;
static struct snapshot_id	snap_ctl;
static int			num_snapshots = 0;
static kmutex_t			snapshot_mutex;
static char			snapname[] = SNAP_NAME;

/* "tunable" parameters; defaults come from the FSSNAP_* constants */
static int		fssnap_taskq_nthreads = FSSNAP_TASKQ_THREADS;
static uint_t		fssnap_max_mem_chunks = FSSNAP_MAX_MEM_CHUNKS;
static int		fssnap_taskq_maxtasks = FSSNAP_TASKQ_MAXTASKS;
149
/* static function prototypes */

/* snapshot driver entry points (wired into snap_cb_ops and snap_ops) */
static int snap_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int snap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
static int snap_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int snap_strategy(struct buf *bp);
static int snap_read(dev_t dev, struct uio *uiop, cred_t *credp);
static int snap_print(dev_t dev, char *str);
static int snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
    cred_t *credp, int *rvalp);
static int snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
    int flags, char *name, caddr_t valuep, int *lengthp);
/* helper for snap_strategy(): fetches one chunk's worth of data */
static int snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk,
    int offset, int len, char *buffer);


/*
 * fssnap interface implementations (see fssnap_if.h).  _init() installs
 * these into the snapops vector and _fini() removes them again.
 */
static void fssnap_strategy_impl(void *, struct buf *);
static void *fssnap_create_impl(chunknumber_t, uint_t, u_offset_t,
    struct vnode *, int, struct vnode **, char *, u_offset_t);
static void fssnap_set_candidate_impl(void *, chunknumber_t);
static int fssnap_is_candidate_impl(void *, u_offset_t);
static int fssnap_create_done_impl(void *);
static int fssnap_delete_impl(void *);

/* fssnap interface support routines */
static int  fssnap_translate(struct snapshot_id **, struct buf *);
static void fssnap_write_taskq(void *);
static void fssnap_create_kstats(snapshot_id_t *, int, const char *,
    const char *);
static int  fssnap_update_kstat_num(kstat_t *, int);
static void fssnap_delete_kstats(struct cow_info *);

/* translation table prototypes */
static cow_map_node_t *transtbl_add(cow_map_t *, chunknumber_t, caddr_t);
static cow_map_node_t *transtbl_get(cow_map_t *, chunknumber_t);
static void transtbl_delete(cow_map_t *, cow_map_node_t *);
static void transtbl_free(cow_map_t *);

/*
 * Highwater kstat: created in _init() and deleted in _fini().  It is NULL
 * if kstat_create() failed at load time.
 */
static kstat_t *fssnap_highwater_kstat;
193
194/* ************************************************************************ */
195
196/* Device and Module Structures */
197
/*
 * Character/block entry points for the snapshot device.  The initializer is
 * positional, so the order of the entries below must not change.  The
 * device is read-only: there is no write, dump or mapping support.
 */
static struct cb_ops snap_cb_ops = {
	snap_open,	/* open(9E) */
	snap_close,	/* close(9E) */
	snap_strategy,	/* strategy(9E) */
	snap_print,	/* print(9E) */
	nodev,		/* no snap_dump */
	snap_read,	/* read(9E) */
	nodev,		/* no snap_write */
	snap_ioctl,	/* ioctl(9E) */
	nodev,		/* no snap_devmap */
	nodev,		/* no snap_mmap   */
	nodev,		/* no snap_segmap */
	nochpoll,	/* no poll support */
	snap_prop_op,	/* prop_op(9E) */
	NULL,		/* streamtab */
	D_64BIT | D_NEW | D_MP, /* driver compatibility */
	CB_REV,
	nodev,		/* async I/O read entry point */
	nodev		/* async I/O write entry point */
};
218
/*
 * Device operations for the fssnap pseudo driver.  The initializer is
 * positional, so the order of the entries below must not change.
 */
static struct dev_ops snap_ops = {
	DEVO_REV,
	0,			/* ref count */
	snap_getinfo,		/* getinfo(9E) */
	nulldev,		/* snap_identify obsolete */
	nulldev,		/* no snap_probe */
	snap_attach,		/* attach(9E) */
	snap_detach,		/* detach(9E) */
	nodev,			/* no snap_reset */
	&snap_cb_ops,		/* character/block entry points */
	(struct bus_ops *)NULL,	/* not a bus nexus */
	nulldev,		/* no snap_power() */
	ddi_quiesce_not_needed,		/* quiesce */
};
233
extern struct mod_ops mod_driverops;

/* Module linkage: describes this module as a driver using snap_ops */
static struct modldrv md = {
	&mod_driverops, /* Type of module. This is a driver */
	"snapshot driver", 	/* Name of the module */
	&snap_ops,
};

/* Passed to mod_install(), mod_info() and mod_remove() */
static struct modlinkage ml = {
	MODREV_1,
	&md,
	NULL
};

/*
 * Soft state anchor, set up in _init() with an element size of one
 * snapshot_id pointer.  Entries are looked up by minor number, and each
 * entry holds a pointer to that snapshot's snapshot_id.
 */
static void *statep;
249
250int
251_init(void)
252{
253	int	error;
254	kstat_t	*ksp;
255	kstat_named_t	*ksdata;
256
257	error = ddi_soft_state_init(&statep, sizeof (struct snapshot_id *), 1);
258	if (error) {
259		cmn_err(CE_WARN, "_init: failed to init ddi_soft_state.");
260		return (error);
261	}
262
263	error = mod_install(&ml);
264
265	if (error) {
266		cmn_err(CE_WARN, "_init: failed to mod_install.");
267		ddi_soft_state_fini(&statep);
268		return (error);
269	}
270
271	/*
272	 * Fill in the snapshot operations vector for file systems
273	 * (defined in fssnap_if.c)
274	 */
275
276	snapops.fssnap_create = fssnap_create_impl;
277	snapops.fssnap_set_candidate = fssnap_set_candidate_impl;
278	snapops.fssnap_is_candidate = fssnap_is_candidate_impl;
279	snapops.fssnap_create_done = fssnap_create_done_impl;
280	snapops.fssnap_delete = fssnap_delete_impl;
281	snapops.fssnap_strategy = fssnap_strategy_impl;
282
283	mutex_init(&snapshot_mutex, NULL, MUTEX_DEFAULT, NULL);
284
285	/*
286	 * Initialize the fssnap highwater kstat
287	 */
288	ksp = kstat_create(snapname, 0, FSSNAP_KSTAT_HIGHWATER, "misc",
289	    KSTAT_TYPE_NAMED, 1, 0);
290	if (ksp != NULL) {
291		ksdata = (kstat_named_t *)ksp->ks_data;
292		kstat_named_init(ksdata, FSSNAP_KSTAT_HIGHWATER,
293		    KSTAT_DATA_UINT32);
294		ksdata->value.ui32 = 0;
295		kstat_install(ksp);
296	} else {
297		cmn_err(CE_WARN, "_init: failed to create highwater kstat.");
298	}
299	fssnap_highwater_kstat = ksp;
300
301	return (0);
302}
303
304int
305_info(struct modinfo *modinfop)
306{
307	return (mod_info(&ml, modinfop));
308}
309
310int
311_fini(void)
312{
313	int	error;
314
315	error = mod_remove(&ml);
316	if (error)
317		return (error);
318	ddi_soft_state_fini(&statep);
319
320	/*
321	 * delete the fssnap highwater kstat
322	 */
323	kstat_delete(fssnap_highwater_kstat);
324
325	mutex_destroy(&snapshot_mutex);
326
327	/* Clear out the file system operations vector */
328	snapops.fssnap_create = NULL;
329	snapops.fssnap_set_candidate = NULL;
330	snapops.fssnap_create_done = NULL;
331	snapops.fssnap_delete = NULL;
332	snapops.fssnap_strategy = NULL;
333
334	return (0);
335}
336
337/* ************************************************************************ */
338
339/*
340 * Snapshot Driver Routines
341 *
342 * This section implements the snapshot character and block drivers.  The
343 * device will appear to be a consistent read-only file system to
344 * applications that wish to back it up or mount it.  The snapshot driver
345 * communicates with the file system through the translation table, which
346 * tells the snapshot driver where to find the data necessary to piece
347 * together the frozen file system.  The data may either be on the master
348 * device (no translation exists), in memory (a translation exists but has
349 * not been flushed to the backing store), or in the backing store file.
350 * The read request may require the snapshot driver to retrieve data from
351 * several different places and piece it together to look like a single
352 * contiguous read.
353 *
354 * The device minor number corresponds to the snapshot number in the list of
355 * snapshot identifiers.  The soft state for each minor number is simply a
356 * pointer to the snapshot id, which holds all of the snapshot state.  One
357 * minor number is designated as the control device.  All snapshot create
358 * and delete requests go through the control device to ensure this module
359 * is properly loaded and attached before the file system starts calling
360 * routines defined here.
361 */
362
363
364/*
365 * snap_getinfo() - snapshot driver getinfo(9E) routine
366 *
367 */
368/*ARGSUSED*/
369static int
370snap_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
371{
372	switch (infocmd) {
373	case DDI_INFO_DEVT2DEVINFO:
374		*result = fssnap_dip;
375		return (DDI_SUCCESS);
376	case DDI_INFO_DEVT2INSTANCE:
377		*result = 0;	/* we only have one instance */
378		return (DDI_SUCCESS);
379	}
380	return (DDI_FAILURE);
381}
382
383/*
384 * snap_attach() - snapshot driver attach(9E) routine
385 *
386 *    sets up snapshot control device and control state.  The control state
387 *    is a pointer to an "anonymous" snapshot_id for tracking opens and closes
388 */
389static int
390snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
391{
392	int			error;
393
394	switch (cmd) {
395	case DDI_ATTACH:
396		/* create the control device */
397		error = ddi_create_priv_minor_node(dip, SNAP_CTL_NODE, S_IFCHR,
398		    SNAP_CTL_MINOR, DDI_PSEUDO, PRIVONLY_DEV,
399		    PRIV_SYS_CONFIG, PRIV_SYS_CONFIG, 0666);
400		if (error == DDI_FAILURE) {
401			return (DDI_FAILURE);
402		}
403
404		rw_init(&snap_ctl.sid_rwlock, NULL, RW_DEFAULT, NULL);
405		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
406		fssnap_dip = dip;
407		snap_ctl.sid_snapnumber = SNAP_CTL_MINOR;
408		/* the control sid is not linked into the snapshot list */
409		snap_ctl.sid_next = NULL;
410		snap_ctl.sid_cowinfo = NULL;
411		snap_ctl.sid_flags = 0;
412		rw_exit(&snap_ctl.sid_rwlock);
413		ddi_report_dev(dip);
414
415		return (DDI_SUCCESS);
416	case DDI_PM_RESUME:
417		return (DDI_SUCCESS);
418
419	case DDI_RESUME:
420		return (DDI_SUCCESS);
421
422	default:
423		return (DDI_FAILURE);
424	}
425}
426
427/*
428 * snap_detach() - snapshot driver detach(9E) routine
429 *
430 *    destroys snapshot control device and control state.  If any snapshots
 *    are active (i.e. num_snapshots != 0), the device will refuse to detach.
432 */
433static int
434snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
435{
436	struct snapshot_id *sidp, *sidnextp;
437
438	switch (cmd) {
439	case DDI_DETACH:
440		/* do not detach if the device is active */
441		mutex_enter(&snapshot_mutex);
442		if ((num_snapshots != 0) ||
443		    ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0)) {
444			mutex_exit(&snapshot_mutex);
445			return (DDI_FAILURE);
446		}
447
448		/* free up the snapshot list */
449		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
450			ASSERT(SID_AVAILABLE(sidp) &&
451			    !RW_LOCK_HELD(&sidp->sid_rwlock));
452			sidnextp = sidp->sid_next;
453			rw_destroy(&sidp->sid_rwlock);
454			kmem_free(sidp, sizeof (struct snapshot_id));
455		}
456		snapshot = NULL;
457
458		/* delete the control device */
459		ddi_remove_minor_node(dip, SNAP_CTL_NODE);
460		fssnap_dip = NULL;
461
462		ASSERT((snap_ctl.sid_flags & SID_CHAR_BUSY) == 0);
463		rw_destroy(&snap_ctl.sid_rwlock);
464		mutex_exit(&snapshot_mutex);
465
466		return (DDI_SUCCESS);
467
468	default:
469		return (DDI_FAILURE);
470	}
471}
472
473/*
474 * snap_open() - snapshot driver open(9E) routine
475 *
476 *     marks the snapshot id as busy so it will not be recycled when deleted
477 *     until the snapshot is closed.
478 */
479/* ARGSUSED */
480static int
481snap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
482{
483	minor_t	minor;
484	struct snapshot_id **sidpp, *sidp;
485
486	/* snapshots are read-only */
487	if (flag & FWRITE)
488		return (EROFS);
489
490	minor = getminor(*devp);
491
492	if (minor == SNAP_CTL_MINOR) {
493		/* control device must be opened exclusively */
494		if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR))
495			return (EINVAL);
496
497		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
498		if ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0) {
499			rw_exit(&snap_ctl.sid_rwlock);
500			return (EBUSY);
501		}
502
503		snap_ctl.sid_flags |= SID_CHAR_BUSY;
504		rw_exit(&snap_ctl.sid_rwlock);
505
506		return (0);
507	}
508
509	sidpp = ddi_get_soft_state(statep, minor);
510	if (sidpp == NULL || *sidpp == NULL)
511		return (ENXIO);
512	sidp = *sidpp;
513	rw_enter(&sidp->sid_rwlock, RW_WRITER);
514
515	if ((flag & FEXCL) && SID_BUSY(sidp)) {
516		rw_exit(&sidp->sid_rwlock);
517		return (EAGAIN);
518	}
519
520	ASSERT(sidpp != NULL && sidp != NULL);
521	/* check to see if this snapshot has been killed on us */
522	if (SID_INACTIVE(sidp)) {
523		cmn_err(CE_WARN, "snap_open: snapshot %d does not exist.",
524		    minor);
525		rw_exit(&sidp->sid_rwlock);
526		return (ENXIO);
527	}
528
529	switch (otyp) {
530	case OTYP_CHR:
531		sidp->sid_flags |= SID_CHAR_BUSY;
532		break;
533	case OTYP_BLK:
534		sidp->sid_flags |= SID_BLOCK_BUSY;
535		break;
536	default:
537		rw_exit(&sidp->sid_rwlock);
538		return (EINVAL);
539	}
540
541	rw_exit(&sidp->sid_rwlock);
542
543	/*
544	 * at this point if a valid snapshot was found then it has
545	 * been marked busy and we can use it.
546	 */
547	return (0);
548}
549
550/*
551 * snap_close() - snapshot driver close(9E) routine
552 *
553 *    unsets the busy bits in the snapshot id.  If the snapshot has been
554 *    deleted while the snapshot device was open, the close call will clean
555 *    up the remaining state information.
556 */
557/* ARGSUSED */
558static int
559snap_close(dev_t dev, int flag, int otyp, cred_t *cred)
560{
561	struct snapshot_id	**sidpp, *sidp;
562	minor_t			minor;
563	char			name[20];
564
565	minor = getminor(dev);
566
567	/* if this is the control device, close it and return */
568	if (minor == SNAP_CTL_MINOR) {
569		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
570		snap_ctl.sid_flags &= ~(SID_CHAR_BUSY);
571		rw_exit(&snap_ctl.sid_rwlock);
572		return (0);
573	}
574
575	sidpp = ddi_get_soft_state(statep, minor);
576	if (sidpp == NULL || *sidpp == NULL) {
577		cmn_err(CE_WARN, "snap_close: could not find state for "
578		    "snapshot %d.", minor);
579		return (ENXIO);
580	}
581	sidp = *sidpp;
582	mutex_enter(&snapshot_mutex);
583	rw_enter(&sidp->sid_rwlock, RW_WRITER);
584
585	/* Mark the snapshot as not being busy anymore */
586	switch (otyp) {
587	case OTYP_CHR:
588		sidp->sid_flags &= ~(SID_CHAR_BUSY);
589		break;
590	case OTYP_BLK:
591		sidp->sid_flags &= ~(SID_BLOCK_BUSY);
592		break;
593	default:
594		mutex_exit(&snapshot_mutex);
595		rw_exit(&sidp->sid_rwlock);
596		return (EINVAL);
597	}
598
599	if (SID_AVAILABLE(sidp)) {
600		/*
601		 * if this is the last close on a snapshot that has been
602		 * deleted, then free up the soft state.  The snapdelete
603		 * ioctl does not free this when the device is in use so
604		 * we do it here after the last reference goes away.
605		 */
606
607		/* remove the device nodes */
608		ASSERT(fssnap_dip != NULL);
609		(void) snprintf(name, sizeof (name), "%d",
610		    sidp->sid_snapnumber);
611		ddi_remove_minor_node(fssnap_dip, name);
612		(void) snprintf(name, sizeof (name), "%d,raw",
613		    sidp->sid_snapnumber);
614		ddi_remove_minor_node(fssnap_dip, name);
615
616		/* delete the state structure */
617		ddi_soft_state_free(statep, sidp->sid_snapnumber);
618		num_snapshots--;
619	}
620
621	mutex_exit(&snapshot_mutex);
622	rw_exit(&sidp->sid_rwlock);
623
624	return (0);
625}
626
627/*
628 * snap_read() - snapshot driver read(9E) routine
629 *
630 *    reads data from the snapshot by calling snap_strategy() through physio()
631 */
632/* ARGSUSED */
633static int
634snap_read(dev_t dev, struct uio *uiop, cred_t *credp)
635{
636	minor_t		minor;
637	struct snapshot_id **sidpp;
638
639	minor = getminor(dev);
640	sidpp = ddi_get_soft_state(statep, minor);
641	if (sidpp == NULL || *sidpp == NULL) {
642		cmn_err(CE_WARN,
643		    "snap_read: could not find state for snapshot %d.", minor);
644		return (ENXIO);
645	}
646	return (physio(snap_strategy, NULL, dev, B_READ, minphys, uiop));
647}
648
649/*
650 * snap_strategy() - snapshot driver strategy(9E) routine
651 *
652 *    cycles through each chunk in the requested buffer and calls
653 *    snap_getchunk() on each chunk to retrieve it from the appropriate
654 *    place.  Once all of the parts are put together the requested buffer
655 *    is returned.  The snapshot driver is read-only, so a write is invalid.
656 */
657static int
658snap_strategy(struct buf *bp)
659{
660	struct snapshot_id **sidpp, *sidp;
661	minor_t		minor;
662	chunknumber_t	chunk;
663	int		off, len;
664	u_longlong_t	reqptr;
665	int		error = 0;
666	size_t		chunksz;
667	caddr_t		buf;
668
669	/* snapshot device is read-only */
670	if (bp->b_flags & B_WRITE) {
671		bioerror(bp, EROFS);
672		bp->b_resid = bp->b_bcount;
673		biodone(bp);
674		return (0);
675	}
676
677	minor = getminor(bp->b_edev);
678	sidpp = ddi_get_soft_state(statep, minor);
679	if (sidpp == NULL || *sidpp == NULL) {
680		cmn_err(CE_WARN,
681		    "snap_strategy: could not find state for snapshot %d.",
682		    minor);
683		bioerror(bp, ENXIO);
684		bp->b_resid = bp->b_bcount;
685		biodone(bp);
686		return (0);
687	}
688	sidp = *sidpp;
689	ASSERT(sidp);
690	rw_enter(&sidp->sid_rwlock, RW_READER);
691
692	if (SID_INACTIVE(sidp)) {
693		bioerror(bp, ENXIO);
694		bp->b_resid = bp->b_bcount;
695		biodone(bp);
696		rw_exit(&sidp->sid_rwlock);
697		return (0);
698	}
699
700	if (bp->b_flags & (B_PAGEIO|B_PHYS))
701		bp_mapin(bp);
702
703	bp->b_resid = bp->b_bcount;
704	ASSERT(bp->b_un.b_addr);
705	buf = bp->b_un.b_addr;
706
707	chunksz = sidp->sid_cowinfo->cow_map.cmap_chunksz;
708
709	/* reqptr is the current DEV_BSIZE offset into the device */
710	/* chunk is the chunk containing reqptr */
711	/* len is the length of the request (in the current chunk) in bytes */
712	/* off is the byte offset into the current chunk */
713	reqptr = bp->b_lblkno;
714	while (bp->b_resid > 0) {
715		chunk = dbtocowchunk(&sidp->sid_cowinfo->cow_map, reqptr);
716		off = (reqptr % (chunksz >> DEV_BSHIFT)) << DEV_BSHIFT;
717		len = min(chunksz - off, bp->b_resid);
718		ASSERT((off + len) <= chunksz);
719
720		if ((error = snap_getchunk(sidp, chunk, off, len, buf)) != 0) {
721			/*
722			 * EINVAL means the user tried to go out of range.
723			 * Anything else means it's likely that we're
724			 * confused.
725			 */
726			if (error != EINVAL) {
727				cmn_err(CE_WARN, "snap_strategy: error "
728				    "calling snap_getchunk, chunk = %llu, "
729				    "offset = %d, len = %d, resid = %lu, "
730				    "error = %d.",
731				    chunk, off, len, bp->b_resid, error);
732			}
733			bioerror(bp, error);
734			biodone(bp);
735			rw_exit(&sidp->sid_rwlock);
736			return (0);
737		}
738		bp->b_resid -= len;
739		reqptr += (len >> DEV_BSHIFT);
740		buf += len;
741	}
742
743	ASSERT(bp->b_resid == 0);
744	biodone(bp);
745
746	rw_exit(&sidp->sid_rwlock);
747	return (0);
748}
749
750/*
751 * snap_getchunk() - helper function for snap_strategy()
752 *
753 *    gets the requested data from the appropriate place and fills in the
754 *    buffer.  chunk is the chunk number of the request, offset is the
755 *    offset into that chunk and must be less than the chunk size.  len is
756 *    the length of the request starting at offset, and must not exceed a
757 *    chunk boundary.  buffer is the address to copy the data to.  len
758 *    bytes are copied into the buffer starting at the location specified.
759 *
760 *    A chunk is located according to the following algorithm:
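 *        - If the chunk is not a candidate for translation, it was not
 *          allocated when the snapshot was taken, so a zeroed buffer is
 *          returned.
 *        - If the chunk is a candidate but does not have a translation,
 *          it is read straight from the master device.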
763 *        - If the chunk does have a translation, then it is either on
764 *          disk or in memory:
765 *            o If it is in memory the requested data is simply copied out
766 *              of the in-memory buffer.
767 *            o If it is in the backing store, it is read from there.
768 *
769 *    This function does the real work of the snapshot driver.
770 */
771static int
772snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk, int offset,
773    int len, char *buffer)
774{
775	cow_map_t	*cmap = &sidp->sid_cowinfo->cow_map;
776	cow_map_node_t	*cmn;
777	struct buf	*snapbuf;
778	int		error = 0;
779	char		*newbuffer;
780	int		newlen = 0;
781	int		partial = 0;
782
783	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
784	ASSERT(offset + len <= cmap->cmap_chunksz);
785
786	/*
787	 * Check if the chunk number is out of range and if so bail out
788	 */
789	if (chunk >= (cmap->cmap_bmsize * NBBY)) {
790		return (EINVAL);
791	}
792
793	/*
794	 * If the chunk is not a candidate for translation, then the chunk
795	 * was not allocated when the snapshot was taken.  Since it does
796	 * not contain data associated with this snapshot, just return a
797	 * zero buffer instead.
798	 */
799	if (isclr(cmap->cmap_candidate, chunk)) {
800		bzero(buffer, len);
801		return (0);
802	}
803
804	/*
805	 * if the chunk is a candidate for translation but a
806	 * translation does not exist, then read through to the
807	 * original file system.  The rwlock is held until the read
808	 * completes if it hasn't been translated to make sure the
809	 * file system does not translate the block before we
810	 * access it. If it has already been translated we don't
811	 * need the lock, because the translation will never go away.
812	 */
813	rw_enter(&cmap->cmap_rwlock, RW_READER);
814	if (isclr(cmap->cmap_hastrans, chunk)) {
815		snapbuf = getrbuf(KM_SLEEP);
816		/*
817		 * Reading into the buffer saves having to do a copy,
818		 * but gets tricky if the request size is not a
819		 * multiple of DEV_BSIZE.  However, we are filling the
820		 * buffer left to right, so future reads will write
821		 * over any extra data we might have read.
822		 */
823
824		partial = len % DEV_BSIZE;
825
826		snapbuf->b_bcount = len;
827		snapbuf->b_lblkno = lbtodb(chunk * cmap->cmap_chunksz + offset);
828		snapbuf->b_un.b_addr = buffer;
829
830		snapbuf->b_iodone = NULL;
831		snapbuf->b_proc = NULL;		/* i.e. the kernel */
832		snapbuf->b_flags = B_READ | B_BUSY;
833		snapbuf->b_edev = sidp->sid_fvp->v_vfsp->vfs_dev;
834
835		if (partial) {
836			/*
837			 * Partial block read in progress.
838			 * This is bad as modules further down the line
839			 * assume buf's are exact multiples of DEV_BSIZE
840			 * and we end up with fewer, or zero, bytes read.
841			 * To get round this we need to round up to the
842			 * nearest full block read and then return only
843			 * len bytes.
844			 */
845			newlen = (len - partial) + DEV_BSIZE;
846			newbuffer = kmem_alloc(newlen, KM_SLEEP);
847
848			snapbuf->b_bcount = newlen;
849			snapbuf->b_un.b_addr = newbuffer;
850		}
851
852		(void) bdev_strategy(snapbuf);
853		(void) biowait(snapbuf);
854
855		error = geterror(snapbuf);
856
857		if (partial) {
858			/*
859			 * Partial block read. Now we need to bcopy the
860			 * correct number of bytes back into the
861			 * supplied buffer, and tidy up our temp
862			 * buffer.
863			 */
864			bcopy(newbuffer, buffer, len);
865			kmem_free(newbuffer, newlen);
866		}
867
868		freerbuf(snapbuf);
869		rw_exit(&cmap->cmap_rwlock);
870
871		return (error);
872	}
873
874	/*
875	 * finally, if the chunk is a candidate for translation and it
876	 * has been translated, then we clone the chunk of the buffer
877	 * that was copied aside by the file system.
878	 * The cmap_rwlock does not need to be held after we know the
879	 * data has already been copied. Once a chunk has been copied
880	 * to the backing file, it is stable read only data.
881	 */
882	cmn = transtbl_get(cmap, chunk);
883
884	/* check whether the data is in memory or in the backing file */
885	if (cmn != NULL) {
886		ASSERT(cmn->cmn_buf);
887		/* already in memory */
888		bcopy(cmn->cmn_buf + offset, buffer, len);
889		rw_exit(&cmap->cmap_rwlock);
890	} else {
891		ssize_t resid = len;
892		int	bf_index;
893		/*
894		 * can cause deadlock with writer if we don't drop the
895		 * cmap_rwlock before trying to get the backing store file
896		 * vnode rwlock.
897		 */
898		rw_exit(&cmap->cmap_rwlock);
899
900		bf_index = chunk / cmap->cmap_chunksperbf;
901
902		/* read buffer from backing file */
903		error = vn_rdwr(UIO_READ,
904		    (sidp->sid_cowinfo->cow_backfile_array)[bf_index],
905		    buffer, len, ((chunk % cmap->cmap_chunksperbf) *
906		    cmap->cmap_chunksz) + offset, UIO_SYSSPACE, 0,
907		    RLIM64_INFINITY, kcred, &resid);
908	}
909
910	return (error);
911}
912
913/*
914 * snap_print() - snapshot driver print(9E) routine
915 *
916 *    prints the device identification string.
917 */
918static int
919snap_print(dev_t dev, char *str)
920{
921	struct snapshot_id **sidpp;
922	minor_t		minor;
923
924	minor = getminor(dev);
925	sidpp = ddi_get_soft_state(statep, minor);
926	if (sidpp == NULL || *sidpp == NULL) {
927		cmn_err(CE_WARN,
928		    "snap_print: could not find state for snapshot %d.", minor);
929		return (ENXIO);
930	}
931
932	cmn_err(CE_NOTE, "snap_print: snapshot %d: %s",  minor, str);
933
934	return (0);
935}
936
937/*
938 * snap_prop_op() - snapshot driver prop_op(9E) routine
939 *
940 *    get 32-bit and 64-bit values for size (character driver) and nblocks
941 *    (block driver).
942 */
943static int
944snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
945    int flags, char *name, caddr_t valuep, int *lengthp)
946{
947	int		minor;
948	struct snapshot_id **sidpp;
949	dev_t		mdev;
950	dev_info_t	*mdip;
951	int		error;
952
953	minor = getminor(dev);
954
955	/*
956	 * If this is the control device just check for .conf properties,
957	 * if the wildcard DDI_DEV_T_ANY was passed in via the dev_t
958	 * just fall back to the defaults.
959	 */
960	if ((minor == SNAP_CTL_MINOR) || (dev == DDI_DEV_T_ANY))
961		return (ddi_prop_op(dev, dip, prop_op, flags, name,
962		    valuep, lengthp));
963
964	/* check to see if there is a master device plumbed */
965	sidpp = ddi_get_soft_state(statep, minor);
966	if (sidpp == NULL || *sidpp == NULL) {
967		cmn_err(CE_WARN,
968		    "snap_prop_op: could not find state for "
969		    "snapshot %d.", minor);
970		return (DDI_PROP_NOT_FOUND);
971	}
972
973	if (((*sidpp)->sid_fvp == NULL) || ((*sidpp)->sid_fvp->v_vfsp == NULL))
974		return (ddi_prop_op(dev, dip, prop_op, flags, name,
975		    valuep, lengthp));
976
977	/* hold master device and pass operation down */
978	mdev = (*sidpp)->sid_fvp->v_vfsp->vfs_dev;
979	if (mdip = e_ddi_hold_devi_by_dev(mdev, 0)) {
980
981		/* get size information from the master device. */
982		error = cdev_prop_op(mdev, mdip,
983		    prop_op, flags, name, valuep, lengthp);
984		ddi_release_devi(mdip);
985		if (error == DDI_PROP_SUCCESS)
986			return (error);
987	}
988
989	/* master device did not service the request, try framework */
990	return (ddi_prop_op(dev, dip, prop_op, flags, name, valuep, lengthp));
991
992}
993
994/*
995 * snap_ioctl() - snapshot driver ioctl(9E) routine
996 *
997 *    only applies to the control device.  The control device accepts two
998 *    ioctl requests: create a snapshot or delete a snapshot.  In either
999 *    case, the vnode for the requested file system is extracted, and the
1000 *    request is passed on to the file system via the same ioctl.  The file
1001 *    system is responsible for doing the things necessary for creating or
1002 *    destroying a snapshot, including any file system specific operations
1003 *    that must be performed as well as setting up and deleting the snapshot
1004 *    state through the fssnap interfaces.
1005 */
1006static int
1007snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1008int *rvalp)
1009{
1010	minor_t	minor;
1011	int error = 0;
1012
1013	minor = getminor(dev);
1014
1015	if (minor != SNAP_CTL_MINOR) {
1016		return (EINVAL);
1017	}
1018
1019	switch (cmd) {
1020	case _FIOSNAPSHOTCREATE:
1021	{
1022		struct fiosnapcreate	fc;
1023		struct file		*fp;
1024		struct vnode		*vp;
1025
1026		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1027			return (EFAULT);
1028
1029		/* get vnode for file system mount point */
1030		if ((fp = getf(fc.rootfiledesc)) == NULL)
1031			return (EBADF);
1032
1033		ASSERT(fp->f_vnode);
1034		vp = fp->f_vnode;
1035		VN_HOLD(vp);
1036		releasef(fc.rootfiledesc);
1037
1038		/* pass ioctl request to file system */
1039		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
1040		VN_RELE(vp);
1041		break;
1042	}
1043	case _FIOSNAPSHOTCREATE_MULTI:
1044	{
1045		struct fiosnapcreate_multi	fc;
1046		struct file		*fp;
1047		struct vnode		*vp;
1048
1049		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1050			return (EFAULT);
1051
1052		/* get vnode for file system mount point */
1053		if ((fp = getf(fc.rootfiledesc)) == NULL)
1054			return (EBADF);
1055
1056		ASSERT(fp->f_vnode);
1057		vp = fp->f_vnode;
1058		VN_HOLD(vp);
1059		releasef(fc.rootfiledesc);
1060
1061		/* pass ioctl request to file system */
1062		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
1063		VN_RELE(vp);
1064		break;
1065	}
1066	case _FIOSNAPSHOTDELETE:
1067	{
1068		major_t			major;
1069		struct fiosnapdelete	fc;
1070		snapshot_id_t		*sidp = NULL;
1071		snapshot_id_t		*sidnextp = NULL;
1072		struct file		*fp = NULL;
1073		struct vnode		*vp = NULL;
1074		struct vfs 		*vfsp = NULL;
1075		vfsops_t		*vfsops = EIO_vfsops;
1076
1077		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1078			return (EFAULT);
1079
1080		/* get vnode for file system mount point */
1081		if ((fp = getf(fc.rootfiledesc)) == NULL)
1082			return (EBADF);
1083
1084		ASSERT(fp->f_vnode);
1085		vp = fp->f_vnode;
1086		VN_HOLD(vp);
1087		releasef(fc.rootfiledesc);
1088		/*
1089		 * Test for two formats of delete and set correct minor/vp:
1090		 * pseudo device:
1091		 * fssnap -d [/dev/fssnap/x]
1092		 * or
1093		 * mount point:
1094		 * fssnap -d [/mntpt]
1095		 * Note that minor is verified to be equal to SNAP_CTL_MINOR
1096		 * at this point which is an invalid minor number.
1097		 */
1098		ASSERT(fssnap_dip != NULL);
1099		major = ddi_driver_major(fssnap_dip);
1100		mutex_enter(&snapshot_mutex);
1101		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
1102			rw_enter(&sidp->sid_rwlock, RW_READER);
1103			sidnextp = sidp->sid_next;
1104			/* pseudo device: */
1105			if (major == getmajor(vp->v_rdev)) {
1106				minor = getminor(vp->v_rdev);
1107				if (sidp->sid_snapnumber == (uint_t)minor &&
1108				    sidp->sid_fvp) {
1109					VN_RELE(vp);
1110					vp = sidp->sid_fvp;
1111					VN_HOLD(vp);
1112					rw_exit(&sidp->sid_rwlock);
1113					break;
1114				}
1115			/* Mount point: */
1116			} else {
1117				if (sidp->sid_fvp == vp) {
1118					minor = sidp->sid_snapnumber;
1119					rw_exit(&sidp->sid_rwlock);
1120					break;
1121				}
1122			}
1123			rw_exit(&sidp->sid_rwlock);
1124		}
1125		mutex_exit(&snapshot_mutex);
1126		/* Verify minor got set correctly above */
1127		if (minor == SNAP_CTL_MINOR) {
1128			VN_RELE(vp);
1129			return (EINVAL);
1130		}
1131		dev = makedevice(major, minor);
1132		/*
1133		 * Create dummy vfs entry
1134		 * to use as a locking semaphore across the IOCTL
1135		 * for mount in progress cases...
1136		 */
1137		vfsp = vfs_alloc(KM_SLEEP);
1138		VFS_INIT(vfsp, vfsops, NULL);
1139		VFS_HOLD(vfsp);
1140		vfs_addmip(dev, vfsp);
1141		if ((vfs_devmounting(dev, vfsp)) ||
1142		    (vfs_devismounted(dev))) {
1143			vfs_delmip(vfsp);
1144			VFS_RELE(vfsp);
1145			VN_RELE(vp);
1146			return (EBUSY);
1147		}
1148		/*
1149		 * Nobody mounted but do not release mount in progress lock
1150		 * until IOCTL complete to prohibit a mount sneaking
1151		 * in
1152		 */
1153		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
1154		vfs_delmip(vfsp);
1155		VFS_RELE(vfsp);
1156		VN_RELE(vp);
1157		break;
1158	}
1159	default:
1160		cmn_err(CE_WARN, "snap_ioctl: Invalid ioctl cmd %d, minor %d.",
1161		    cmd, minor);
1162		return (EINVAL);
1163	}
1164
1165	return (error);
1166}
1167
1168
1169/* ************************************************************************ */
1170
1171/*
1172 * Translation Table Routines
1173 *
1174 *    These support routines implement a simple doubly linked list
1175 *    to keep track of chunks that are currently in memory.  The maximum
1176 *    size of the list is determined by the fssnap_max_mem_chunks variable.
1177 *    The cmap_rwlock is used to protect the linkage of the list.
1178 */
1179
1180/*
1181 * transtbl_add() - add a node to the translation table
1182 *
1183 *    allocates a new node and points it at the buffer passed in.  The node
1184 *    is added to the beginning of the doubly linked list and the head of
1185 *    the list is moved.  The cmap_rwlock must be held as a writer through
1186 *    this operation.
1187 */
1188static cow_map_node_t *
1189transtbl_add(cow_map_t *cmap, chunknumber_t chunk, caddr_t buf)
1190{
1191	cow_map_node_t	*cmnode;
1192
1193	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1194
1195	cmnode = kmem_alloc(sizeof (cow_map_node_t), KM_SLEEP);
1196
1197	/*
1198	 * insert new translations at the beginning so cmn_table is always
1199	 * the first node.
1200	 */
1201	cmnode->cmn_chunk = chunk;
1202	cmnode->cmn_buf = buf;
1203	cmnode->cmn_prev = NULL;
1204	cmnode->cmn_next = cmap->cmap_table;
1205	if (cmnode->cmn_next)
1206		cmnode->cmn_next->cmn_prev = cmnode;
1207	cmap->cmap_table = cmnode;
1208
1209	return (cmnode);
1210}
1211
1212/*
1213 * transtbl_get() - look up a node in the translation table
1214 *
1215 *    called by the snapshot driver to find data that has been translated.
1216 *    The lookup is done by the chunk number, and the node is returned.
1217 *    If the node was not found, NULL is returned.
1218 */
1219static cow_map_node_t *
1220transtbl_get(cow_map_t *cmap, chunknumber_t chunk)
1221{
1222	cow_map_node_t *cmn;
1223
1224	ASSERT(RW_READ_HELD(&cmap->cmap_rwlock));
1225	ASSERT(cmap);
1226
1227	/* search the translation table */
1228	for (cmn = cmap->cmap_table; cmn != NULL; cmn = cmn->cmn_next) {
1229		if (cmn->cmn_chunk == chunk)
1230			return (cmn);
1231	}
1232
1233	/* not found */
1234	return (NULL);
1235}
1236
1237/*
1238 * transtbl_delete() - delete a node from the translation table
1239 *
1240 *    called when a node's data has been written out to disk.  The
1241 *    cmap_rwlock must be held as a writer for this operation.  If the node
1242 *    being deleted is the head of the list, then the head is moved to the
1243 *    next node.  Both the node's data and the node itself are freed.
1244 */
1245static void
1246transtbl_delete(cow_map_t *cmap, cow_map_node_t *cmn)
1247{
1248	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1249	ASSERT(cmn);
1250	ASSERT(cmap->cmap_table);
1251
1252	/* if the head of the list is being deleted, then move the head up */
1253	if (cmap->cmap_table == cmn) {
1254		ASSERT(cmn->cmn_prev == NULL);
1255		cmap->cmap_table = cmn->cmn_next;
1256	}
1257
1258
1259	/* make previous node's next pointer skip over current node */
1260	if (cmn->cmn_prev != NULL) {
1261		ASSERT(cmn->cmn_prev->cmn_next == cmn);
1262		cmn->cmn_prev->cmn_next = cmn->cmn_next;
1263	}
1264
1265	/* make next node's previous pointer skip over current node */
1266	if (cmn->cmn_next != NULL) {
1267		ASSERT(cmn->cmn_next->cmn_prev == cmn);
1268		cmn->cmn_next->cmn_prev = cmn->cmn_prev;
1269	}
1270
1271	/* free the data and the node */
1272	ASSERT(cmn->cmn_buf);
1273	kmem_free(cmn->cmn_buf, cmap->cmap_chunksz);
1274	kmem_free(cmn, sizeof (cow_map_node_t));
1275}
1276
1277/*
1278 * transtbl_free() - free the entire translation table
1279 *
1280 *    called when the snapshot is deleted.  This frees all of the nodes in
1281 *    the translation table (but not the bitmaps).
1282 */
1283static void
1284transtbl_free(cow_map_t *cmap)
1285{
1286	cow_map_node_t	*curnode;
1287	cow_map_node_t	*tempnode;
1288
1289	for (curnode = cmap->cmap_table; curnode != NULL; curnode = tempnode) {
1290		tempnode = curnode->cmn_next;
1291
1292		kmem_free(curnode->cmn_buf, cmap->cmap_chunksz);
1293		kmem_free(curnode, sizeof (cow_map_node_t));
1294	}
1295}
1296
1297
1298/* ************************************************************************ */
1299
1300/*
1301 * Interface Implementation Routines
1302 *
1303 * The following functions implement snapshot interface routines that are
1304 * called by the file system to create, delete, and use a snapshot.  The
1305 * interfaces are defined in fssnap_if.c and are filled in by this driver
1306 * when it is loaded.  This technique allows the file system to depend on
1307 * the interface module without having to load the full implementation and
1308 * snapshot device drivers.
1309 */
1310
1311/*
1312 * fssnap_strategy_impl() - strategy routine called by the file system
1313 *
1314 *    called by the file system to handle copy-on-write when necessary.  All
1315 *    reads and writes that the file system performs should go through this
1316 *    function.  If the file system calls the underlying device's strategy
1317 *    routine without going through fssnap_strategy() (eg. by calling
1318 *    bdev_strategy()), the snapshot may not be consistent.
1319 *
1320 *    This function starts by doing significant sanity checking to insure
1321 *    the snapshot was not deleted out from under it or deleted and then
1322 *    recreated.  To do this, it checks the actual pointer passed into it
1323 *    (ie. the handle held by the file system).  NOTE that the parameter is
1324 *    a POINTER TO A POINTER to the snapshot id.  Once the snapshot id is
1325 *    locked, it knows things are ok and that this snapshot is really for
1326 *    this file system.
1327 *
1328 *    If the request is a write, fssnap_translate() is called to determine
1329 *    whether a copy-on-write is required.  If it is a read, the read is
1330 *    simply passed on to the underlying device.
1331 */
1332static void
1333fssnap_strategy_impl(void *snapshot_id, buf_t *bp)
1334{
1335	struct snapshot_id **sidpp;
1336	struct snapshot_id *sidp;
1337	int error;
1338
1339	/* read requests are always passed through */
1340	if (bp->b_flags & B_READ) {
1341		(void) bdev_strategy(bp);
1342		return;
1343	}
1344
1345	/*
1346	 * Because we were not able to take the snapshot read lock BEFORE
1347	 * checking for a snapshot back in the file system, things may have
1348	 * drastically changed out from under us.  For instance, the snapshot
1349	 * may have been deleted, deleted and recreated, or worse yet, deleted
1350	 * for this file system but now the snapshot number is in use by another
1351	 * file system.
1352	 *
1353	 * Having a pointer to the file system's snapshot id pointer allows us
1354	 * to sanity check most of this, though it assumes the file system is
1355	 * keeping track of a pointer to the snapshot_id somewhere.
1356	 */
1357	sidpp = (struct snapshot_id **)snapshot_id;
1358	sidp = *sidpp;
1359
1360	/*
1361	 * if this file system's snapshot was disabled, just pass the
1362	 * request through.
1363	 */
1364	if (sidp == NULL) {
1365		(void) bdev_strategy(bp);
1366		return;
1367	}
1368
1369	/*
1370	 * Once we have the reader lock the snapshot will not magically go
1371	 * away.  But things may have changed on us before this so double check.
1372	 */
1373	rw_enter(&sidp->sid_rwlock, RW_READER);
1374
1375	/*
1376	 * if an error was founds somewhere the DELETE flag will be
1377	 * set to indicate the snapshot should be deleted and no new
1378	 * translations should occur.
1379	 */
1380	if (sidp->sid_flags & SID_DELETE) {
1381		rw_exit(&sidp->sid_rwlock);
1382		(void) fssnap_delete_impl(sidpp);
1383		(void) bdev_strategy(bp);
1384		return;
1385	}
1386
1387	/*
1388	 * If the file system is no longer pointing to the snapshot we were
1389	 * called with, then it should not attempt to translate this buffer as
1390	 * it may be going to a snapshot for a different file system.
1391	 * Even if the file system snapshot pointer is still the same, the
1392	 * snapshot may have been disabled before we got the reader lock.
1393	 */
1394	if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1395		rw_exit(&sidp->sid_rwlock);
1396		(void) bdev_strategy(bp);
1397		return;
1398	}
1399
1400	/*
1401	 * At this point we're sure the snapshot will not go away while the
1402	 * reader lock is held, and we are reasonably certain that we are
1403	 * writing to the correct snapshot.
1404	 */
1405	if ((error = fssnap_translate(sidpp, bp)) != 0) {
1406		/*
1407		 * fssnap_translate can release the reader lock if it
1408		 * has to wait for a semaphore.  In this case it is possible
1409		 * for the snapshot to be deleted in this time frame.  If this
1410		 * happens just sent the buf thru to the filesystems device.
1411		 */
1412		if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1413			rw_exit(&sidp->sid_rwlock);
1414			(void) bdev_strategy(bp);
1415			return;
1416		}
1417		bioerror(bp, error);
1418		biodone(bp);
1419	}
1420	rw_exit(&sidp->sid_rwlock);
1421}
1422
/*
 * fssnap_translate() - helper function for fssnap_strategy()
 *
 *    performs the actual copy-on-write for write requests, if required.
 *    This function does the real work of the file system side of things.
 *
 *    It first checks the candidate bitmap to quickly determine whether any
 *    action is necessary.  If the candidate bitmap indicates the chunk was
 *    allocated when the snapshot was created, then it checks to see whether
 *    a translation already exists.  If a translation already exists then no
 *    action is required.  If the chunk is a candidate for copy-on-write,
 *    and a translation does not already exist, then the chunk is read in
 *    and a node is added to the translation table.
 *
 *    Once all of the chunks in the request range have been copied (if they
 *    needed to be), then the original request can be satisfied and the old
 *    data can be overwritten.
 *
 *    sidpp - the file system's pointer to its snapshot id pointer; it is
 *            re-read after sleeping to detect that the snapshot changed.
 *    wbp   - the write request being issued by the file system.
 *
 *    The caller must hold sid_rwlock as a reader.  The lock may be dropped
 *    and re-taken while waiting on the throttle semaphore, but it is held
 *    again as a reader on every return path.
 *
 *    Returns 0 after wbp has been handed to bdev_strategy().  On failure
 *    wbp has NOT been issued and the return value is ENXIO if the snapshot
 *    changed or became inactive while the lock was dropped, or the error
 *    from reading the old data.
 */
static int
fssnap_translate(struct snapshot_id **sidpp, struct buf *wbp)
{
	snapshot_id_t	*sidp = *sidpp;
	struct buf	*oldbp;	/* buffer to store old data in */
	struct cow_info	*cowp = sidp->sid_cowinfo;
	cow_map_t	*cmap = &cowp->cow_map;
	cow_map_node_t	*cmn;
	chunknumber_t	cowchunk, startchunk, endchunk;
	int		error;
	int	throttle_write = 0;

	/* the caller must already hold the snapshot's reader lock */
	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));

	/*
	 * Chunks containing the first and the last disk block of the write.
	 * NOTE(review): a b_bcount of 0 would make (b_bcount - 1) wrap if
	 * b_bcount is unsigned - presumably never passed in; confirm.
	 */
	startchunk = dbtocowchunk(cmap, wbp->b_lblkno);
	endchunk   = dbtocowchunk(cmap, wbp->b_lblkno +
	    ((wbp->b_bcount-1) >> DEV_BSHIFT));

	/*
	 * Do not throttle the writes of the fssnap taskq thread and
	 * the log roll (trans_roll) thread. Furthermore the writes to
	 * the on-disk log are also not subject to throttling.
	 * The fssnap_write_taskq thread's write can block on the throttling
	 * semaphore which leads to self-deadlock as this same thread
	 * releases the throttling semaphore after completing the IO.
	 * If the trans_roll thread's write is throttled then we can deadlock
	 * because the fssnap_taskq_thread which releases the throttling
	 * semaphore can block waiting for log space which can only be
	 * released by the trans_roll thread.
	 */

	throttle_write = !(taskq_member(cowp->cow_taskq, curthread) ||
	    tsd_get(bypass_snapshot_throttle_key));

	/*
	 * Iterate through all chunks covered by this write and perform the
	 * copy-aside if necessary.  Once all chunks have been safely
	 * stowed away, the new data may be written in a single sweep.
	 *
	 * For each chunk in the range, the following sequence is performed:
	 *	- Is the chunk a candidate for translation?
	 *		o If not, then no translation is necessary, continue
	 *	- If it is a candidate, then does it already have a translation?
	 *		o If so, then no translation is necessary, continue
	 *	- If it is a candidate, but does not yet have a translation,
	 *	  then read the old data and schedule an asynchronous taskq
	 *	  to write the old data to the backing file.
	 *
	 * Once this has been performed over the entire range of chunks, then
	 * it is safe to overwrite the data that is there.
	 *
	 * Note that no lock is required to check the candidate bitmap because
	 * it never changes once the snapshot is created.  The hastrans
	 * bitmap is first checked without the cmap_rwlock as a quick filter.
	 * If it turns out a copy may be required, then the cmap_rwlock is
	 * taken as a writer and the bitmap is re-checked, as it may have
	 * changed in the meantime.  Finally, the write lock is held while
	 * reading the old data to make sure it is not translated out from
	 * under us.
	 *
	 * This locking mechanism should be sufficient to handle multiple
	 * threads writing to overlapping chunks simultaneously.
	 */
	for (cowchunk = startchunk; cowchunk <= endchunk; cowchunk++) {
		/*
		 * If the cowchunk is outside of the range of our
		 * candidate maps, then simply break out of the
		 * loop and pass the I/O through to bdev_strategy.
		 * This would occur if the file system has grown
		 * larger since the snapshot was taken.
		 */
		if (cowchunk >= (cmap->cmap_bmsize * NBBY))
			break;

		/*
		 * If no disk blocks were allocated in this chunk when the
		 * snapshot was created then no copy-on-write will be
		 * required.  Since this bitmap is read-only no locks are
		 * necessary.
		 */
		if (isclr(cmap->cmap_candidate, cowchunk)) {
			continue;
		}

		/*
		 * If a translation already exists, the data can be written
		 * through since the old data has already been saved off.
		 * This is an unlocked check; it is repeated under the
		 * cmap_rwlock further down.
		 */
		if (isset(cmap->cmap_hastrans, cowchunk)) {
			continue;
		}


		/*
		 * Throttle translations if there are too many outstanding
		 * chunks in memory.  The semaphore is sema_v'd by the taskq.
		 *
		 * You can't keep the sid_rwlock if you would go to sleep.
		 * This will result in deadlock when someone tries to delete
		 * the snapshot (wants the sid_rwlock as a writer, but can't
		 * get it).  So try the semaphore without blocking first and
		 * only drop the sid_rwlock if we really have to wait.
		 */
		if (throttle_write) {
			if (sema_tryp(&cmap->cmap_throttle_sem) == 0) {
				rw_exit(&sidp->sid_rwlock);
				/* cmap_waiters counts threads blocked here */
				atomic_add_32(&cmap->cmap_waiters, 1);
				sema_p(&cmap->cmap_throttle_sem);
				atomic_add_32(&cmap->cmap_waiters, -1);
				rw_enter(&sidp->sid_rwlock, RW_READER);

				/*
				 * Now since we released the sid_rwlock the
				 * state may have transitioned underneath us,
				 * so check that again.  Give the semaphore
				 * back before bailing out.
				 */
				if (sidp != *sidpp || SID_INACTIVE(sidp)) {
					sema_v(&cmap->cmap_throttle_sem);
					return (ENXIO);
				}
			}
		}

		/*
		 * Acquire the lock as a writer and check to see if a
		 * translation has been added in the meantime.  If so, the
		 * throttle slot taken above is not needed and is returned.
		 */
		rw_enter(&cmap->cmap_rwlock, RW_WRITER);
		if (isset(cmap->cmap_hastrans, cowchunk)) {
			if (throttle_write)
				sema_v(&cmap->cmap_throttle_sem);
			rw_exit(&cmap->cmap_rwlock);
			continue; /* go to the next chunk */
		}

		/*
		 * read a full chunk of data from the requested offset rounded
		 * down to the nearest chunk size.  The read is issued to the
		 * same device as the write and waited for synchronously.
		 */
		oldbp = getrbuf(KM_SLEEP);
		oldbp->b_lblkno = cowchunktodb(cmap, cowchunk);
		oldbp->b_edev = wbp->b_edev;
		oldbp->b_bcount = cmap->cmap_chunksz;
		oldbp->b_bufsize = cmap->cmap_chunksz;
		oldbp->b_iodone = NULL;
		oldbp->b_proc = NULL;
		oldbp->b_flags = B_READ;
		oldbp->b_un.b_addr = kmem_alloc(cmap->cmap_chunksz, KM_SLEEP);

		(void) bdev_strategy(oldbp);
		(void) biowait(oldbp);

		/*
		 * It's ok to bail in the middle of translating the range
		 * because the extra copy-asides will not hurt anything
		 * (except by using extra space in the backing store).
		 * Everything acquired for this chunk is released first.
		 */
		if ((error = geterror(oldbp)) != 0) {
			cmn_err(CE_WARN, "fssnap_translate: error reading "
			    "old data for snapshot %d, chunk %llu, disk block "
			    "%lld, size %lu, error %d.", sidp->sid_snapnumber,
			    cowchunk, oldbp->b_lblkno, oldbp->b_bcount, error);
			kmem_free(oldbp->b_un.b_addr, cmap->cmap_chunksz);
			freerbuf(oldbp);
			rw_exit(&cmap->cmap_rwlock);
			if (throttle_write)
				sema_v(&cmap->cmap_throttle_sem);
			return (error);
		}

		/*
		 * add the node to the translation table and save a reference
		 * to pass to the taskq for writing out to the backing file.
		 * The data buffer now belongs to the node; only the buf
		 * header is freed here.
		 */
		cmn = transtbl_add(cmap, cowchunk, oldbp->b_un.b_addr);
		freerbuf(oldbp);

		/*
		 * Add a reference to the snapshot id so the lower level
		 * processing (ie. the taskq) can get back to the state
		 * information.  release_sem tells the taskq whether a
		 * throttle slot was taken for this node and must be returned.
		 */
		cmn->cmn_sid = sidp;
		cmn->release_sem = throttle_write;
		setbit(cmap->cmap_hastrans, cowchunk);

		rw_exit(&cmap->cmap_rwlock);

		/*
		 * schedule the asynchronous write to the backing file.
		 * Without a backing file array the node simply stays in
		 * the in-memory table.
		 */
		if (cowp->cow_backfile_array != NULL)
			(void) taskq_dispatch(cowp->cow_taskq,
			    fssnap_write_taskq, cmn, TQ_SLEEP);
	}

	/*
	 * Write new data in place of the old data.  At this point all of the
	 * chunks touched by this write have been copied aside and so the new
	 * data can be written out all at once.
	 */
	(void) bdev_strategy(wbp);

	return (0);
}
1645
1646/*
1647 * fssnap_write_taskq() - write in-memory translations to the backing file
1648 *
1649 *    writes in-memory translations to the backing file asynchronously.  A
1650 *    task is dispatched each time a new translation is created.  The task
1651 *    writes the data to the backing file and removes it from the memory
1652 *    list. The throttling semaphore is released only if the particular
1653 *    translation was throttled in fssnap_translate.
1654 */
1655static void
1656fssnap_write_taskq(void *arg)
1657{
1658	cow_map_node_t	*cmn = (cow_map_node_t *)arg;
1659	snapshot_id_t	*sidp = cmn->cmn_sid;
1660	cow_info_t	*cowp = sidp->sid_cowinfo;
1661	cow_map_t	*cmap = &cowp->cow_map;
1662	int		error;
1663	int		bf_index;
1664	int		release_sem = cmn->release_sem;
1665
1666	/*
1667	 * The sid_rwlock does not need to be held here because the taskqs
1668	 * are destroyed explicitly by fssnap_delete (with the sid_rwlock
1669	 * held as a writer).  taskq_destroy() will flush all of the tasks
1670	 * out before fssnap_delete frees up all of the structures.
1671	 */
1672
1673	/* if the snapshot was disabled from under us, drop the request. */
1674	rw_enter(&sidp->sid_rwlock, RW_READER);
1675	if (SID_INACTIVE(sidp)) {
1676		rw_exit(&sidp->sid_rwlock);
1677		if (release_sem)
1678			sema_v(&cmap->cmap_throttle_sem);
1679		return;
1680	}
1681	rw_exit(&sidp->sid_rwlock);
1682
1683	atomic_add_64((uint64_t *)&cmap->cmap_nchunks, 1);
1684
1685	if ((cmap->cmap_maxsize != 0) &&
1686	    ((cmap->cmap_nchunks * cmap->cmap_chunksz) > cmap->cmap_maxsize)) {
1687		cmn_err(CE_WARN, "fssnap_write_taskq: snapshot %d (%s) has "
1688		    "reached the maximum backing file size specified (%llu "
1689		    "bytes) and will be deleted.", sidp->sid_snapnumber,
1690		    (char *)cowp->cow_kstat_mntpt->ks_data,
1691		    cmap->cmap_maxsize);
1692		if (release_sem)
1693			sema_v(&cmap->cmap_throttle_sem);
1694		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1695		return;
1696	}
1697
1698	/* perform the write */
1699	bf_index = cmn->cmn_chunk / cmap->cmap_chunksperbf;
1700
1701	if (error = vn_rdwr(UIO_WRITE, (cowp->cow_backfile_array)[bf_index],
1702	    cmn->cmn_buf, cmap->cmap_chunksz,
1703	    (cmn->cmn_chunk % cmap->cmap_chunksperbf) * cmap->cmap_chunksz,
1704	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, (ssize_t *)NULL)) {
1705		cmn_err(CE_WARN, "fssnap_write_taskq: error writing to "
1706		    "backing file.  DELETING SNAPSHOT %d, backing file path "
1707		    "%s, offset %llu bytes, error %d.", sidp->sid_snapnumber,
1708		    (char *)cowp->cow_kstat_bfname->ks_data,
1709		    cmn->cmn_chunk * cmap->cmap_chunksz, error);
1710		if (release_sem)
1711			sema_v(&cmap->cmap_throttle_sem);
1712		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1713		return;
1714	}
1715
1716	/*
1717	 * now remove the node and buffer from memory
1718	 */
1719	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1720	transtbl_delete(cmap, cmn);
1721	rw_exit(&cmap->cmap_rwlock);
1722
1723	/* Allow more translations */
1724	if (release_sem)
1725		sema_v(&cmap->cmap_throttle_sem);
1726
1727}
1728
1729/*
1730 * fssnap_create_impl() - called from the file system to create a new snapshot
1731 *
1732 *    allocates and initializes the structures needed for a new snapshot.
1733 *    This is called by the file system when it receives an ioctl request to
1734 *    create a new snapshot.  An unused snapshot identifier is either found
1735 *    or created, and eventually returned as the opaque handle the file
1736 *    system will use to identify this snapshot.  The snapshot number
1737 *    associated with the snapshot identifier is the same as the minor
1738 *    number for the snapshot device that is used to access that snapshot.
1739 *
1740 *    The snapshot can not be used until the candidate bitmap is populated
1741 *    by the file system (see fssnap_set_candidate_impl()), and the file
1742 *    system finishes the setup process by calling fssnap_create_done().
1743 *    Nearly all of the snapshot locks are held for the duration of the
1744 *    create, and are not released until fssnap_create_done is called().
1745 */
1746static void *
1747fssnap_create_impl(chunknumber_t nchunks, uint_t chunksz, u_offset_t maxsize,
1748    struct vnode *fsvp, int backfilecount, struct vnode **bfvpp, char *backpath,
1749    u_offset_t max_backfile_size)
1750{
1751	refstr_t *mountpoint;
1752	char taskqname[50];
1753	struct cow_info *cowp;
1754	struct cow_map	*cmap;
1755	struct snapshot_id *sidp;
1756	int lastsnap;
1757
1758	/*
1759	 * Sanity check the parameters we care about
1760	 * (we don't care about the informational parameters)
1761	 */
1762	if ((nchunks == 0) ||
1763	    ((chunksz % DEV_BSIZE) != 0) ||
1764	    (bfvpp == NULL)) {
1765		return (NULL);
1766	}
1767
1768	/*
1769	 * Look for unused snapshot identifiers.  Snapshot ids are never
1770	 * freed, but deleted snapshot ids will be recycled as needed.
1771	 */
1772	mutex_enter(&snapshot_mutex);
1773
1774findagain:
1775	lastsnap = 0;
1776	for (sidp = snapshot; sidp != NULL; sidp = sidp->sid_next) {
1777		if (sidp->sid_snapnumber > lastsnap)
1778			lastsnap = sidp->sid_snapnumber;
1779
1780		/*
1781		 * The sid_rwlock is taken as a reader initially so that
1782		 * activity on each snapshot is not stalled while searching
1783		 * for a free snapshot id.
1784		 */
1785		rw_enter(&sidp->sid_rwlock, RW_READER);
1786
1787		/*
1788		 * If the snapshot has been deleted and nobody is using the
1789		 * snapshot device than we can reuse this snapshot_id.  If
1790		 * the snapshot is marked to be deleted (SID_DELETE), then
1791		 * it hasn't been deleted yet so don't reuse it.
1792		 */
1793		if (SID_AVAILABLE(sidp))
1794			break; /* This spot is unused, so take it */
1795		rw_exit(&sidp->sid_rwlock);
1796	}
1797
1798	/*
1799	 * add a new snapshot identifier if there are no deleted
1800	 * entries.  Since it doesn't matter what order the entries
1801	 * are in we can just add it to the beginning of the list.
1802	 */
1803	if (sidp) {
1804		if (rw_tryupgrade(&sidp->sid_rwlock) == 0) {
1805			/* someone else grabbed it as a writer, try again */
1806			rw_exit(&sidp->sid_rwlock);
1807			goto findagain;
1808		}
1809	} else {
1810		/* Create a new node if we didn't find an unused one */
1811		sidp = kmem_alloc(sizeof (struct snapshot_id), KM_SLEEP);
1812		rw_init(&sidp->sid_rwlock, NULL, RW_DEFAULT, NULL);
1813		rw_enter(&sidp->sid_rwlock, RW_WRITER);
1814		sidp->sid_snapnumber = (snapshot == NULL) ? 0 : lastsnap + 1;
1815		sidp->sid_cowinfo = NULL;
1816		sidp->sid_flags = 0;
1817		sidp->sid_next = snapshot;
1818		snapshot = sidp;
1819	}
1820
1821	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1822	ASSERT(sidp->sid_cowinfo == NULL);
1823	ASSERT(sidp->sid_snapnumber <= (lastsnap + 1));
1824
1825	sidp->sid_flags |= SID_CREATING;
1826	/* The root vnode is held until snap_delete_impl() is called */
1827	VN_HOLD(fsvp);
1828	sidp->sid_fvp = fsvp;
1829	num_snapshots++;
1830
1831	/* allocate and initialize structures */
1832
1833	cowp = kmem_zalloc(sizeof (struct cow_info), KM_SLEEP);
1834
1835	cowp->cow_backfile_array = bfvpp;
1836	cowp->cow_backcount = backfilecount;
1837	cowp->cow_backfile_sz = max_backfile_size;
1838
1839	/*
1840	 * Initialize task queues for this snapshot.  Only a small number
1841	 * of threads are required because they will be serialized on the
1842	 * backing file's reader/writer lock anyway.
1843	 */
1844	(void) snprintf(taskqname, sizeof (taskqname), "%s_taskq_%d", snapname,
1845	    sidp->sid_snapnumber);
1846	cowp->cow_taskq = taskq_create(taskqname, fssnap_taskq_nthreads,
1847	    minclsyspri, 1,  fssnap_taskq_maxtasks, 0);
1848
1849	/* don't allow tasks to start until after everything is ready */
1850	taskq_suspend(cowp->cow_taskq);
1851
1852	/* initialize translation table */
1853	cmap = &cowp->cow_map;
1854	rw_init(&cmap->cmap_rwlock, NULL, RW_DEFAULT, NULL);
1855	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1856
1857	sema_init(&cmap->cmap_throttle_sem, fssnap_max_mem_chunks, NULL,
1858	    SEMA_DEFAULT, NULL);
1859
1860	cmap->cmap_chunksz = chunksz;
1861	cmap->cmap_maxsize = maxsize;
1862	cmap->cmap_chunksperbf = max_backfile_size / chunksz;
1863
1864	/*
1865	 * allocate one bit per chunk for the bitmaps, round up
1866	 */
1867	cmap->cmap_bmsize = (nchunks + (NBBY - 1)) / NBBY;
1868	cmap->cmap_hastrans  = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1869	cmap->cmap_candidate = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1870
1871	sidp->sid_cowinfo = cowp;
1872
1873	/* initialize kstats for this snapshot */
1874	mountpoint = vfs_getmntpoint(fsvp->v_vfsp);
1875	fssnap_create_kstats(sidp, sidp->sid_snapnumber,
1876	    refstr_value(mountpoint), backpath);
1877	refstr_rele(mountpoint);
1878
1879	mutex_exit(&snapshot_mutex);
1880
1881	/*
1882	 * return with snapshot id rwlock held as a writer until
1883	 * fssnap_create_done is called
1884	 */
1885	return (sidp);
1886}
1887
1888/*
1889 * fssnap_set_candidate_impl() - mark a chunk as a candidate for copy-on-write
1890 *
1891 *    sets a bit in the candidate bitmap that indicates that a chunk is a
1892 *    candidate for copy-on-write.  Typically, chunks that are allocated on
1893 *    the file system at the time the snapshot is taken are candidates,
1894 *    while chunks that have no allocated data do not need to be copied.
1895 *    Chunks containing metadata must be marked as candidates as well.
1896 */
1897static void
1898fssnap_set_candidate_impl(void *snapshot_id, chunknumber_t chunknumber)
1899{
1900	struct snapshot_id	*sid = snapshot_id;
1901	struct cow_info *cowp = sid->sid_cowinfo;
1902	struct cow_map	*cmap = &cowp->cow_map;
1903
1904	/* simple bitmap operation for now */
1905	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1906	setbit(cmap->cmap_candidate, chunknumber);
1907}
1908
1909/*
1910 * fssnap_is_candidate_impl() - check whether a chunk is a candidate
1911 *
1912 *    returns 0 if the chunk is not a candidate and 1 if the chunk is a
1913 *    candidate.  This can be used by the file system to change behavior for
1914 *    chunks that might induce a copy-on-write.  The offset is specified in
1915 *    bytes since the chunk size may not be known by the file system.
1916 */
1917static int
1918fssnap_is_candidate_impl(void *snapshot_id, u_offset_t off)
1919{
1920	struct snapshot_id	*sid = snapshot_id;
1921	struct cow_info *cowp = sid->sid_cowinfo;
1922	struct cow_map	*cmap = &cowp->cow_map;
1923	ulong_t chunknumber = off / cmap->cmap_chunksz;
1924
1925	/* simple bitmap operation for now */
1926	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1927	return (isset(cmap->cmap_candidate, chunknumber));
1928}
1929
1930/*
1931 * fssnap_create_done_impl() - complete the snapshot setup process
1932 *
1933 *    called when the file system is done populating the candidate bitmap
1934 *    and it is ready to start using the snapshot.  This routine releases
1935 *    the snapshot locks, allows taskq tasks to start processing, and
1936 *    creates the device minor nodes associated with the snapshot.
1937 */
1938static int
1939fssnap_create_done_impl(void *snapshot_id)
1940{
1941	struct snapshot_id	**sidpp, *sidp = snapshot_id;
1942	struct cow_info		*cowp;
1943	struct cow_map		*cmap;
1944	int			snapnumber = -1;
1945	char			name[20];
1946
1947	/* sid rwlock and cmap rwlock should be taken from fssnap_create */
1948	ASSERT(sidp);
1949	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1950	ASSERT(sidp->sid_cowinfo);
1951
1952	cowp = sidp->sid_cowinfo;
1953	cmap = &cowp->cow_map;
1954
1955	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1956
1957	sidp->sid_flags &= ~(SID_CREATING | SID_DISABLED);
1958	snapnumber = sidp->sid_snapnumber;
1959
1960	/* allocate state structure and find new snapshot id */
1961	if (ddi_soft_state_zalloc(statep, snapnumber) != DDI_SUCCESS) {
1962		cmn_err(CE_WARN,
1963		    "snap_ioctl: create: could not allocate "
1964		    "state for snapshot %d.", snapnumber);
1965		snapnumber = -1;
1966		goto out;
1967	}
1968
1969	sidpp = ddi_get_soft_state(statep, snapnumber);
1970	*sidpp = sidp;
1971
1972	/* create minor node based on snapshot number */
1973	ASSERT(fssnap_dip != NULL);
1974	(void) snprintf(name, sizeof (name), "%d", snapnumber);
1975	if (ddi_create_minor_node(fssnap_dip, name, S_IFBLK,
1976	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
1977		cmn_err(CE_WARN, "snap_ioctl: could not create "
1978		    "block minor node for snapshot %d.", snapnumber);
1979		snapnumber = -1;
1980		goto out;
1981	}
1982
1983	(void) snprintf(name, sizeof (name), "%d,raw", snapnumber);
1984	if (ddi_create_minor_node(fssnap_dip, name, S_IFCHR,
1985	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
1986		cmn_err(CE_WARN, "snap_ioctl: could not create "
1987		    "character minor node for snapshot %d.", snapnumber);
1988		snapnumber = -1;
1989	}
1990
1991out:
1992	rw_exit(&sidp->sid_rwlock);
1993	rw_exit(&cmap->cmap_rwlock);
1994
1995	/* let the taskq threads start processing */
1996	taskq_resume(cowp->cow_taskq);
1997
1998	return (snapnumber);
1999}
2000
2001/*
2002 * fssnap_delete_impl() - delete a snapshot
2003 *
2004 *    used when a snapshot is no longer needed.  This is called by the file
2005 *    system when it receives an ioctl request to delete a snapshot.  It is
2006 *    also called internally when error conditions such as disk full, errors
2007 *    writing to the backing file, or backing file maxsize exceeded occur.
2008 *    If the snapshot device is busy when the delete request is received,
2009 *    all state will be deleted except for the soft state and device files
2010 *    associated with the snapshot; they will be deleted when the snapshot
2011 *    device is closed.
2012 *
2013 *    NOTE this function takes a POINTER TO A POINTER to the snapshot id,
2014 *    and expects to be able to set the handle held by the file system to
2015 *    NULL.  This depends on the file system checking that variable for NULL
2016 *    before calling fssnap_strategy().
2017 */
2018static int
2019fssnap_delete_impl(void *snapshot_id)
2020{
2021	struct snapshot_id	**sidpp = (struct snapshot_id **)snapshot_id;
2022	struct snapshot_id	*sidp;
2023	struct snapshot_id	**statesidpp;
2024	struct cow_info		*cowp;
2025	struct cow_map		*cmap;
2026	char			name[20];
2027	int			snapnumber = -1;
2028	vnode_t			**vpp;
2029
2030	/*
2031	 * sidp is guaranteed to be valid if sidpp is valid because
2032	 * the snapshot list is append-only.
2033	 */
2034	if (sidpp == NULL) {
2035		return (-1);
2036	}
2037
2038	sidp = *sidpp;
2039	rw_enter(&sidp->sid_rwlock, RW_WRITER);
2040
2041	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
2042
2043	/*
2044	 * double check that the snapshot is still valid for THIS file system
2045	 */
2046	if (*sidpp == NULL) {
2047		rw_exit(&sidp->sid_rwlock);
2048		return (-1);
2049	}
2050
2051	/*
2052	 * Now we know the snapshot is still valid and will not go away
2053	 * because we have the write lock.  Once the state is transitioned
2054	 * to "disabling", the sid_rwlock can be released.  Any pending I/O
2055	 * waiting for the lock as a reader will check for this state and
2056	 * abort without touching data that may be getting freed.
2057	 */
2058	sidp->sid_flags |= SID_DISABLING;
2059	if (sidp->sid_flags & SID_DELETE) {
2060		cmn_err(CE_WARN, "Snapshot %d automatically deleted.",
2061		    sidp->sid_snapnumber);
2062		sidp->sid_flags &= ~(SID_DELETE);
2063	}
2064
2065
2066	/*
2067	 * This is pointing into file system specific data!  The assumption is
2068	 * that fssnap_strategy() gets called from the file system based on
2069	 * whether this reference to the snapshot_id is NULL or not.  So
2070	 * setting this to NULL should disable snapshots for the file system.
2071	 */
2072	*sidpp = NULL;
2073
2074	/* remove cowinfo */
2075	cowp = sidp->sid_cowinfo;
2076	if (cowp == NULL) {
2077		rw_exit(&sidp->sid_rwlock);
2078		return (-1);
2079	}
2080	rw_exit(&sidp->sid_rwlock);
2081
2082	/* destroy task queues first so they don't reference freed data. */
2083	if (cowp->cow_taskq) {
2084		taskq_destroy(cowp->cow_taskq);
2085		cowp->cow_taskq = NULL;
2086	}
2087
2088	if (cowp->cow_backfile_array != NULL) {
2089		for (vpp = cowp->cow_backfile_array; *vpp; vpp++)
2090			VN_RELE(*vpp);
2091		kmem_free(cowp->cow_backfile_array,
2092		    (cowp->cow_backcount + 1) * sizeof (vnode_t *));
2093		cowp->cow_backfile_array = NULL;
2094	}
2095
2096	sidp->sid_cowinfo = NULL;
2097
2098	/* remove cmap */
2099	cmap = &cowp->cow_map;
2100	ASSERT(cmap);
2101
2102	if (cmap->cmap_candidate)
2103		kmem_free(cmap->cmap_candidate, cmap->cmap_bmsize);
2104
2105	if (cmap->cmap_hastrans)
2106		kmem_free(cmap->cmap_hastrans, cmap->cmap_bmsize);
2107
2108	if (cmap->cmap_table)
2109		transtbl_free(&cowp->cow_map);
2110
2111	rw_destroy(&cmap->cmap_rwlock);
2112
2113	while (cmap->cmap_waiters) {
2114		sema_p(&cmap->cmap_throttle_sem);
2115		sema_v(&cmap->cmap_throttle_sem);
2116	}
2117	sema_destroy(&cmap->cmap_throttle_sem);
2118
2119	/* remove kstats */
2120	fssnap_delete_kstats(cowp);
2121
2122	kmem_free(cowp, sizeof (struct cow_info));
2123
2124	statesidpp = ddi_get_soft_state(statep, sidp->sid_snapnumber);
2125	if (statesidpp == NULL || *statesidpp == NULL) {
2126		cmn_err(CE_WARN,
2127		    "fssnap_delete_impl: could not find state for snapshot %d.",
2128		    sidp->sid_snapnumber);
2129	}
2130	ASSERT(*statesidpp == sidp);
2131
2132	/*
2133	 * Leave the node in the list marked DISABLED so it can be reused
2134	 * and avoid many race conditions.  Return the snapshot number
2135	 * that was deleted.
2136	 */
2137	mutex_enter(&snapshot_mutex);
2138	rw_enter(&sidp->sid_rwlock, RW_WRITER);
2139	sidp->sid_flags &= ~(SID_DISABLING);
2140	sidp->sid_flags |= SID_DISABLED;
2141	VN_RELE(sidp->sid_fvp);
2142	sidp->sid_fvp = NULL;
2143	snapnumber = sidp->sid_snapnumber;
2144
2145	/*
2146	 * If the snapshot is not busy, free the device info now.  Otherwise
2147	 * the device nodes are freed in snap_close() when the device is
2148	 * closed.  The sid will not be reused until the device is not busy.
2149	 */
2150	if (SID_AVAILABLE(sidp)) {
2151		/* remove the device nodes */
2152		ASSERT(fssnap_dip != NULL);
2153		(void) snprintf(name, sizeof (name), "%d",
2154		    sidp->sid_snapnumber);
2155		ddi_remove_minor_node(fssnap_dip, name);
2156		(void) snprintf(name, sizeof (name), "%d,raw",
2157		    sidp->sid_snapnumber);
2158		ddi_remove_minor_node(fssnap_dip, name);
2159
2160		/* delete the state structure */
2161		ddi_soft_state_free(statep, sidp->sid_snapnumber);
2162		num_snapshots--;
2163	}
2164
2165	mutex_exit(&snapshot_mutex);
2166	rw_exit(&sidp->sid_rwlock);
2167
2168	return (snapnumber);
2169}
2170
2171/*
2172 * fssnap_create_kstats() - allocate and initialize snapshot kstats
2173 *
2174 */
2175static void
2176fssnap_create_kstats(snapshot_id_t *sidp, int snapnum,
2177    const char *mountpoint, const char *backfilename)
2178{
2179	kstat_t *num, *mntpoint, *bfname;
2180	kstat_named_t *hw;
2181	struct cow_info *cowp = sidp->sid_cowinfo;
2182	struct cow_kstat_num *stats;
2183
2184	/* update the high water mark */
2185	if (fssnap_highwater_kstat == NULL) {
2186		cmn_err(CE_WARN, "fssnap_create_kstats: failed to lookup "
2187		    "high water mark kstat.");
2188		return;
2189	}
2190
2191	hw = (kstat_named_t *)fssnap_highwater_kstat->ks_data;
2192	if (hw->value.ui32 < snapnum)
2193		hw->value.ui32 = snapnum;
2194
2195	/* initialize the mount point kstat */
2196	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_MNTPT);
2197
2198	if (mountpoint != NULL) {
2199		mntpoint = kstat_create(snapname, snapnum, FSSNAP_KSTAT_MNTPT,
2200		    "misc", KSTAT_TYPE_RAW, strlen(mountpoint) + 1, 0);
2201		if (mntpoint == NULL) {
2202			cowp->cow_kstat_mntpt = NULL;
2203			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2204			    "create mount point kstat");
2205		} else {
2206			(void) strncpy(mntpoint->ks_data, mountpoint,
2207			    strlen(mountpoint));
2208			cowp->cow_kstat_mntpt = mntpoint;
2209			kstat_install(mntpoint);
2210		}
2211	} else {
2212		cowp->cow_kstat_mntpt = NULL;
2213		cmn_err(CE_WARN, "fssnap_create_kstats: mount point not "
2214		    "specified.");
2215	}
2216
2217	/* initialize the backing file kstat */
2218	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_BFNAME);
2219
2220	if (backfilename == NULL) {
2221		cowp->cow_kstat_bfname = NULL;
2222	} else {
2223		bfname = kstat_create(snapname, snapnum, FSSNAP_KSTAT_BFNAME,
2224		    "misc", KSTAT_TYPE_RAW, strlen(backfilename) + 1, 0);
2225		if (bfname != NULL) {
2226			(void) strncpy(bfname->ks_data, backfilename,
2227			    strlen(backfilename));
2228			cowp->cow_kstat_bfname = bfname;
2229			kstat_install(bfname);
2230		} else {
2231			cowp->cow_kstat_bfname = NULL;
2232			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2233			    "create backing file name kstat");
2234		}
2235	}
2236
2237	/* initialize numeric kstats */
2238	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_NUM);
2239
2240	num = kstat_create(snapname, snapnum, FSSNAP_KSTAT_NUM,
2241	    "misc", KSTAT_TYPE_NAMED,
2242	    sizeof (struct cow_kstat_num) / sizeof (kstat_named_t),
2243	    0);
2244	if (num == NULL) {
2245		cmn_err(CE_WARN, "fssnap_create_kstats: failed to create "
2246		    "numeric kstats");
2247		cowp->cow_kstat_num = NULL;
2248		return;
2249	}
2250
2251	cowp->cow_kstat_num = num;
2252	stats = num->ks_data;
2253	num->ks_update = fssnap_update_kstat_num;
2254	num->ks_private = sidp;
2255
2256	kstat_named_init(&stats->ckn_state, FSSNAP_KSTAT_NUM_STATE,
2257	    KSTAT_DATA_INT32);
2258	kstat_named_init(&stats->ckn_bfsize, FSSNAP_KSTAT_NUM_BFSIZE,
2259	    KSTAT_DATA_UINT64);
2260	kstat_named_init(&stats->ckn_maxsize, FSSNAP_KSTAT_NUM_MAXSIZE,
2261	    KSTAT_DATA_UINT64);
2262	kstat_named_init(&stats->ckn_createtime, FSSNAP_KSTAT_NUM_CREATETIME,
2263	    KSTAT_DATA_LONG);
2264	kstat_named_init(&stats->ckn_chunksize, FSSNAP_KSTAT_NUM_CHUNKSIZE,
2265	    KSTAT_DATA_UINT32);
2266
2267	/* initialize the static kstats */
2268	stats->ckn_chunksize.value.ui32 = cowp->cow_map.cmap_chunksz;
2269	stats->ckn_maxsize.value.ui64 = cowp->cow_map.cmap_maxsize;
2270	stats->ckn_createtime.value.l = gethrestime_sec();
2271
2272	kstat_install(num);
2273}
2274
2275/*
2276 * fssnap_update_kstat_num() - update a numerical snapshot kstat value
2277 *
2278 */
2279int
2280fssnap_update_kstat_num(kstat_t *ksp, int rw)
2281{
2282	snapshot_id_t *sidp = (snapshot_id_t *)ksp->ks_private;
2283	struct cow_info *cowp = sidp->sid_cowinfo;
2284	struct cow_kstat_num *stats = ksp->ks_data;
2285
2286	if (rw == KSTAT_WRITE)
2287		return (EACCES);
2288
2289	/* state */
2290	if (sidp->sid_flags & SID_CREATING)
2291		stats->ckn_state.value.i32 = COWSTATE_CREATING;
2292	else if (SID_INACTIVE(sidp))
2293		stats->ckn_state.value.i32 = COWSTATE_DISABLED;
2294	else if (SID_BUSY(sidp))
2295		stats->ckn_state.value.i32 = COWSTATE_ACTIVE;
2296	else
2297		stats->ckn_state.value.i32 = COWSTATE_IDLE;
2298
2299	/* bfsize */
2300	stats->ckn_bfsize.value.ui64 = cowp->cow_map.cmap_nchunks *
2301	    cowp->cow_map.cmap_chunksz;
2302
2303	return (0);
2304}
2305
2306/*
2307 * fssnap_delete_kstats() - deallocate snapshot kstats
2308 *
2309 */
2310void
2311fssnap_delete_kstats(struct cow_info *cowp)
2312{
2313	if (cowp->cow_kstat_num != NULL) {
2314		kstat_delete(cowp->cow_kstat_num);
2315		cowp->cow_kstat_num = NULL;
2316	}
2317	if (cowp->cow_kstat_mntpt != NULL) {
2318		kstat_delete(cowp->cow_kstat_mntpt);
2319		cowp->cow_kstat_mntpt = NULL;
2320	}
2321	if (cowp->cow_kstat_bfname != NULL) {
2322		kstat_delete(cowp->cow_kstat_bfname);
2323		cowp->cow_kstat_bfname = NULL;
2324	}
2325}
2326