1/*	$NetBSD: fss.c,v 1.81.4.2 2013/02/10 23:57:25 riz Exp $	*/
2
3/*-
4 * Copyright (c) 2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Juergen Hannken-Illjes.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*
33 * File system snapshot disk driver.
34 *
35 * Block/character interface to the snapshot of a mounted file system.
36 */
37
38#include <sys/cdefs.h>
39__KERNEL_RCSID(0, "$NetBSD: fss.c,v 1.81.4.2 2013/02/10 23:57:25 riz Exp $");
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/namei.h>
44#include <sys/proc.h>
45#include <sys/errno.h>
46#include <sys/malloc.h>
47#include <sys/buf.h>
48#include <sys/ioctl.h>
49#include <sys/disklabel.h>
50#include <sys/device.h>
51#include <sys/disk.h>
52#include <sys/stat.h>
53#include <sys/mount.h>
54#include <sys/vnode.h>
55#include <sys/file.h>
56#include <sys/uio.h>
57#include <sys/conf.h>
58#include <sys/kthread.h>
59#include <sys/fstrans.h>
60#include <sys/simplelock.h>
61#include <sys/vfs_syscalls.h>		/* For do_sys_unlink(). */
62
63#include <miscfs/specfs/specdev.h>
64
65#include <dev/fssvar.h>
66
67#include <uvm/uvm.h>
68
/* Pseudo-device attach entry called from autoconf. */
void fssattach(int);

/* Character/block device entry points (see bdevsw/cdevsw below). */
dev_type_open(fss_open);
dev_type_close(fss_close);
dev_type_read(fss_read);
dev_type_write(fss_write);
dev_type_ioctl(fss_ioctl);
dev_type_strategy(fss_strategy);
dev_type_dump(fss_dump);
dev_type_size(fss_size);

/* Internal helpers; see the definitions below for details. */
static void fss_unmount_hook(struct mount *);
static int fss_copy_on_write(void *, struct buf *, bool);
static inline void fss_error(struct fss_softc *, const char *);
static int fss_create_files(struct fss_softc *, struct fss_set *,
    off_t *, struct lwp *);
static int fss_create_snapshot(struct fss_softc *, struct fss_set *,
    struct lwp *);
static int fss_delete_snapshot(struct fss_softc *, struct lwp *);
static int fss_softc_alloc(struct fss_softc *);
static void fss_softc_free(struct fss_softc *);
static int fss_read_cluster(struct fss_softc *, u_int32_t);
static void fss_bs_thread(void *);
static int fss_bs_io(struct fss_softc *, fss_io_type,
    u_int32_t, off_t, int, void *);
static u_int32_t *fss_bs_indir(struct fss_softc *, u_int32_t);

static kmutex_t fss_device_lock;	/* Protect all units. */
static int fss_num_attached = 0;	/* Number of attached devices. */
static struct vfs_hooks fss_vfs_hooks = {
	.vh_unmount = fss_unmount_hook
};

/* Block device switch: read-only disk semantics, MP-safe. */
const struct bdevsw fss_bdevsw = {
	fss_open, fss_close, fss_strategy, fss_ioctl,
	fss_dump, fss_size, D_DISK | D_MPSAFE
};

/* Character device switch: raw access via physio, MP-safe. */
const struct cdevsw fss_cdevsw = {
	fss_open, fss_close, fss_read, fss_write, fss_ioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_DISK | D_MPSAFE
};

static int fss_match(device_t, cfdata_t, void *);
static void fss_attach(device_t, device_t, void *);
static int fss_detach(device_t, int);

CFATTACH_DECL_NEW(fss, sizeof(struct fss_softc),
    fss_match, fss_attach, fss_detach, NULL);
extern struct cfdriver fss_cd;
119
120void
121fssattach(int num)
122{
123
124	mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
125	if (config_cfattach_attach(fss_cd.cd_name, &fss_ca))
126		aprint_error("%s: unable to register\n", fss_cd.cd_name);
127}
128
129static int
130fss_match(device_t self, cfdata_t cfdata, void *aux)
131{
132	return 1;
133}
134
/*
 * Autoconf attach function: initialize the per-unit softc.
 * Sets up the two unit locks, the worker/cache condition variables,
 * the FCFS buffer queue and the disk(9) structure.  When the first
 * unit attaches, the VFS unmount hook is installed so active
 * snapshots can be invalidated on forced unmount.
 */
static void
fss_attach(device_t parent, device_t self, void *aux)
{
	struct fss_softc *sc = device_private(self);

	sc->sc_dev = self;
	sc->sc_bdev = NODEV;	/* no snapshotted block device yet */
	mutex_init(&sc->sc_slock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&sc->sc_work_cv, "fssbs");
	cv_init(&sc->sc_cache_cv, "cowwait");
	bufq_alloc(&sc->sc_bufq, "fcfs", 0);
	sc->sc_dkdev = malloc(sizeof(*sc->sc_dkdev), M_DEVBUF, M_WAITOK);
	sc->sc_dkdev->dk_info = NULL;
	disk_init(sc->sc_dkdev, device_xname(self), NULL);
	if (!pmf_device_register(self, NULL, NULL))
		aprint_error_dev(self, "couldn't establish power handler\n");

	/* Install the unmount hook exactly once, for the first unit. */
	if (fss_num_attached++ == 0)
		vfs_hooks_attach(&fss_vfs_hooks);
}
156
/*
 * Autoconf detach function: tear down a unit attached above.
 * Fails with EBUSY while a snapshot is active.  Mirrors fss_attach()
 * in reverse order and removes the unmount hook when the last unit
 * goes away.
 *
 * NOTE(review): sc_flags is read here without taking sc_slock;
 * presumably the caller (fss_close() holding fss_device_lock)
 * serializes against activation — confirm before relying on this
 * from other detach paths.
 */
static int
fss_detach(device_t self, int flags)
{
	struct fss_softc *sc = device_private(self);

	if (sc->sc_flags & FSS_ACTIVE)
		return EBUSY;

	if (--fss_num_attached == 0)
		vfs_hooks_detach(&fss_vfs_hooks);

	pmf_device_deregister(self);
	mutex_destroy(&sc->sc_slock);
	mutex_destroy(&sc->sc_lock);
	cv_destroy(&sc->sc_work_cv);
	cv_destroy(&sc->sc_cache_cv);
	bufq_drain(sc->sc_bufq);
	bufq_free(sc->sc_bufq);
	disk_destroy(sc->sc_dkdev);
	free(sc->sc_dkdev, M_DEVBUF);

	return 0;
}
180
181int
182fss_open(dev_t dev, int flags, int mode, struct lwp *l)
183{
184	int mflag;
185	cfdata_t cf;
186	struct fss_softc *sc;
187
188	mflag = (mode == S_IFCHR ? FSS_CDEV_OPEN : FSS_BDEV_OPEN);
189
190	mutex_enter(&fss_device_lock);
191
192	sc = device_lookup_private(&fss_cd, minor(dev));
193	if (sc == NULL) {
194		cf = malloc(sizeof(*cf), M_DEVBUF, M_WAITOK);
195		cf->cf_name = fss_cd.cd_name;
196		cf->cf_atname = fss_cd.cd_name;
197		cf->cf_unit = minor(dev);
198		cf->cf_fstate = FSTATE_STAR;
199		sc = device_private(config_attach_pseudo(cf));
200		if (sc == NULL) {
201			mutex_exit(&fss_device_lock);
202			return ENOMEM;
203		}
204	}
205
206	mutex_enter(&sc->sc_slock);
207
208	sc->sc_flags |= mflag;
209
210	mutex_exit(&sc->sc_slock);
211	mutex_exit(&fss_device_lock);
212
213	return 0;
214}
215
/*
 * Close the snapshot device.  Clears the block/character open flag;
 * when the last opener goes away the unit is detached and its cfdata
 * (allocated in fss_open()) is freed.  If the snapshot is still
 * active and FSS_UNCONFIG_ON_CLOSE is set, the snapshot is first
 * unconfigured via our own FSSIOCCLR ioctl, then the state is
 * re-examined from the top ("restart") because sc_slock was dropped.
 */
int
fss_close(dev_t dev, int flags, int mode, struct lwp *l)
{
	int mflag, error;
	cfdata_t cf;
	struct fss_softc *sc = device_lookup_private(&fss_cd, minor(dev));

	mflag = (mode == S_IFCHR ? FSS_CDEV_OPEN : FSS_BDEV_OPEN);
	error = 0;

	mutex_enter(&fss_device_lock);
restart:
	mutex_enter(&sc->sc_slock);
	/* Other device (block vs. char) still open: just drop our flag. */
	if ((sc->sc_flags & (FSS_CDEV_OPEN|FSS_BDEV_OPEN)) != mflag) {
		sc->sc_flags &= ~mflag;
		mutex_exit(&sc->sc_slock);
		mutex_exit(&fss_device_lock);
		return 0;
	}
	/* Last close with auto-unconfigure requested: clear the snapshot. */
	if ((sc->sc_flags & FSS_ACTIVE) != 0 &&
	    (sc->sc_uflags & FSS_UNCONFIG_ON_CLOSE) != 0) {
		sc->sc_uflags &= ~FSS_UNCONFIG_ON_CLOSE;
		mutex_exit(&sc->sc_slock);
		error = fss_ioctl(dev, FSSIOCCLR, NULL, FWRITE, l);
		goto restart;
	}
	/* Snapshot still active (no auto-unconfigure): keep the unit. */
	if ((sc->sc_flags & FSS_ACTIVE) != 0) {
		mutex_exit(&sc->sc_slock);
		mutex_exit(&fss_device_lock);
		return error;
	}

	KASSERT((sc->sc_flags & FSS_ACTIVE) == 0);
	KASSERT((sc->sc_flags & (FSS_CDEV_OPEN|FSS_BDEV_OPEN)) == mflag);
	mutex_exit(&sc->sc_slock);
	cf = device_cfdata(sc->sc_dev);
	error = config_detach(sc->sc_dev, DETACH_QUIET);
	if (! error)
		free(cf, M_DEVBUF);
	mutex_exit(&fss_device_lock);

	return error;
}
259
260void
261fss_strategy(struct buf *bp)
262{
263	const bool write = ((bp->b_flags & B_READ) != B_READ);
264	struct fss_softc *sc = device_lookup_private(&fss_cd, minor(bp->b_dev));
265
266	mutex_enter(&sc->sc_slock);
267
268	if (write || !FSS_ISVALID(sc)) {
269
270		mutex_exit(&sc->sc_slock);
271
272		bp->b_error = (write ? EROFS : ENXIO);
273		bp->b_resid = bp->b_bcount;
274		biodone(bp);
275		return;
276	}
277
278	bp->b_rawblkno = bp->b_blkno;
279	bufq_put(sc->sc_bufq, bp);
280	cv_signal(&sc->sc_work_cv);
281
282	mutex_exit(&sc->sc_slock);
283}
284
285int
286fss_read(dev_t dev, struct uio *uio, int flags)
287{
288	return physio(fss_strategy, NULL, dev, B_READ, minphys, uio);
289}
290
291int
292fss_write(dev_t dev, struct uio *uio, int flags)
293{
294	return physio(fss_strategy, NULL, dev, B_WRITE, minphys, uio);
295}
296
297int
298fss_ioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
299{
300	int error;
301	struct fss_softc *sc = device_lookup_private(&fss_cd, minor(dev));
302	struct fss_set _fss;
303	struct fss_set *fss = (struct fss_set *)data;
304	struct fss_set50 *fss50 = (struct fss_set50 *)data;
305	struct fss_get *fsg = (struct fss_get *)data;
306#ifndef _LP64
307	struct fss_get50 *fsg50 = (struct fss_get50 *)data;
308#endif
309
310	switch (cmd) {
311	case FSSIOCSET50:
312		fss = &_fss;
313		fss->fss_mount = fss50->fss_mount;
314		fss->fss_bstore = fss50->fss_bstore;
315		fss->fss_csize = fss50->fss_csize;
316		fss->fss_flags = 0;
317		/* Fall through */
318	case FSSIOCSET:
319		mutex_enter(&sc->sc_lock);
320		if ((flag & FWRITE) == 0)
321			error = EPERM;
322		else if ((sc->sc_flags & FSS_ACTIVE) != 0)
323			error = EBUSY;
324		else
325			error = fss_create_snapshot(sc, fss, l);
326		if (error == 0)
327			sc->sc_uflags = fss->fss_flags;
328		mutex_exit(&sc->sc_lock);
329		break;
330
331	case FSSIOCCLR:
332		mutex_enter(&sc->sc_lock);
333		if ((flag & FWRITE) == 0)
334			error = EPERM;
335		else if ((sc->sc_flags & FSS_ACTIVE) == 0)
336			error = ENXIO;
337		else
338			error = fss_delete_snapshot(sc, l);
339		mutex_exit(&sc->sc_lock);
340		break;
341
342#ifndef _LP64
343	case FSSIOCGET50:
344		mutex_enter(&sc->sc_lock);
345		switch (sc->sc_flags & (FSS_PERSISTENT | FSS_ACTIVE)) {
346		case FSS_ACTIVE:
347			memcpy(fsg50->fsg_mount, sc->sc_mntname, MNAMELEN);
348			fsg50->fsg_csize = FSS_CLSIZE(sc);
349			timeval_to_timeval50(&sc->sc_time, &fsg50->fsg_time);
350			fsg50->fsg_mount_size = sc->sc_clcount;
351			fsg50->fsg_bs_size = sc->sc_clnext;
352			error = 0;
353			break;
354		case FSS_PERSISTENT | FSS_ACTIVE:
355			memcpy(fsg50->fsg_mount, sc->sc_mntname, MNAMELEN);
356			fsg50->fsg_csize = 0;
357			timeval_to_timeval50(&sc->sc_time, &fsg50->fsg_time);
358			fsg50->fsg_mount_size = 0;
359			fsg50->fsg_bs_size = 0;
360			error = 0;
361			break;
362		default:
363			error = ENXIO;
364			break;
365		}
366		mutex_exit(&sc->sc_lock);
367		break;
368#endif /* _LP64 */
369
370	case FSSIOCGET:
371		mutex_enter(&sc->sc_lock);
372		switch (sc->sc_flags & (FSS_PERSISTENT | FSS_ACTIVE)) {
373		case FSS_ACTIVE:
374			memcpy(fsg->fsg_mount, sc->sc_mntname, MNAMELEN);
375			fsg->fsg_csize = FSS_CLSIZE(sc);
376			fsg->fsg_time = sc->sc_time;
377			fsg->fsg_mount_size = sc->sc_clcount;
378			fsg->fsg_bs_size = sc->sc_clnext;
379			error = 0;
380			break;
381		case FSS_PERSISTENT | FSS_ACTIVE:
382			memcpy(fsg->fsg_mount, sc->sc_mntname, MNAMELEN);
383			fsg->fsg_csize = 0;
384			fsg->fsg_time = sc->sc_time;
385			fsg->fsg_mount_size = 0;
386			fsg->fsg_bs_size = 0;
387			error = 0;
388			break;
389		default:
390			error = ENXIO;
391			break;
392		}
393		mutex_exit(&sc->sc_lock);
394		break;
395
396	case FSSIOFSET:
397		mutex_enter(&sc->sc_slock);
398		sc->sc_uflags = *(int *)data;
399		mutex_exit(&sc->sc_slock);
400		error = 0;
401		break;
402
403	case FSSIOFGET:
404		mutex_enter(&sc->sc_slock);
405		*(int *)data = sc->sc_uflags;
406		mutex_exit(&sc->sc_slock);
407		error = 0;
408		break;
409
410	default:
411		error = EINVAL;
412		break;
413	}
414
415	return error;
416}
417
/*
 * Device size query used by the dump code.  Snapshot devices do not
 * support kernel dumps, so report "no size".
 */
int
fss_size(dev_t dev)
{

	return -1;
}
423
424int
425fss_dump(dev_t dev, daddr_t blkno, void *va,
426    size_t size)
427{
428	return EROFS;
429}
430
431/*
432 * An error occurred reading or writing the snapshot or backing store.
433 * If it is the first error log to console.
434 * The caller holds the mutex.
435 */
436static inline void
437fss_error(struct fss_softc *sc, const char *msg)
438{
439
440	if ((sc->sc_flags & (FSS_ACTIVE|FSS_ERROR)) == FSS_ACTIVE)
441		aprint_error_dev(sc->sc_dev, "snapshot invalid: %s\n", msg);
442	if ((sc->sc_flags & FSS_ACTIVE) == FSS_ACTIVE)
443		sc->sc_flags |= FSS_ERROR;
444}
445
446/*
447 * Allocate the variable sized parts of the softc and
448 * fork the kernel thread.
449 *
450 * The fields sc_clcount, sc_clshift, sc_cache_size and sc_indir_size
451 * must be initialized.
452 */
453static int
454fss_softc_alloc(struct fss_softc *sc)
455{
456	int i, error;
457
458	if ((sc->sc_flags & FSS_PERSISTENT) == 0) {
459		sc->sc_copied =
460		    kmem_zalloc(howmany(sc->sc_clcount, NBBY), KM_SLEEP);
461		if (sc->sc_copied == NULL)
462			return(ENOMEM);
463
464		sc->sc_cache = kmem_alloc(sc->sc_cache_size *
465		    sizeof(struct fss_cache), KM_SLEEP);
466		if (sc->sc_cache == NULL)
467			return(ENOMEM);
468
469		for (i = 0; i < sc->sc_cache_size; i++) {
470			sc->sc_cache[i].fc_type = FSS_CACHE_FREE;
471			sc->sc_cache[i].fc_data =
472			    kmem_alloc(FSS_CLSIZE(sc), KM_SLEEP);
473			if (sc->sc_cache[i].fc_data == NULL)
474				return(ENOMEM);
475			cv_init(&sc->sc_cache[i].fc_state_cv, "cowwait1");
476		}
477
478		sc->sc_indir_valid =
479		    kmem_zalloc(howmany(sc->sc_indir_size, NBBY), KM_SLEEP);
480		if (sc->sc_indir_valid == NULL)
481			return(ENOMEM);
482
483		sc->sc_indir_data = kmem_zalloc(FSS_CLSIZE(sc), KM_SLEEP);
484		if (sc->sc_indir_data == NULL)
485			return(ENOMEM);
486	} else {
487		sc->sc_copied = NULL;
488		sc->sc_cache = NULL;
489		sc->sc_indir_valid = NULL;
490		sc->sc_indir_data = NULL;
491	}
492
493	sc->sc_flags |= FSS_BS_THREAD;
494	if ((error = kthread_create(PRI_BIO, KTHREAD_MUSTJOIN, NULL,
495	    fss_bs_thread, sc, &sc->sc_bs_lwp,
496	    "%s", device_xname(sc->sc_dev))) != 0) {
497		sc->sc_flags &= ~FSS_BS_THREAD;
498		return error;
499	}
500
501	disk_attach(sc->sc_dkdev);
502
503	return 0;
504}
505
506/*
507 * Free the variable sized parts of the softc.
508 */
509static void
510fss_softc_free(struct fss_softc *sc)
511{
512	int i;
513
514	if ((sc->sc_flags & FSS_BS_THREAD) != 0) {
515		mutex_enter(&sc->sc_slock);
516		sc->sc_flags &= ~FSS_BS_THREAD;
517		cv_signal(&sc->sc_work_cv);
518		mutex_exit(&sc->sc_slock);
519		kthread_join(sc->sc_bs_lwp);
520
521		disk_detach(sc->sc_dkdev);
522	}
523
524	if (sc->sc_copied != NULL)
525		kmem_free(sc->sc_copied, howmany(sc->sc_clcount, NBBY));
526	sc->sc_copied = NULL;
527
528	if (sc->sc_cache != NULL) {
529		for (i = 0; i < sc->sc_cache_size; i++)
530			if (sc->sc_cache[i].fc_data != NULL) {
531				cv_destroy(&sc->sc_cache[i].fc_state_cv);
532				kmem_free(sc->sc_cache[i].fc_data,
533				    FSS_CLSIZE(sc));
534			}
535		kmem_free(sc->sc_cache,
536		    sc->sc_cache_size*sizeof(struct fss_cache));
537	}
538	sc->sc_cache = NULL;
539
540	if (sc->sc_indir_valid != NULL)
541		kmem_free(sc->sc_indir_valid, howmany(sc->sc_indir_size, NBBY));
542	sc->sc_indir_valid = NULL;
543
544	if (sc->sc_indir_data != NULL)
545		kmem_free(sc->sc_indir_data, FSS_CLSIZE(sc));
546	sc->sc_indir_data = NULL;
547}
548
549/*
550 * Set all active snapshots on this file system into ERROR state.
551 */
552static void
553fss_unmount_hook(struct mount *mp)
554{
555	int i;
556	struct fss_softc *sc;
557
558	mutex_enter(&fss_device_lock);
559	for (i = 0; i < fss_cd.cd_ndevs; i++) {
560		if ((sc = device_lookup_private(&fss_cd, i)) == NULL)
561			continue;
562		mutex_enter(&sc->sc_slock);
563		if ((sc->sc_flags & FSS_ACTIVE) != 0 &&
564		    sc->sc_mount == mp)
565			fss_error(sc, "forced unmount");
566		mutex_exit(&sc->sc_slock);
567	}
568	mutex_exit(&fss_device_lock);
569}
570
571/*
572 * A buffer is written to the snapshotted block device. Copy to
573 * backing store if needed.
574 */
575static int
576fss_copy_on_write(void *v, struct buf *bp, bool data_valid)
577{
578	int error;
579	u_int32_t cl, ch, c;
580	struct fss_softc *sc = v;
581
582	mutex_enter(&sc->sc_slock);
583	if (!FSS_ISVALID(sc)) {
584		mutex_exit(&sc->sc_slock);
585		return 0;
586	}
587
588	cl = FSS_BTOCL(sc, dbtob(bp->b_blkno));
589	ch = FSS_BTOCL(sc, dbtob(bp->b_blkno)+bp->b_bcount-1);
590	error = 0;
591	if (curlwp == uvm.pagedaemon_lwp) {
592		for (c = cl; c <= ch; c++)
593			if (isclr(sc->sc_copied, c)) {
594				error = ENOMEM;
595				break;
596			}
597	}
598	mutex_exit(&sc->sc_slock);
599
600	if (error == 0)
601		for (c = cl; c <= ch; c++) {
602			error = fss_read_cluster(sc, c);
603			if (error)
604				break;
605		}
606
607	return error;
608}
609
610/*
611 * Lookup and open needed files.
612 *
613 * For file system internal snapshot initializes sc_mntname, sc_mount,
614 * sc_bs_vp and sc_time.
615 *
616 * Otherwise returns dev and size of the underlying block device.
617 * Initializes sc_mntname, sc_mount, sc_bdev, sc_bs_vp and sc_mount
618 */
619static int
620fss_create_files(struct fss_softc *sc, struct fss_set *fss,
621    off_t *bsize, struct lwp *l)
622{
623	int i, error, bits, fsbsize;
624	uint64_t numsec;
625	unsigned int secsize;
626	struct timespec ts;
627	/* nd -> nd2 to reduce mistakes while updating only some namei calls */
628	struct pathbuf *pb2;
629	struct nameidata nd2;
630	struct vnode *vp;
631
632	/*
633	 * Get the mounted file system.
634	 */
635
636	error = namei_simple_user(fss->fss_mount,
637				NSM_FOLLOW_NOEMULROOT, &vp);
638	if (error != 0)
639		return error;
640
641	if ((vp->v_vflag & VV_ROOT) != VV_ROOT) {
642		vrele(vp);
643		return EINVAL;
644	}
645
646	sc->sc_mount = vp->v_mount;
647	memcpy(sc->sc_mntname, sc->sc_mount->mnt_stat.f_mntonname, MNAMELEN);
648
649	vrele(vp);
650
651	/*
652	 * Check for file system internal snapshot.
653	 */
654
655	error = namei_simple_user(fss->fss_bstore,
656				NSM_FOLLOW_NOEMULROOT, &vp);
657	if (error != 0)
658		return error;
659
660	if (vp->v_type == VREG && vp->v_mount == sc->sc_mount) {
661		sc->sc_flags |= FSS_PERSISTENT;
662		sc->sc_bs_vp = vp;
663
664		fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
665		bits = sizeof(sc->sc_bs_bshift)*NBBY;
666		for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < bits;
667		    sc->sc_bs_bshift++)
668			if (FSS_FSBSIZE(sc) == fsbsize)
669				break;
670		if (sc->sc_bs_bshift >= bits)
671			return EINVAL;
672
673		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
674		sc->sc_clshift = 0;
675
676		if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
677			error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
678			if (error)
679				return error;
680		}
681		error = vn_lock(vp, LK_EXCLUSIVE);
682		if (error != 0)
683			return error;
684		error = VFS_SNAPSHOT(sc->sc_mount, sc->sc_bs_vp, &ts);
685		TIMESPEC_TO_TIMEVAL(&sc->sc_time, &ts);
686
687		VOP_UNLOCK(sc->sc_bs_vp);
688
689		return error;
690	}
691	vrele(vp);
692
693	/*
694	 * Get the block device it is mounted on and its size.
695	 */
696
697	mutex_enter(&device_lock);
698	for (i = 0; i < SPECHSZ; i++) {
699		for (vp = specfs_hash[i]; vp; vp = vp->v_specnext) {
700			if (vp->v_type == VBLK &&
701			    vp == vp->v_specnode->sn_dev->sd_bdevvp &&
702			    vp->v_specmountpoint == sc->sc_mount)
703				break;
704		}
705		if (vp != NULL)
706			break;
707	}
708	if (vp == NULL) {
709		mutex_exit(&device_lock);
710		return EINVAL;
711	}
712	mutex_enter(vp->v_interlock);
713	mutex_exit(&device_lock);
714	error = vget(vp, 0);
715	if (error)
716		return error;
717	sc->sc_bdev = vp->v_rdev;
718
719	error = getdisksize(vp, &numsec, &secsize);
720	vrele(vp);
721	if (error)
722		return error;
723
724	*bsize = (off_t)numsec*secsize;
725
726	/*
727	 * Get the backing store
728	 */
729
730	error = pathbuf_copyin(fss->fss_bstore, &pb2);
731	if (error) {
732 		return error;
733	}
734	NDINIT(&nd2, LOOKUP, FOLLOW, pb2);
735	if ((error = vn_open(&nd2, FREAD|FWRITE, 0)) != 0) {
736		pathbuf_destroy(pb2);
737		return error;
738	}
739	VOP_UNLOCK(nd2.ni_vp);
740
741	sc->sc_bs_vp = nd2.ni_vp;
742
743	if (nd2.ni_vp->v_type != VREG && nd2.ni_vp->v_type != VCHR) {
744		pathbuf_destroy(pb2);
745		return EINVAL;
746	}
747	pathbuf_destroy(pb2);
748
749	if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
750		error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
751		if (error)
752			return error;
753	}
754	if (sc->sc_bs_vp->v_type == VREG) {
755		fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
756		if (fsbsize & (fsbsize-1))	/* No power of two */
757			return EINVAL;
758		for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < 32;
759		    sc->sc_bs_bshift++)
760			if (FSS_FSBSIZE(sc) == fsbsize)
761				break;
762		if (sc->sc_bs_bshift >= 32)
763			return EINVAL;
764		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
765	} else {
766		sc->sc_bs_bshift = DEV_BSHIFT;
767		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
768	}
769
770	return 0;
771}
772
773/*
774 * Create a snapshot.
775 */
776static int
777fss_create_snapshot(struct fss_softc *sc, struct fss_set *fss, struct lwp *l)
778{
779	int len, error;
780	u_int32_t csize;
781	off_t bsize;
782
783	bsize = 0;	/* XXX gcc */
784
785	/*
786	 * Open needed files.
787	 */
788	if ((error = fss_create_files(sc, fss, &bsize, l)) != 0)
789		goto bad;
790
791	if (sc->sc_flags & FSS_PERSISTENT) {
792		fss_softc_alloc(sc);
793		sc->sc_flags |= FSS_ACTIVE;
794		return 0;
795	}
796
797	/*
798	 * Set cluster size. Must be a power of two and
799	 * a multiple of backing store block size.
800	 */
801	if (fss->fss_csize <= 0)
802		csize = MAXPHYS;
803	else
804		csize = fss->fss_csize;
805	if (bsize/csize > FSS_CLUSTER_MAX)
806		csize = bsize/FSS_CLUSTER_MAX+1;
807
808	for (sc->sc_clshift = sc->sc_bs_bshift; sc->sc_clshift < 32;
809	    sc->sc_clshift++)
810		if (FSS_CLSIZE(sc) >= csize)
811			break;
812	if (sc->sc_clshift >= 32) {
813		error = EINVAL;
814		goto bad;
815	}
816	sc->sc_clmask = FSS_CLSIZE(sc)-1;
817
818	/*
819	 * Set number of cache slots.
820	 */
821	if (FSS_CLSIZE(sc) <= 8192)
822		sc->sc_cache_size = 32;
823	else if (FSS_CLSIZE(sc) <= 65536)
824		sc->sc_cache_size = 8;
825	else
826		sc->sc_cache_size = 4;
827
828	/*
829	 * Set number of clusters and size of last cluster.
830	 */
831	sc->sc_clcount = FSS_BTOCL(sc, bsize-1)+1;
832	sc->sc_clresid = FSS_CLOFF(sc, bsize-1)+1;
833
834	/*
835	 * Set size of indirect table.
836	 */
837	len = sc->sc_clcount*sizeof(u_int32_t);
838	sc->sc_indir_size = FSS_BTOCL(sc, len)+1;
839	sc->sc_clnext = sc->sc_indir_size;
840	sc->sc_indir_cur = 0;
841
842	if ((error = fss_softc_alloc(sc)) != 0)
843		goto bad;
844
845	/*
846	 * Activate the snapshot.
847	 */
848
849	if ((error = vfs_suspend(sc->sc_mount, 0)) != 0)
850		goto bad;
851
852	microtime(&sc->sc_time);
853
854	error = fscow_establish(sc->sc_mount, fss_copy_on_write, sc);
855	if (error == 0)
856		sc->sc_flags |= FSS_ACTIVE;
857
858	vfs_resume(sc->sc_mount);
859
860	if (error != 0)
861		goto bad;
862
863	aprint_debug_dev(sc->sc_dev, "%s snapshot active\n", sc->sc_mntname);
864	aprint_debug_dev(sc->sc_dev,
865	    "%u clusters of %u, %u cache slots, %u indir clusters\n",
866	    sc->sc_clcount, FSS_CLSIZE(sc),
867	    sc->sc_cache_size, sc->sc_indir_size);
868
869	return 0;
870
871bad:
872	fss_softc_free(sc);
873	if (sc->sc_bs_vp != NULL) {
874		if (sc->sc_flags & FSS_PERSISTENT)
875			vrele(sc->sc_bs_vp);
876		else
877			vn_close(sc->sc_bs_vp, FREAD|FWRITE, l->l_cred);
878	}
879	sc->sc_bs_vp = NULL;
880
881	return error;
882}
883
884/*
885 * Delete a snapshot.
886 */
887static int
888fss_delete_snapshot(struct fss_softc *sc, struct lwp *l)
889{
890
891	if ((sc->sc_flags & FSS_PERSISTENT) == 0)
892		fscow_disestablish(sc->sc_mount, fss_copy_on_write, sc);
893
894	mutex_enter(&sc->sc_slock);
895	sc->sc_flags &= ~(FSS_ACTIVE|FSS_ERROR);
896	sc->sc_mount = NULL;
897	sc->sc_bdev = NODEV;
898	mutex_exit(&sc->sc_slock);
899
900	fss_softc_free(sc);
901	if (sc->sc_flags & FSS_PERSISTENT)
902		vrele(sc->sc_bs_vp);
903	else
904		vn_close(sc->sc_bs_vp, FREAD|FWRITE, l->l_cred);
905	sc->sc_bs_vp = NULL;
906	sc->sc_flags &= ~FSS_PERSISTENT;
907
908	return 0;
909}
910
911/*
912 * Read a cluster from the snapshotted block device to the cache.
913 */
914static int
915fss_read_cluster(struct fss_softc *sc, u_int32_t cl)
916{
917	int error, todo, offset, len;
918	daddr_t dblk;
919	struct buf *bp, *mbp;
920	struct fss_cache *scp, *scl;
921
922	/*
923	 * Get a free cache slot.
924	 */
925	scl = sc->sc_cache+sc->sc_cache_size;
926
927	mutex_enter(&sc->sc_slock);
928
929restart:
930	if (isset(sc->sc_copied, cl) || !FSS_ISVALID(sc)) {
931		mutex_exit(&sc->sc_slock);
932		return 0;
933	}
934
935	for (scp = sc->sc_cache; scp < scl; scp++)
936		if (scp->fc_cluster == cl) {
937			if (scp->fc_type == FSS_CACHE_VALID) {
938				mutex_exit(&sc->sc_slock);
939				return 0;
940			} else if (scp->fc_type == FSS_CACHE_BUSY) {
941				cv_wait(&scp->fc_state_cv, &sc->sc_slock);
942				goto restart;
943			}
944		}
945
946	for (scp = sc->sc_cache; scp < scl; scp++)
947		if (scp->fc_type == FSS_CACHE_FREE) {
948			scp->fc_type = FSS_CACHE_BUSY;
949			scp->fc_cluster = cl;
950			break;
951		}
952	if (scp >= scl) {
953		cv_wait(&sc->sc_cache_cv, &sc->sc_slock);
954		goto restart;
955	}
956
957	mutex_exit(&sc->sc_slock);
958
959	/*
960	 * Start the read.
961	 */
962	dblk = btodb(FSS_CLTOB(sc, cl));
963	if (cl == sc->sc_clcount-1) {
964		todo = sc->sc_clresid;
965		memset((char *)scp->fc_data + todo, 0, FSS_CLSIZE(sc) - todo);
966	} else
967		todo = FSS_CLSIZE(sc);
968	offset = 0;
969	mbp = getiobuf(NULL, true);
970	mbp->b_bufsize = todo;
971	mbp->b_data = scp->fc_data;
972	mbp->b_resid = mbp->b_bcount = todo;
973	mbp->b_flags = B_READ;
974	mbp->b_cflags = BC_BUSY;
975	mbp->b_dev = sc->sc_bdev;
976	while (todo > 0) {
977		len = todo;
978		if (len > MAXPHYS)
979			len = MAXPHYS;
980		if (btodb(FSS_CLTOB(sc, cl)) == dblk && len == todo)
981			bp = mbp;
982		else {
983			bp = getiobuf(NULL, true);
984			nestiobuf_setup(mbp, bp, offset, len);
985		}
986		bp->b_lblkno = 0;
987		bp->b_blkno = dblk;
988		bdev_strategy(bp);
989		dblk += btodb(len);
990		offset += len;
991		todo -= len;
992	}
993	error = biowait(mbp);
994	putiobuf(mbp);
995
996	mutex_enter(&sc->sc_slock);
997	scp->fc_type = (error ? FSS_CACHE_FREE : FSS_CACHE_VALID);
998	cv_broadcast(&scp->fc_state_cv);
999	if (error == 0) {
1000		setbit(sc->sc_copied, scp->fc_cluster);
1001		cv_signal(&sc->sc_work_cv);
1002	}
1003	mutex_exit(&sc->sc_slock);
1004
1005	return error;
1006}
1007
1008/*
1009 * Read/write clusters from/to backing store.
1010 * For persistent snapshots must be called with cl == 0. off is the
1011 * offset into the snapshot.
1012 */
1013static int
1014fss_bs_io(struct fss_softc *sc, fss_io_type rw,
1015    u_int32_t cl, off_t off, int len, void *data)
1016{
1017	int error;
1018
1019	off += FSS_CLTOB(sc, cl);
1020
1021	vn_lock(sc->sc_bs_vp, LK_EXCLUSIVE|LK_RETRY);
1022
1023	error = vn_rdwr((rw == FSS_READ ? UIO_READ : UIO_WRITE), sc->sc_bs_vp,
1024	    data, len, off, UIO_SYSSPACE,
1025	    IO_ADV_ENCODE(POSIX_FADV_NOREUSE) | IO_NODELOCKED,
1026	    sc->sc_bs_lwp->l_cred, NULL, NULL);
1027	if (error == 0) {
1028		mutex_enter(sc->sc_bs_vp->v_interlock);
1029		error = VOP_PUTPAGES(sc->sc_bs_vp, trunc_page(off),
1030		    round_page(off+len), PGO_CLEANIT | PGO_FREE | PGO_SYNCIO);
1031	}
1032
1033	VOP_UNLOCK(sc->sc_bs_vp);
1034
1035	return error;
1036}
1037
1038/*
1039 * Get a pointer to the indirect slot for this cluster.
1040 */
1041static u_int32_t *
1042fss_bs_indir(struct fss_softc *sc, u_int32_t cl)
1043{
1044	u_int32_t icl;
1045	int ioff;
1046
1047	icl = cl/(FSS_CLSIZE(sc)/sizeof(u_int32_t));
1048	ioff = cl%(FSS_CLSIZE(sc)/sizeof(u_int32_t));
1049
1050	if (sc->sc_indir_cur == icl)
1051		return &sc->sc_indir_data[ioff];
1052
1053	if (sc->sc_indir_dirty) {
1054		if (fss_bs_io(sc, FSS_WRITE, sc->sc_indir_cur, 0,
1055		    FSS_CLSIZE(sc), (void *)sc->sc_indir_data) != 0)
1056			return NULL;
1057		setbit(sc->sc_indir_valid, sc->sc_indir_cur);
1058	}
1059
1060	sc->sc_indir_dirty = 0;
1061	sc->sc_indir_cur = icl;
1062
1063	if (isset(sc->sc_indir_valid, sc->sc_indir_cur)) {
1064		if (fss_bs_io(sc, FSS_READ, sc->sc_indir_cur, 0,
1065		    FSS_CLSIZE(sc), (void *)sc->sc_indir_data) != 0)
1066			return NULL;
1067	} else
1068		memset(sc->sc_indir_data, 0, FSS_CLSIZE(sc));
1069
1070	return &sc->sc_indir_data[ioff];
1071}
1072
1073/*
1074 * The kernel thread (one for every active snapshot).
1075 *
1076 * After wakeup it cleans the cache and runs the I/O requests.
1077 */
1078static void
1079fss_bs_thread(void *arg)
1080{
1081	bool thread_idle, is_valid;
1082	int error, i, todo, len, crotor, is_read;
1083	long off;
1084	char *addr;
1085	u_int32_t c, cl, ch, *indirp;
1086	struct buf *bp, *nbp;
1087	struct fss_softc *sc;
1088	struct fss_cache *scp, *scl;
1089
1090	sc = arg;
1091	scl = sc->sc_cache+sc->sc_cache_size;
1092	crotor = 0;
1093	thread_idle = false;
1094
1095	mutex_enter(&sc->sc_slock);
1096
1097	for (;;) {
1098		if (thread_idle)
1099			cv_wait(&sc->sc_work_cv, &sc->sc_slock);
1100		thread_idle = true;
1101		if ((sc->sc_flags & FSS_BS_THREAD) == 0) {
1102			mutex_exit(&sc->sc_slock);
1103			kthread_exit(0);
1104		}
1105
1106		/*
1107		 * Process I/O requests (persistent)
1108		 */
1109
1110		if (sc->sc_flags & FSS_PERSISTENT) {
1111			if ((bp = bufq_get(sc->sc_bufq)) == NULL)
1112				continue;
1113			is_valid = FSS_ISVALID(sc);
1114			is_read = (bp->b_flags & B_READ);
1115			thread_idle = false;
1116			mutex_exit(&sc->sc_slock);
1117
1118			if (is_valid) {
1119				disk_busy(sc->sc_dkdev);
1120				error = fss_bs_io(sc, FSS_READ, 0,
1121				    dbtob(bp->b_blkno), bp->b_bcount,
1122				    bp->b_data);
1123				disk_unbusy(sc->sc_dkdev,
1124				    (error ? 0 : bp->b_bcount), is_read);
1125			} else
1126				error = ENXIO;
1127
1128			bp->b_error = error;
1129			bp->b_resid = (error ? bp->b_bcount : 0);
1130			biodone(bp);
1131
1132			mutex_enter(&sc->sc_slock);
1133			continue;
1134		}
1135
1136		/*
1137		 * Clean the cache
1138		 */
1139		for (i = 0; i < sc->sc_cache_size; i++) {
1140			crotor = (crotor + 1) % sc->sc_cache_size;
1141			scp = sc->sc_cache + crotor;
1142			if (scp->fc_type != FSS_CACHE_VALID)
1143				continue;
1144			mutex_exit(&sc->sc_slock);
1145
1146			thread_idle = false;
1147			indirp = fss_bs_indir(sc, scp->fc_cluster);
1148			if (indirp != NULL) {
1149				error = fss_bs_io(sc, FSS_WRITE, sc->sc_clnext,
1150				    0, FSS_CLSIZE(sc), scp->fc_data);
1151			} else
1152				error = EIO;
1153
1154			mutex_enter(&sc->sc_slock);
1155			if (error == 0) {
1156				*indirp = sc->sc_clnext++;
1157				sc->sc_indir_dirty = 1;
1158			} else
1159				fss_error(sc, "write error on backing store");
1160
1161			scp->fc_type = FSS_CACHE_FREE;
1162			cv_signal(&sc->sc_cache_cv);
1163			break;
1164		}
1165
1166		/*
1167		 * Process I/O requests
1168		 */
1169		if ((bp = bufq_get(sc->sc_bufq)) == NULL)
1170			continue;
1171		is_valid = FSS_ISVALID(sc);
1172		is_read = (bp->b_flags & B_READ);
1173		thread_idle = false;
1174
1175		if (!is_valid) {
1176			mutex_exit(&sc->sc_slock);
1177
1178			bp->b_error = ENXIO;
1179			bp->b_resid = bp->b_bcount;
1180			biodone(bp);
1181
1182			mutex_enter(&sc->sc_slock);
1183			continue;
1184		}
1185
1186		disk_busy(sc->sc_dkdev);
1187
1188		/*
1189		 * First read from the snapshotted block device unless
1190		 * this request is completely covered by backing store.
1191		 */
1192
1193		cl = FSS_BTOCL(sc, dbtob(bp->b_blkno));
1194		off = FSS_CLOFF(sc, dbtob(bp->b_blkno));
1195		ch = FSS_BTOCL(sc, dbtob(bp->b_blkno)+bp->b_bcount-1);
1196		error = 0;
1197		bp->b_resid = 0;
1198		bp->b_error = 0;
1199		for (c = cl; c <= ch; c++) {
1200			if (isset(sc->sc_copied, c))
1201				continue;
1202			mutex_exit(&sc->sc_slock);
1203
1204			/* Not on backing store, read from device. */
1205			nbp = getiobuf(NULL, true);
1206			nbp->b_flags = B_READ;
1207			nbp->b_resid = nbp->b_bcount = bp->b_bcount;
1208			nbp->b_bufsize = bp->b_bcount;
1209			nbp->b_data = bp->b_data;
1210			nbp->b_blkno = bp->b_blkno;
1211			nbp->b_lblkno = 0;
1212			nbp->b_dev = sc->sc_bdev;
1213			SET(nbp->b_cflags, BC_BUSY);	/* mark buffer busy */
1214
1215			bdev_strategy(nbp);
1216
1217			error = biowait(nbp);
1218			if (error != 0) {
1219				bp->b_resid = bp->b_bcount;
1220				bp->b_error = nbp->b_error;
1221				disk_unbusy(sc->sc_dkdev, 0, is_read);
1222				biodone(bp);
1223			}
1224			putiobuf(nbp);
1225
1226			mutex_enter(&sc->sc_slock);
1227			break;
1228		}
1229		if (error)
1230			continue;
1231
1232		/*
1233		 * Replace those parts that have been saved to backing store.
1234		 */
1235
1236		addr = bp->b_data;
1237		todo = bp->b_bcount;
1238		for (c = cl; c <= ch; c++, off = 0, todo -= len, addr += len) {
1239			len = FSS_CLSIZE(sc)-off;
1240			if (len > todo)
1241				len = todo;
1242			if (isclr(sc->sc_copied, c))
1243				continue;
1244			mutex_exit(&sc->sc_slock);
1245
1246			indirp = fss_bs_indir(sc, c);
1247			if (indirp == NULL || *indirp == 0) {
1248				/*
1249				 * Not on backing store. Either in cache
1250				 * or hole in the snapshotted block device.
1251				 */
1252
1253				mutex_enter(&sc->sc_slock);
1254				for (scp = sc->sc_cache; scp < scl; scp++)
1255					if (scp->fc_type == FSS_CACHE_VALID &&
1256					    scp->fc_cluster == c)
1257						break;
1258				if (scp < scl)
1259					memcpy(addr, (char *)scp->fc_data+off,
1260					    len);
1261				else
1262					memset(addr, 0, len);
1263				continue;
1264			}
1265
1266			/*
1267			 * Read from backing store.
1268			 */
1269			error =
1270			    fss_bs_io(sc, FSS_READ, *indirp, off, len, addr);
1271
1272			mutex_enter(&sc->sc_slock);
1273			if (error) {
1274				bp->b_resid = bp->b_bcount;
1275				bp->b_error = error;
1276				break;
1277			}
1278		}
1279		mutex_exit(&sc->sc_slock);
1280
1281		disk_unbusy(sc->sc_dkdev, (error ? 0 : bp->b_bcount), is_read);
1282		biodone(bp);
1283
1284		mutex_enter(&sc->sc_slock);
1285	}
1286}
1287
1288#ifdef _MODULE
1289
1290#include <sys/module.h>
1291
1292MODULE(MODULE_CLASS_DRIVER, fss, NULL);
1293CFDRIVER_DECL(fss, DV_DISK, NULL);
1294
/*
 * Module control entry point.  On INIT, registers the cfdriver, the
 * cfattach and the device switch entries, unwinding in reverse order
 * on failure; EEXIST from devsw_attach() is tolerated (majors already
 * present, e.g. built-in config).  On FINI, tears the same state down.
 */
static int
fss_modcmd(modcmd_t cmd, void *arg)
{
	int bmajor = -1, cmajor = -1,  error = 0;

	switch (cmd) {
	case MODULE_CMD_INIT:
		mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
		error = config_cfdriver_attach(&fss_cd);
		if (error) {
			mutex_destroy(&fss_device_lock);
			break;
		}
		error = config_cfattach_attach(fss_cd.cd_name, &fss_ca);
		if (error) {
			config_cfdriver_detach(&fss_cd);
			mutex_destroy(&fss_device_lock);
			break;
		}
		error = devsw_attach(fss_cd.cd_name,
		    &fss_bdevsw, &bmajor, &fss_cdevsw, &cmajor);
		if (error == EEXIST)
			error = 0;
		if (error) {
			config_cfattach_detach(fss_cd.cd_name, &fss_ca);
			config_cfdriver_detach(&fss_cd);
			mutex_destroy(&fss_device_lock);
			break;
		}
		break;

	case MODULE_CMD_FINI:
		error = config_cfattach_detach(fss_cd.cd_name, &fss_ca);
		if (error)
			break;
		config_cfdriver_detach(&fss_cd);
		devsw_detach(&fss_bdevsw, &fss_cdevsw);
		mutex_destroy(&fss_device_lock);
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}
1342
1343#endif /* _MODULE */
1344