vfs_trans.c revision 1.60
1/*	$NetBSD: vfs_trans.c,v 1.60 2019/05/13 08:16:56 hannken Exp $	*/
2
3/*-
4 * Copyright (c) 2007 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Juergen Hannken-Illjes.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.60 2019/05/13 08:16:56 hannken Exp $");
34
35/*
36 * File system transaction operations.
37 */
38
39#ifdef _KERNEL_OPT
40#include "opt_ddb.h"
41#endif
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/atomic.h>
46#include <sys/buf.h>
47#include <sys/kmem.h>
48#include <sys/mount.h>
49#include <sys/pserialize.h>
50#include <sys/vnode.h>
51#include <sys/fstrans.h>
52#include <sys/proc.h>
53
54#include <miscfs/specfs/specdev.h>
55
/*
 * Kind of transaction lock an lwp holds on a mount.
 * grant_lock() decides which of these may be taken in a given
 * file system state.
 */
enum fstrans_lock_type {
	FSTRANS_LAZY,			/* Granted while not suspended */
	FSTRANS_SHARED,			/* Granted while not suspending */
	FSTRANS_EXCL			/* Internal: exclusive lock */
};
61
/*
 * One copy-on-write handler of a mount.  Entries are added by
 * fscow_establish(), removed by fscow_disestablish() and called
 * from fscow_run().
 */
struct fscow_handler {
	/* Link on the mount's fmi_cow_handler list. */
	LIST_ENTRY(fscow_handler) ch_list;
	/* Handler, called as ch_func(ch_arg, bp, data_valid). */
	int (*ch_func)(void *, struct buf *, bool);
	/* Opaque argument passed back as first argument of ch_func. */
	void *ch_arg;
};
/*
 * Per lwp, per mount transaction state.  Every entry lives on the global
 * fstrans_fli_head list; an entry in use is additionally chained on its
 * owner's l_fstrans list.  An entry with fli_self == NULL is free for reuse.
 */
struct fstrans_lwp_info {
	/* Next entry on the owning lwp's l_fstrans list. */
	struct fstrans_lwp_info *fli_succ;
	/* Owning lwp, NULL while the entry is unused. */
	struct lwp *fli_self;
	/* Mount this entry is attached to, NULL when cleared. */
	struct mount *fli_mount;
	/* Entry of the lowest mnt_lower mount, NULL if there is none. */
	struct fstrans_lwp_info *fli_alias;
	/* Mount info of fli_mount; the entry holds one reference on it. */
	struct fstrans_mount_info *fli_mountinfo;
	/* Transaction nesting depth, zero when no transaction is active. */
	int fli_trans_cnt;
	/* Number of entries whose fli_alias points at this entry. */
	int fli_alias_cnt;
	/* fscow_run() nesting depth. */
	int fli_cow_cnt;
	/* Lock type of the active transaction (set when fli_trans_cnt is). */
	enum fstrans_lock_type fli_lock_type;
	/* Link on the global fstrans_fli_head list. */
	LIST_ENTRY(fstrans_lwp_info) fli_list;
};
/*
 * Per mount transaction state, reachable through mp->mnt_transinfo until
 * fstrans_unmount() and freed by fstrans_mount_dtor() with the last
 * reference.
 */
struct fstrans_mount_info {
	/* Current state, changed by fstrans_setstate(). */
	enum fstrans_state fmi_state;
	/*
	 * References: one from fstrans_mount(), one per attached
	 * fstrans_lwp_info and one per fscow_establish().
	 */
	unsigned int fmi_ref_cnt;
	/* Set by fstrans_unmount(): the mount is going away. */
	bool fmi_gone;
	/* A thread is changing fmi_cow_handler. */
	bool fmi_cow_change;
	/* List of copy-on-write handlers. */
	LIST_HEAD(, fscow_handler) fmi_cow_handler;
	/* Back pointer to the mount. */
	struct mount *fmi_mount;
};
87
/*
 * Global state.
 *
 * vfs_suspend_lock is taken by vfs_suspend() and, on success, stays held
 * until vfs_resume().
 * fstrans_lock is held while changing fmi_state or fmi_cow_change, while
 * waiting on either condition variable and while walking or changing
 * fstrans_fli_head.
 * fstrans_mount_lock is held while changing mnt_transinfo, fmi_ref_cnt
 * and fstrans_gone_count.
 */
static kmutex_t vfs_suspend_lock;	/* Serialize suspensions. */
static kmutex_t fstrans_lock;		/* Fstrans big lock. */
static kmutex_t fstrans_mount_lock;	/* Fstrans mount big lock. */
static kcondvar_t fstrans_state_cv;	/* Fstrans or cow state changed. */
static kcondvar_t fstrans_count_cv;	/* Fstrans or cow count changed. */
static pserialize_t fstrans_psz;	/* Pserialize state. */
static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
					/* List of all fstrans_lwp_info. */
static int fstrans_gone_count;		/* Number of fstrans_mount_info gone. */
97
/* Mount info and per lwp info management. */
static void fstrans_mount_dtor(struct fstrans_mount_info *);
static void fstrans_clear_lwp_info(void);
static inline struct fstrans_lwp_info *
    fstrans_get_lwp_info(struct mount *, bool);
static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
/* Transaction start and state change helpers. */
static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
static bool grant_lock(const enum fstrans_state, const enum fstrans_lock_type);
static bool state_change_done(const struct fstrans_mount_info *);
/* Copy-on-write handler list change helpers. */
static bool cow_state_change_done(const struct fstrans_mount_info *);
static void cow_change_enter(struct fstrans_mount_info *);
static void cow_change_done(struct fstrans_mount_info *);

/* Defined elsewhere; most entry points here special-case or reject it. */
extern struct mount *dead_rootmount;
111
112#if defined(DIAGNOSTIC)
113
/*
 * DIAGNOSTIC only: record of one mount known to fstrans.  The list is
 * maintained by fstrans_debug_mount() / fstrans_debug_unmount() and
 * checked by fstrans_debug_validate_mount().
 */
struct fstrans_debug_mount {
	struct mount *fdm_mount;		/* The registered mount */
	SLIST_ENTRY(fstrans_debug_mount) fdm_list;	/* List linkage */
};

/* All registered mounts; accessed with fstrans_mount_lock held. */
static SLIST_HEAD(, fstrans_debug_mount) fstrans_debug_mount_head =
    SLIST_HEAD_INITIALIZER(fstrans_debug_mount_head);
121
122static void
123fstrans_debug_mount(struct mount *mp)
124{
125	struct fstrans_debug_mount *fdm, *new;
126
127	KASSERT(mutex_owned(&fstrans_mount_lock));
128
129	mutex_exit(&fstrans_mount_lock);
130	new = kmem_alloc(sizeof(*new), KM_SLEEP);
131	new->fdm_mount = mp;
132	mutex_enter(&fstrans_mount_lock);
133
134	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
135		KASSERT(fdm->fdm_mount != mp);
136	SLIST_INSERT_HEAD(&fstrans_debug_mount_head, new, fdm_list);
137}
138
139static void
140fstrans_debug_unmount(struct mount *mp)
141{
142	struct fstrans_debug_mount *fdm;
143
144	KASSERT(mutex_owned(&fstrans_mount_lock));
145
146	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
147		if (fdm->fdm_mount == mp)
148			break;
149	KASSERT(fdm != NULL);
150	SLIST_REMOVE(&fstrans_debug_mount_head, fdm,
151	    fstrans_debug_mount, fdm_list);
152	kmem_free(fdm, sizeof(*fdm));
153}
154
155static void
156fstrans_debug_validate_mount(struct mount *mp)
157{
158	struct fstrans_debug_mount *fdm;
159
160	KASSERT(mutex_owned(&fstrans_mount_lock));
161
162	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
163		if (fdm->fdm_mount == mp)
164			break;
165	KASSERTMSG(fdm != NULL, "mount %p invalid", mp);
166}
167
168#else /* defined(DIAGNOSTIC) */
169
/* Without DIAGNOSTIC the mount bookkeeping checks expand to nothing. */
#define fstrans_debug_mount(mp)
#define fstrans_debug_unmount(mp)
#define fstrans_debug_validate_mount(mp)
173
174#endif  /* defined(DIAGNOSTIC) */
175
176/*
177 * Initialize.
178 */
179void
180fstrans_init(void)
181{
182
183	mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
184	mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
185	mutex_init(&fstrans_mount_lock, MUTEX_DEFAULT, IPL_NONE);
186	cv_init(&fstrans_state_cv, "fstchg");
187	cv_init(&fstrans_count_cv, "fstcnt");
188	fstrans_psz = pserialize_create();
189	LIST_INIT(&fstrans_fli_head);
190}
191
192/*
193 * Deallocate lwp state.
194 */
195void
196fstrans_lwp_dtor(lwp_t *l)
197{
198	struct fstrans_lwp_info *fli, *fli_next;
199
200	for (fli = l->l_fstrans; fli; fli = fli_next) {
201		KASSERT(fli->fli_trans_cnt == 0);
202		KASSERT(fli->fli_cow_cnt == 0);
203		KASSERT(fli->fli_self == l);
204		if (fli->fli_mount != NULL)
205			fstrans_mount_dtor(fli->fli_mountinfo);
206		fli_next = fli->fli_succ;
207		fli->fli_alias_cnt = 0;
208		fli->fli_mount = NULL;
209		fli->fli_alias = NULL;
210		fli->fli_mountinfo = NULL;
211		membar_sync();
212		fli->fli_self = NULL;
213	}
214
215	l->l_fstrans = NULL;
216}
217
218/*
219 * Dereference mount state.
220 */
221static void
222fstrans_mount_dtor(struct fstrans_mount_info *fmi)
223{
224
225	mutex_enter(&fstrans_mount_lock);
226
227	KASSERT(fmi != NULL);
228	fmi->fmi_ref_cnt -= 1;
229	if (fmi->fmi_ref_cnt > 0) {
230		mutex_exit(&fstrans_mount_lock);
231		return;
232	}
233
234	KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
235	KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
236
237	KASSERT(fstrans_gone_count > 0);
238	fstrans_gone_count -= 1;
239
240	mutex_exit(&fstrans_mount_lock);
241
242	kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
243	kmem_free(fmi, sizeof(*fmi));
244}
245
246/*
247 * Allocate mount state.
248 */
249int
250fstrans_mount(struct mount *mp)
251{
252	struct fstrans_mount_info *newfmi;
253
254	newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
255	newfmi->fmi_state = FSTRANS_NORMAL;
256	newfmi->fmi_ref_cnt = 1;
257	newfmi->fmi_gone = false;
258	LIST_INIT(&newfmi->fmi_cow_handler);
259	newfmi->fmi_cow_change = false;
260	newfmi->fmi_mount = mp;
261
262	mutex_enter(&fstrans_mount_lock);
263	mp->mnt_transinfo = newfmi;
264	fstrans_debug_mount(mp);
265	mutex_exit(&fstrans_mount_lock);
266
267	return 0;
268}
269
270/*
271 * Deallocate mount state.
272 */
273void
274fstrans_unmount(struct mount *mp)
275{
276	struct fstrans_mount_info *fmi = mp->mnt_transinfo;
277
278	KASSERT(fmi != NULL);
279
280	mutex_enter(&fstrans_mount_lock);
281	fstrans_debug_unmount(mp);
282	fmi->fmi_gone = true;
283	mp->mnt_transinfo = NULL;
284	fstrans_gone_count += 1;
285	mutex_exit(&fstrans_mount_lock);
286
287	fstrans_mount_dtor(fmi);
288}
289
/*
 * Clear mount entries whose mount is gone.
 *
 * Walks curlwp's l_fstrans list and releases every entry that is attached
 * to a mount marked fmi_gone and has no transaction, cow or alias count
 * left.  A released entry is unlinked from the lwp's list, drops its
 * reference on the mount info and is marked unused by clearing fli_self.
 * It is not removed from fstrans_fli_head here.
 */
static void
fstrans_clear_lwp_info(void)
{
	struct fstrans_lwp_info **p, *fli;

	/*
	 * Scan our list clearing entries whose mount is gone.
	 * "p" addresses the link pointing at the entry under test so the
	 * entry can be unlinked without tracking its predecessor.
	 */
	for (p = &curlwp->l_fstrans; *p; ) {
		fli = *p;
		if (fli->fli_mount != NULL &&
		    fli->fli_mountinfo->fmi_gone &&
		    fli->fli_trans_cnt == 0 &&
		    fli->fli_cow_cnt == 0 &&
		    fli->fli_alias_cnt == 0) {
			/* Unlink from our list, then drop the mount ref. */
			*p = (*p)->fli_succ;
			fstrans_mount_dtor(fli->fli_mountinfo);
			if (fli->fli_alias) {
				KASSERT(fli->fli_alias->fli_alias_cnt > 0);
				fli->fli_alias->fli_alias_cnt--;
			}
			fli->fli_mount = NULL;
			fli->fli_alias = NULL;
			fli->fli_mountinfo = NULL;
			/* Clear the fields first, release ownership last. */
			membar_sync();
			fli->fli_self = NULL;
			/*
			 * Start over from the head.
			 * NOTE(review): presumably because the alias count
			 * dropped above can make an already visited entry
			 * eligible -- confirm.
			 */
			p = &curlwp->l_fstrans;
		} else {
			p = &(*p)->fli_succ;
		}
	}
#ifdef DIAGNOSTIC
	/* Every remaining alias must still belong to us. */
	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
		if (fli->fli_alias != NULL)
			KASSERT(fli->fli_alias->fli_self == curlwp);
#endif /* DIAGNOSTIC */
}
330
/*
 * Allocate and return per lwp info for this mount.
 *
 * If curlwp already has an entry attached to "mp" that entry is returned
 * as is.  Otherwise an unused entry (fli_self == NULL) from the global
 * list is taken over, or a new one is allocated, and attached to "mp"
 * with a reference on its mount info.
 *
 * If "mp" has a mnt_lower chain, the entry for the lowest mount of that
 * chain is obtained recursively, recorded as fli_alias with its
 * fli_alias_cnt incremented, and returned instead of the entry for "mp".
 */
static struct fstrans_lwp_info *
fstrans_alloc_lwp_info(struct mount *mp)
{
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

	/* Already attached to this mount? */
	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
		if (fli->fli_mount == mp)
			return fli;
	}

	/*
	 * Try to reuse a cleared entry or allocate a new one.
	 * LIST_FOREACH leaves "fli" NULL when no unused entry was found.
	 */
	mutex_enter(&fstrans_lock);
	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
		membar_sync();
		if (fli->fli_self == NULL) {
			KASSERT(fli->fli_mount == NULL);
			KASSERT(fli->fli_trans_cnt == 0);
			KASSERT(fli->fli_cow_cnt == 0);
			KASSERT(fli->fli_alias_cnt == 0);
			fli->fli_self = curlwp;
			fli->fli_succ = curlwp->l_fstrans;
			curlwp->l_fstrans = fli;
			break;
		}
	}
	mutex_exit(&fstrans_lock);

	if (fli == NULL) {
		/* The sleeping allocation is done without fstrans_lock. */
		fli = kmem_alloc(sizeof(*fli), KM_SLEEP);
		mutex_enter(&fstrans_lock);
		memset(fli, 0, sizeof(*fli));
		fli->fli_self = curlwp;
		LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
		mutex_exit(&fstrans_lock);
		fli->fli_succ = curlwp->l_fstrans;
		curlwp->l_fstrans = fli;
	}

	/*
	 * Attach the entry to the mount; its mnt_transinfo is asserted
	 * to be valid.
	 */

	mutex_enter(&fstrans_mount_lock);
	fstrans_debug_validate_mount(mp);
	fmi = mp->mnt_transinfo;
	KASSERT(fmi != NULL);
	fli->fli_mount = mp;
	fli->fli_mountinfo = fmi;
	fmi->fmi_ref_cnt += 1;
	/*
	 * Follow mnt_lower to the end of the chain.  Afterwards "mp" is
	 * NULL if the mount has no lower mount, else the lowest one.
	 */
	do {
		mp = mp->mnt_lower;
	} while (mp && mp->mnt_lower);
	mutex_exit(&fstrans_mount_lock);

	if (mp) {
		/* Counts are kept on the entry of the lowest mount. */
		fli->fli_alias = fstrans_alloc_lwp_info(mp);
		fli->fli_alias->fli_alias_cnt++;
		fli = fli->fli_alias;
	}

	return fli;
}
399
400/*
401 * Retrieve the per lwp info for this mount allocating if necessary.
402 */
403static inline struct fstrans_lwp_info *
404fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
405{
406	struct fstrans_lwp_info *fli;
407
408	/*
409	 * Scan our list for a match.
410	 */
411	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
412		if (fli->fli_mount == mp) {
413			KASSERT((mp->mnt_lower == NULL) ==
414			    (fli->fli_alias == NULL));
415			if (fli->fli_alias != NULL)
416				fli = fli->fli_alias;
417			break;
418		}
419	}
420
421	if (do_alloc) {
422		if (__predict_false(fli == NULL))
423			fli = fstrans_alloc_lwp_info(mp);
424		KASSERT(fli != NULL && !fli->fli_mountinfo->fmi_gone);
425	} else {
426		KASSERT(fli != NULL);
427	}
428
429	return fli;
430}
431
432/*
433 * Check if this lock type is granted at this state.
434 */
435static bool
436grant_lock(const enum fstrans_state state, const enum fstrans_lock_type type)
437{
438
439	if (__predict_true(state == FSTRANS_NORMAL))
440		return true;
441	if (type == FSTRANS_EXCL)
442		return true;
443	if  (state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
444		return true;
445
446	return false;
447}
448
/*
 * Start a transaction.  If this thread already has a transaction on this
 * file system increment the reference counter.
 *
 * mp:		mount to start the transaction on.
 * lock_type:	lock type to take; a nested start must not ask for
 *		FSTRANS_EXCL.
 * wait:	if zero return EBUSY instead of sleeping when the lock is
 *		not granted in the current state.
 *
 * Returns 0 on success or EBUSY.  Unless FSTRANS_DEAD_ENABLED is defined
 * a transaction on dead_rootmount succeeds without doing anything.
 */
static inline int
_fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
{
	int s;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

#ifndef FSTRANS_DEAD_ENABLED
	if (mp == dead_rootmount)
		return 0;
#endif

	ASSERT_SLEEPABLE();

	fli = fstrans_get_lwp_info(mp, true);
	fmi = fli->fli_mountinfo;

	/* Nested start: only count it, the lock type stays as it is. */
	if (fli->fli_trans_cnt > 0) {
		KASSERT(lock_type != FSTRANS_EXCL);
		fli->fli_trans_cnt += 1;

		return 0;
	}

	/*
	 * Fast path: check the state and take the lock inside a
	 * pserialize read section, without fstrans_lock.
	 */
	s = pserialize_read_enter();
	if (__predict_true(grant_lock(fmi->fmi_state, lock_type))) {
		fli->fli_trans_cnt = 1;
		fli->fli_lock_type = lock_type;
		pserialize_read_exit(s);

		return 0;
	}
	pserialize_read_exit(s);

	if (! wait)
		return EBUSY;

	/*
	 * Slow path: sleep on fstrans_state_cv until the state grants
	 * this lock type.
	 */
	mutex_enter(&fstrans_lock);
	while (! grant_lock(fmi->fmi_state, lock_type))
		cv_wait(&fstrans_state_cv, &fstrans_lock);
	fli->fli_trans_cnt = 1;
	fli->fli_lock_type = lock_type;
	mutex_exit(&fstrans_lock);

	return 0;
}
499
500void
501fstrans_start(struct mount *mp)
502{
503	int error __diagused;
504
505	error = _fstrans_start(mp, FSTRANS_SHARED, 1);
506	KASSERT(error == 0);
507}
508
509int
510fstrans_start_nowait(struct mount *mp)
511{
512
513	return _fstrans_start(mp, FSTRANS_SHARED, 0);
514}
515
516void
517fstrans_start_lazy(struct mount *mp)
518{
519	int error __diagused;
520
521	error = _fstrans_start(mp, FSTRANS_LAZY, 1);
522	KASSERT(error == 0);
523}
524
/*
 * Finish a transaction.
 *
 * A nested transaction only decrements the counter.  Ending the outermost
 * one resets the counter to zero and, if the file system is not in
 * FSTRANS_NORMAL, signals fstrans_count_cv so a thread waiting in
 * fstrans_setstate() rechecks the counts.  Unless FSTRANS_DEAD_ENABLED is
 * defined this is a no-op for dead_rootmount.
 */
void
fstrans_done(struct mount *mp)
{
	int s;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

#ifndef FSTRANS_DEAD_ENABLED
	if (mp == dead_rootmount)
		return;
#endif

	/* The entry must exist, a transaction was started on it. */
	fli = fstrans_get_lwp_info(mp, false);
	fmi = fli->fli_mountinfo;
	KASSERT(fli->fli_trans_cnt > 0);

	if (fli->fli_trans_cnt > 1) {
		fli->fli_trans_cnt -= 1;

		return;
	}

	/*
	 * Some mount was unmounted: drop our idle entries for gone mounts.
	 * The entry of this transaction still has a count of one here and
	 * is therefore left alone.
	 */
	if (__predict_false(fstrans_gone_count > 0))
		fstrans_clear_lwp_info();

	/* Fast path: nobody waits for us while the state is normal. */
	s = pserialize_read_enter();
	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
		fli->fli_trans_cnt = 0;
		pserialize_read_exit(s);

		return;
	}
	pserialize_read_exit(s);

	/* Slow path: release under the lock and wake up a waiter. */
	mutex_enter(&fstrans_lock);
	fli->fli_trans_cnt = 0;
	cv_signal(&fstrans_count_cv);
	mutex_exit(&fstrans_lock);
}
567
568/*
569 * Check if this thread has an exclusive lock.
570 */
571int
572fstrans_is_owner(struct mount *mp)
573{
574	struct fstrans_lwp_info *fli;
575
576	KASSERT(mp != dead_rootmount);
577
578	fli = fstrans_get_lwp_info(mp, true);
579
580	if (fli->fli_trans_cnt == 0)
581		return 0;
582
583	return (fli->fli_lock_type == FSTRANS_EXCL);
584}
585
586/*
587 * True, if no thread is in a transaction not granted at the current state.
588 */
589static bool
590state_change_done(const struct fstrans_mount_info *fmi)
591{
592	struct fstrans_lwp_info *fli;
593
594	KASSERT(mutex_owned(&fstrans_lock));
595
596	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
597		if (fli->fli_mountinfo != fmi)
598			continue;
599		if (fli->fli_trans_cnt == 0)
600			continue;
601		if (grant_lock(fmi->fmi_state, fli->fli_lock_type))
602			continue;
603
604		return false;
605	}
606
607	return true;
608}
609
/*
 * Set new file system state.
 *
 * Publishes "new_state" and waits until no thread holds a transaction
 * that is not granted at that state.  If the wait is interrupted the
 * state is reset to FSTRANS_NORMAL and the error from cv_wait_sig() is
 * returned; otherwise 0 is returned.
 *
 * The calling thread takes an FSTRANS_EXCL transaction when the state
 * leaves FSTRANS_NORMAL and finishes a transaction when the state
 * becomes FSTRANS_NORMAL again.
 */
int
fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
{
	int error;
	enum fstrans_state old_state;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

	KASSERT(mp != dead_rootmount);

	fli = fstrans_get_lwp_info(mp, true);
	fmi = fli->fli_mountinfo;
	old_state = fmi->fmi_state;
	if (old_state == new_state)
		return 0;

	mutex_enter(&fstrans_lock);
	fmi->fmi_state = new_state;
	pserialize_perform(fstrans_psz);

	/*
	 * All threads see the new state now.
	 * Wait for transactions invalid at this state to leave.
	 */
	error = 0;
	while (! state_change_done(fmi)) {
		error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
		if (error) {
			/* Interrupted: give up and go back to normal. */
			new_state = fmi->fmi_state = FSTRANS_NORMAL;
			break;
		}
	}
	/* Wake up threads waiting for a lock in _fstrans_start(). */
	cv_broadcast(&fstrans_state_cv);
	mutex_exit(&fstrans_lock);

	/*
	 * "new_state" is the state actually reached.  If an interrupted
	 * change started from FSTRANS_NORMAL both states are equal here
	 * and no transaction is started or finished.
	 */
	if (old_state != new_state) {
		if (old_state == FSTRANS_NORMAL)
			_fstrans_start(mp, FSTRANS_EXCL, 1);
		if (new_state == FSTRANS_NORMAL)
			fstrans_done(mp);
	}

	return error;
}
657
658/*
659 * Get current file system state.
660 */
661enum fstrans_state
662fstrans_getstate(struct mount *mp)
663{
664	struct fstrans_lwp_info *fli;
665	struct fstrans_mount_info *fmi;
666
667	KASSERT(mp != dead_rootmount);
668
669	fli = fstrans_get_lwp_info(mp, true);
670	fmi = fli->fli_mountinfo;
671
672	return fmi->fmi_state;
673}
674
675/*
676 * Request a filesystem to suspend all operations.
677 */
678int
679vfs_suspend(struct mount *mp, int nowait)
680{
681	struct fstrans_lwp_info *fli;
682	int error;
683
684	if (mp == dead_rootmount)
685		return EOPNOTSUPP;
686
687	fli = fstrans_get_lwp_info(mp, true);
688	mp = fli->fli_mount;
689
690	if (nowait) {
691		if (!mutex_tryenter(&vfs_suspend_lock))
692			return EWOULDBLOCK;
693	} else
694		mutex_enter(&vfs_suspend_lock);
695
696	if ((error = VFS_SUSPENDCTL(mp, SUSPEND_SUSPEND)) != 0)
697		mutex_exit(&vfs_suspend_lock);
698
699	return error;
700}
701
702/*
703 * Request a filesystem to resume all operations.
704 */
705void
706vfs_resume(struct mount *mp)
707{
708	struct fstrans_lwp_info *fli;
709
710	KASSERT(mp != dead_rootmount);
711
712	fli = fstrans_get_lwp_info(mp, false);
713	mp = fli->fli_mount;
714
715	VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
716	mutex_exit(&vfs_suspend_lock);
717}
718
719
720/*
721 * True, if no thread is running a cow handler.
722 */
723static bool
724cow_state_change_done(const struct fstrans_mount_info *fmi)
725{
726	struct fstrans_lwp_info *fli;
727
728	KASSERT(mutex_owned(&fstrans_lock));
729	KASSERT(fmi->fmi_cow_change);
730
731	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
732		if (fli->fli_mount != fmi->fmi_mount)
733			continue;
734		if (fli->fli_cow_cnt == 0)
735			continue;
736
737		return false;
738	}
739
740	return true;
741}
742
/*
 * Prepare for changing this mounts cow list.
 * Returns with fstrans_lock locked.
 *
 * Only one thread at a time may change the list: fmi_cow_change is
 * claimed here and released by cow_change_done().
 */
static void
cow_change_enter(struct fstrans_mount_info *fmi)
{

	mutex_enter(&fstrans_lock);

	/*
	 * Wait for other threads changing the list.
	 */
	while (fmi->fmi_cow_change)
		cv_wait(&fstrans_state_cv, &fstrans_lock);

	/*
	 * Wait until all threads are aware of a state change.
	 */
	fmi->fmi_cow_change = true;
	pserialize_perform(fstrans_psz);

	/*
	 * Wait until no thread has a cow run in progress on this mount;
	 * fscow_run() signals fstrans_count_cv when its run ends.
	 */
	while (! cow_state_change_done(fmi))
		cv_wait(&fstrans_count_cv, &fstrans_lock);
}
768
/*
 * Done changing this mounts cow list.
 *
 * Called with fstrans_lock held, as left by cow_change_enter(); clears
 * fmi_cow_change, wakes up all threads sleeping on fstrans_state_cv and
 * releases fstrans_lock.
 */
static void
cow_change_done(struct fstrans_mount_info *fmi)
{

	KASSERT(mutex_owned(&fstrans_lock));

	fmi->fmi_cow_change = false;
	pserialize_perform(fstrans_psz);

	/* Threads in fscow_run() and cow_change_enter() may sleep on this. */
	cv_broadcast(&fstrans_state_cv);

	mutex_exit(&fstrans_lock);
}
785
786/*
787 * Add a handler to this mount.
788 */
789int
790fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
791    void *arg)
792{
793	struct fstrans_mount_info *fmi;
794	struct fscow_handler *newch;
795
796	KASSERT(mp != dead_rootmount);
797
798	mutex_enter(&fstrans_mount_lock);
799	fmi = mp->mnt_transinfo;
800	KASSERT(fmi != NULL);
801	fmi->fmi_ref_cnt += 1;
802	mutex_exit(&fstrans_mount_lock);
803
804	newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
805	newch->ch_func = func;
806	newch->ch_arg = arg;
807
808	cow_change_enter(fmi);
809	LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
810	cow_change_done(fmi);
811
812	return 0;
813}
814
815/*
816 * Remove a handler from this mount.
817 */
818int
819fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
820    void *arg)
821{
822	struct fstrans_mount_info *fmi;
823	struct fscow_handler *hp = NULL;
824
825	KASSERT(mp != dead_rootmount);
826
827	fmi = mp->mnt_transinfo;
828	KASSERT(fmi != NULL);
829
830	cow_change_enter(fmi);
831	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
832		if (hp->ch_func == func && hp->ch_arg == arg)
833			break;
834	if (hp != NULL) {
835		LIST_REMOVE(hp, ch_list);
836		kmem_free(hp, sizeof(*hp));
837	}
838	cow_change_done(fmi);
839
840	fstrans_mount_dtor(fmi);
841
842	return hp ? 0 : EINVAL;
843}
844
/*
 * Check for need to copy block that is about to be written.
 *
 * Runs the copy-on-write handlers of the mount the buffer belongs to.
 * Returns 0 if no handler had to run or all handlers succeeded, in which
 * case B_COWDONE is set on the buffer, or the first non-zero value
 * returned by a handler.
 *
 * "data_valid" is passed through to the handlers unchanged.
 */
int
fscow_run(struct buf *bp, bool data_valid)
{
	int error, s;
	struct mount *mp;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;
	struct fscow_handler *hp;

	/*
	 * First check if we need run the copy-on-write handler.
	 */
	if ((bp->b_flags & B_COWDONE))
		return 0;
	if (bp->b_vp == NULL) {
		bp->b_flags |= B_COWDONE;
		return 0;
	}
	/* For a block device use the file system mounted on it. */
	if (bp->b_vp->v_type == VBLK)
		mp = spec_node_getmountedfs(bp->b_vp);
	else
		mp = bp->b_vp->v_mount;
	if (mp == NULL || mp == dead_rootmount) {
		bp->b_flags |= B_COWDONE;
		return 0;
	}

	fli = fstrans_get_lwp_info(mp, true);
	fmi = fli->fli_mountinfo;

	/*
	 * On non-recursed run check if other threads
	 * want to change the list.
	 */
	if (fli->fli_cow_cnt == 0) {
		s = pserialize_read_enter();
		if (__predict_false(fmi->fmi_cow_change)) {
			/* Wait under the lock until the change is done. */
			pserialize_read_exit(s);
			mutex_enter(&fstrans_lock);
			while (fmi->fmi_cow_change)
				cv_wait(&fstrans_state_cv, &fstrans_lock);
			fli->fli_cow_cnt = 1;
			mutex_exit(&fstrans_lock);
		} else {
			fli->fli_cow_cnt = 1;
			pserialize_read_exit(s);
		}
	} else
		fli->fli_cow_cnt += 1;

	/*
	 * Run all copy-on-write handlers, stop on error.
	 */
	error = 0;
	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
		if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
			break;
	if (error == 0)
		bp->b_flags |= B_COWDONE;

	/*
	 * Check if other threads want to change the list.
	 * A recursed run only decrements the counter.
	 */
	if (fli->fli_cow_cnt > 1) {
		fli->fli_cow_cnt -= 1;
	} else {
		s = pserialize_read_enter();
		if (__predict_false(fmi->fmi_cow_change)) {
			/* Wake up the thread in cow_change_enter(). */
			pserialize_read_exit(s);
			mutex_enter(&fstrans_lock);
			fli->fli_cow_cnt = 0;
			cv_signal(&fstrans_count_cv);
			mutex_exit(&fstrans_lock);
		} else {
			fli->fli_cow_cnt = 0;
			pserialize_read_exit(s);
		}
	}

	return error;
}
929
930#if defined(DDB)
/* Only built with DDB; prints the fstrans state of all lwps and mounts. */
void fstrans_dump(int);
932
933static void
934fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
935{
936	char prefix[9];
937	struct fstrans_lwp_info *fli;
938
939	snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
940	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
941		if (fli->fli_self != l)
942			continue;
943		if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
944			if (! verbose)
945				continue;
946		}
947		printf("%-8s", prefix);
948		if (verbose)
949			printf(" @%p", fli);
950		if (fli->fli_mount == dead_rootmount)
951			printf(" <dead>");
952		else if (fli->fli_mount != NULL)
953			printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
954		else
955			printf(" NULL");
956		if (fli->fli_alias != NULL) {
957			struct mount *amp = fli->fli_alias->fli_mount;
958
959			printf(" alias");
960			if (verbose)
961				printf(" @%p", fli->fli_alias);
962			if (amp == NULL)
963				printf(" NULL");
964			else
965				printf(" (%s)", amp->mnt_stat.f_mntonname);
966		}
967		if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
968			printf(" gone");
969		if (fli->fli_trans_cnt == 0) {
970			printf(" -");
971		} else {
972			switch (fli->fli_lock_type) {
973			case FSTRANS_LAZY:
974				printf(" lazy");
975				break;
976			case FSTRANS_SHARED:
977				printf(" shared");
978				break;
979			case FSTRANS_EXCL:
980				printf(" excl");
981				break;
982			default:
983				printf(" %#x", fli->fli_lock_type);
984				break;
985			}
986		}
987		printf(" %d cow %d alias %d\n",
988		    fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
989		prefix[0] = '\0';
990	}
991}
992
993static void
994fstrans_print_mount(struct mount *mp, int verbose)
995{
996	struct fstrans_mount_info *fmi;
997
998	fmi = mp->mnt_transinfo;
999	if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
1000		return;
1001
1002	printf("%-16s ", mp->mnt_stat.f_mntonname);
1003	if (fmi == NULL) {
1004		printf("(null)\n");
1005		return;
1006	}
1007	switch (fmi->fmi_state) {
1008	case FSTRANS_NORMAL:
1009		printf("state normal\n");
1010		break;
1011	case FSTRANS_SUSPENDING:
1012		printf("state suspending\n");
1013		break;
1014	case FSTRANS_SUSPENDED:
1015		printf("state suspended\n");
1016		break;
1017	default:
1018		printf("state %#x\n", fmi->fmi_state);
1019		break;
1020	}
1021}
1022
1023void
1024fstrans_dump(int full)
1025{
1026	const struct proclist_desc *pd;
1027	struct proc *p;
1028	struct lwp *l;
1029	struct mount *mp;
1030
1031	printf("Fstrans locks by lwp:\n");
1032	for (pd = proclists; pd->pd_list != NULL; pd++)
1033		PROCLIST_FOREACH(p, pd->pd_list)
1034			LIST_FOREACH(l, &p->p_lwps, l_sibling)
1035				fstrans_print_lwp(p, l, full == 1);
1036
1037	printf("Fstrans state by mount:\n");
1038	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
1039		fstrans_print_mount(mp, full == 1);
1040}
1041#endif /* defined(DDB) */
1042