vfs_trans.c revision 1.62
1/*	$NetBSD: vfs_trans.c,v 1.62 2020/05/13 09:21:30 hannken Exp $	*/
2
3/*-
4 * Copyright (c) 2007 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Juergen Hannken-Illjes.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.62 2020/05/13 09:21:30 hannken Exp $");
34
35/*
36 * File system transaction operations.
37 */
38
39#ifdef _KERNEL_OPT
40#include "opt_ddb.h"
41#endif
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/atomic.h>
46#include <sys/buf.h>
47#include <sys/kmem.h>
48#include <sys/mount.h>
49#include <sys/pserialize.h>
50#include <sys/vnode.h>
51#include <sys/fstrans.h>
52#include <sys/proc.h>
53
54#include <miscfs/specfs/specdev.h>
55
/*
 * Kinds of transaction lock an lwp may request on a mount.
 * See grant_lock() for the states in which each kind is granted.
 */
enum fstrans_lock_type {
	/* Granted while not suspended */
	FSTRANS_LAZY,
	/* Granted while not suspending */
	FSTRANS_SHARED
};
60
/*
 * One registered copy-on-write handler.  Handlers are linked on the
 * fmi_cow_handler list of a mount and invoked by fscow_run().
 */
struct fscow_handler {
	/* Link on the mount's fmi_cow_handler list. */
	LIST_ENTRY(fscow_handler) ch_list;
	/* Handler; called as ch_func(ch_arg, bp, data_valid). */
	int (*ch_func)(void *, struct buf *, bool);
	/* Opaque first argument handed to ch_func. */
	void *ch_arg;
};
/*
 * Per lwp, per mount transaction state.  Every entry stays linked on the
 * global fstrans_fli_head list; while fli_self is set it is also linked
 * on that lwp's l_fstrans list through fli_succ.  An entry whose fli_self
 * is NULL is free and may be reused by fstrans_alloc_lwp_info().
 */
struct fstrans_lwp_info {
	/* Next entry on the owning lwp's l_fstrans list. */
	struct fstrans_lwp_info *fli_succ;
	/* Owning lwp, NULL while the entry is free for reuse. */
	struct lwp *fli_self;
	/* Mount this entry is attached to, NULL once cleared. */
	struct mount *fli_mount;
	/*
	 * Entry of the lowest mount in the mnt_lower chain, if any;
	 * fstrans_get_lwp_info() returns the alias instead of this entry.
	 */
	struct fstrans_lwp_info *fli_alias;
	/* Mount state of fli_mount; holds one fmi_ref_cnt reference. */
	struct fstrans_mount_info *fli_mountinfo;
	/* Nesting depth of transactions started by this lwp. */
	int fli_trans_cnt;
	/* Number of entries whose fli_alias points at this entry. */
	int fli_alias_cnt;
	/* Nesting depth of fscow_run() calls by this lwp. */
	int fli_cow_cnt;
	/* Lock type of the transaction counted by fli_trans_cnt. */
	enum fstrans_lock_type fli_lock_type;
	/* Link on the global fstrans_fli_head list. */
	LIST_ENTRY(fstrans_lwp_info) fli_list;
};
/*
 * Per mount transaction state, reachable through mp->mnt_transinfo until
 * fstrans_unmount() and freed by fstrans_mount_dtor() when the last
 * reference goes away.
 */
struct fstrans_mount_info {
	/* Current state; changed by fstrans_setstate(). */
	enum fstrans_state fmi_state;
	/*
	 * Reference count: starts at one in fstrans_mount(), plus one per
	 * attached fstrans_lwp_info and one per fscow_establish().
	 */
	unsigned int fmi_ref_cnt;
	/* Set by fstrans_unmount(): the mount is gone. */
	bool fmi_gone;
	/* A thread is changing the cow handler list. */
	bool fmi_cow_change;
	/* Registered copy-on-write handlers. */
	LIST_HEAD(, fscow_handler) fmi_cow_handler;
	/* Back pointer to the mount. */
	struct mount *fmi_mount;
	/* Lwp that moved the state away from FSTRANS_NORMAL, else NULL. */
	struct lwp *fmi_owner;
};
87
/*
 * File-scope state shared by all mounts.  The locks, condition variables
 * and the pserialize instance are set up in fstrans_init().
 */
static kmutex_t vfs_suspend_lock;	/* Serialize suspensions. */
static kmutex_t fstrans_lock;		/* Fstrans big lock. */
static kmutex_t fstrans_mount_lock;	/* Fstrans mount big lock. */
static kcondvar_t fstrans_state_cv;	/* Fstrans or cow state changed. */
static kcondvar_t fstrans_count_cv;	/* Fstrans or cow count changed. */
static pserialize_t fstrans_psz;	/* Pserialize state. */
static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
					/* List of all fstrans_lwp_info. */
static int fstrans_gone_count;		/* Number of fstrans_mount_info gone. */
97
/* Forward declarations of the file-local helpers defined below. */
static void fstrans_mount_dtor(struct fstrans_mount_info *);
static void fstrans_clear_lwp_info(void);
static inline struct fstrans_lwp_info *
    fstrans_get_lwp_info(struct mount *, bool);
static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
static bool grant_lock(const struct fstrans_mount_info *,
    const enum fstrans_lock_type);
static bool state_change_done(const struct fstrans_mount_info *);
static bool cow_state_change_done(const struct fstrans_mount_info *);
static void cow_change_enter(struct fstrans_mount_info *);
static void cow_change_done(struct fstrans_mount_info *);

/*
 * Defined elsewhere.  Transactions on this mount are skipped unless
 * FSTRANS_DEAD_ENABLED is defined; most other entry points assert that
 * they are not called with it.
 */
extern struct mount *dead_rootmount;
112
113#if defined(DIAGNOSTIC)
114
/*
 * DIAGNOSTIC only: remember every mount that went through fstrans_mount()
 * so later lookups can assert that a mount pointer is still valid.
 */
struct fstrans_debug_mount {
	struct mount *fdm_mount;		/* Registered mount. */
	SLIST_ENTRY(fstrans_debug_mount) fdm_list;	/* Link on the list below. */
};

/* All registered mounts; the accessors assert fstrans_mount_lock is held. */
static SLIST_HEAD(, fstrans_debug_mount) fstrans_debug_mount_head =
    SLIST_HEAD_INITIALIZER(fstrans_debug_mount_head);
122
123static void
124fstrans_debug_mount(struct mount *mp)
125{
126	struct fstrans_debug_mount *fdm, *new;
127
128	KASSERT(mutex_owned(&fstrans_mount_lock));
129
130	mutex_exit(&fstrans_mount_lock);
131	new = kmem_alloc(sizeof(*new), KM_SLEEP);
132	new->fdm_mount = mp;
133	mutex_enter(&fstrans_mount_lock);
134
135	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
136		KASSERT(fdm->fdm_mount != mp);
137	SLIST_INSERT_HEAD(&fstrans_debug_mount_head, new, fdm_list);
138}
139
140static void
141fstrans_debug_unmount(struct mount *mp)
142{
143	struct fstrans_debug_mount *fdm;
144
145	KASSERT(mutex_owned(&fstrans_mount_lock));
146
147	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
148		if (fdm->fdm_mount == mp)
149			break;
150	KASSERT(fdm != NULL);
151	SLIST_REMOVE(&fstrans_debug_mount_head, fdm,
152	    fstrans_debug_mount, fdm_list);
153	kmem_free(fdm, sizeof(*fdm));
154}
155
156static void
157fstrans_debug_validate_mount(struct mount *mp)
158{
159	struct fstrans_debug_mount *fdm;
160
161	KASSERT(mutex_owned(&fstrans_mount_lock));
162
163	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
164		if (fdm->fdm_mount == mp)
165			break;
166	KASSERTMSG(fdm != NULL, "mount %p invalid", mp);
167}
168
169#else /* defined(DIAGNOSTIC) */
170
/* Without DIAGNOSTIC the mount tracking helpers expand to nothing. */
#define fstrans_debug_mount(mp)
#define fstrans_debug_unmount(mp)
#define fstrans_debug_validate_mount(mp)
174
175#endif  /* defined(DIAGNOSTIC) */
176
177/*
178 * Initialize.
179 */
180void
181fstrans_init(void)
182{
183
184	mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
185	mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
186	mutex_init(&fstrans_mount_lock, MUTEX_DEFAULT, IPL_NONE);
187	cv_init(&fstrans_state_cv, "fstchg");
188	cv_init(&fstrans_count_cv, "fstcnt");
189	fstrans_psz = pserialize_create();
190	LIST_INIT(&fstrans_fli_head);
191}
192
193/*
194 * Deallocate lwp state.
195 */
196void
197fstrans_lwp_dtor(lwp_t *l)
198{
199	struct fstrans_lwp_info *fli, *fli_next;
200
201	for (fli = l->l_fstrans; fli; fli = fli_next) {
202		KASSERT(fli->fli_trans_cnt == 0);
203		KASSERT(fli->fli_cow_cnt == 0);
204		KASSERT(fli->fli_self == l);
205		if (fli->fli_mount != NULL)
206			fstrans_mount_dtor(fli->fli_mountinfo);
207		fli_next = fli->fli_succ;
208		fli->fli_alias_cnt = 0;
209		fli->fli_mount = NULL;
210		fli->fli_alias = NULL;
211		fli->fli_mountinfo = NULL;
212		membar_sync();
213		fli->fli_self = NULL;
214	}
215
216	l->l_fstrans = NULL;
217}
218
219/*
220 * Dereference mount state.
221 */
222static void
223fstrans_mount_dtor(struct fstrans_mount_info *fmi)
224{
225
226	mutex_enter(&fstrans_mount_lock);
227
228	KASSERT(fmi != NULL);
229	fmi->fmi_ref_cnt -= 1;
230	if (fmi->fmi_ref_cnt > 0) {
231		mutex_exit(&fstrans_mount_lock);
232		return;
233	}
234
235	KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
236	KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
237	KASSERT(fmi->fmi_owner == NULL);
238
239	KASSERT(fstrans_gone_count > 0);
240	fstrans_gone_count -= 1;
241
242	mutex_exit(&fstrans_mount_lock);
243
244	kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
245	kmem_free(fmi, sizeof(*fmi));
246}
247
248/*
249 * Allocate mount state.
250 */
251int
252fstrans_mount(struct mount *mp)
253{
254	struct fstrans_mount_info *newfmi;
255
256	newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
257	newfmi->fmi_state = FSTRANS_NORMAL;
258	newfmi->fmi_ref_cnt = 1;
259	newfmi->fmi_gone = false;
260	LIST_INIT(&newfmi->fmi_cow_handler);
261	newfmi->fmi_cow_change = false;
262	newfmi->fmi_mount = mp;
263	newfmi->fmi_owner = NULL;
264
265	mutex_enter(&fstrans_mount_lock);
266	mp->mnt_transinfo = newfmi;
267	fstrans_debug_mount(mp);
268	mutex_exit(&fstrans_mount_lock);
269
270	return 0;
271}
272
273/*
274 * Deallocate mount state.
275 */
276void
277fstrans_unmount(struct mount *mp)
278{
279	struct fstrans_mount_info *fmi = mp->mnt_transinfo;
280
281	KASSERT(fmi != NULL);
282
283	mutex_enter(&fstrans_mount_lock);
284	fstrans_debug_unmount(mp);
285	fmi->fmi_gone = true;
286	mp->mnt_transinfo = NULL;
287	fstrans_gone_count += 1;
288	mutex_exit(&fstrans_mount_lock);
289
290	fstrans_mount_dtor(fmi);
291}
292
293/*
294 * Clear mount entries whose mount is gone.
295 */
296static void
297fstrans_clear_lwp_info(void)
298{
299	struct fstrans_lwp_info **p, *fli;
300
301	/*
302	 * Scan our list clearing entries whose mount is gone.
303	 */
304	for (p = &curlwp->l_fstrans; *p; ) {
305		fli = *p;
306		if (fli->fli_mount != NULL &&
307		    fli->fli_mountinfo->fmi_gone &&
308		    fli->fli_trans_cnt == 0 &&
309		    fli->fli_cow_cnt == 0 &&
310		    fli->fli_alias_cnt == 0) {
311			*p = (*p)->fli_succ;
312			fstrans_mount_dtor(fli->fli_mountinfo);
313			if (fli->fli_alias) {
314				KASSERT(fli->fli_alias->fli_alias_cnt > 0);
315				fli->fli_alias->fli_alias_cnt--;
316			}
317			fli->fli_mount = NULL;
318			fli->fli_alias = NULL;
319			fli->fli_mountinfo = NULL;
320			membar_sync();
321			fli->fli_self = NULL;
322			p = &curlwp->l_fstrans;
323		} else {
324			p = &(*p)->fli_succ;
325		}
326	}
327#ifdef DIAGNOSTIC
328	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
329		if (fli->fli_alias != NULL)
330			KASSERT(fli->fli_alias->fli_self == curlwp);
331#endif /* DIAGNOSTIC */
332}
333
/*
 * Allocate and return per lwp info for this mount.
 *
 * If the current lwp already has an entry for mp that entry is returned.
 * Otherwise a free entry from the global list is reused or a new one is
 * allocated, linked on curlwp->l_fstrans and attached to the mount.
 * If mp has lower mounts, the entry of the lowest mount in the mnt_lower
 * chain is recorded as alias and returned instead.
 */
static struct fstrans_lwp_info *
fstrans_alloc_lwp_info(struct mount *mp)
{
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

	/* Nothing to do if this lwp already has an entry for mp. */
	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
		if (fli->fli_mount == mp)
			return fli;
	}

	/*
	 * Try to reuse a cleared entry or allocate a new one.
	 */
	mutex_enter(&fstrans_lock);
	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
		membar_sync();
		/* fli_self == NULL marks a free entry. */
		if (fli->fli_self == NULL) {
			KASSERT(fli->fli_mount == NULL);
			KASSERT(fli->fli_trans_cnt == 0);
			KASSERT(fli->fli_cow_cnt == 0);
			KASSERT(fli->fli_alias_cnt == 0);
			fli->fli_self = curlwp;
			fli->fli_succ = curlwp->l_fstrans;
			curlwp->l_fstrans = fli;
			break;
		}
	}
	mutex_exit(&fstrans_lock);

	/* The loop above leaves fli NULL if no free entry was found. */
	if (fli == NULL) {
		fli = kmem_alloc(sizeof(*fli), KM_SLEEP);
		mutex_enter(&fstrans_lock);
		memset(fli, 0, sizeof(*fli));
		fli->fli_self = curlwp;
		LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
		mutex_exit(&fstrans_lock);
		fli->fli_succ = curlwp->l_fstrans;
		curlwp->l_fstrans = fli;
	}

	/*
	 * Attach the entry to the mount; its mnt_transinfo must be valid.
	 * The entry holds a reference on the mount info.
	 */

	mutex_enter(&fstrans_mount_lock);
	fstrans_debug_validate_mount(mp);
	fmi = mp->mnt_transinfo;
	KASSERT(fmi != NULL);
	fli->fli_mount = mp;
	fli->fli_mountinfo = fmi;
	fmi->fmi_ref_cnt += 1;
	/* Descend to the lowest mount; mp ends up NULL if there is none. */
	do {
		mp = mp->mnt_lower;
	} while (mp && mp->mnt_lower);
	mutex_exit(&fstrans_mount_lock);

	/* Layered mount: get the lowest mount's entry and return it. */
	if (mp) {
		fli->fli_alias = fstrans_alloc_lwp_info(mp);
		fli->fli_alias->fli_alias_cnt++;
		fli = fli->fli_alias;
	}

	return fli;
}
402
403/*
404 * Retrieve the per lwp info for this mount allocating if necessary.
405 */
406static inline struct fstrans_lwp_info *
407fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
408{
409	struct fstrans_lwp_info *fli;
410
411	/*
412	 * Scan our list for a match.
413	 */
414	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
415		if (fli->fli_mount == mp) {
416			KASSERT((mp->mnt_lower == NULL) ==
417			    (fli->fli_alias == NULL));
418			if (fli->fli_alias != NULL)
419				fli = fli->fli_alias;
420			break;
421		}
422	}
423
424	if (do_alloc) {
425		if (__predict_false(fli == NULL))
426			fli = fstrans_alloc_lwp_info(mp);
427		KASSERT(fli != NULL && !fli->fli_mountinfo->fmi_gone);
428	} else {
429		KASSERT(fli != NULL);
430	}
431
432	return fli;
433}
434
435/*
436 * Check if this lock type is granted at this state.
437 */
438static bool
439grant_lock(const struct fstrans_mount_info *fmi,
440    const enum fstrans_lock_type type)
441{
442
443	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL))
444		return true;
445	if (fmi->fmi_owner == curlwp)
446		return true;
447	if  (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
448		return true;
449
450	return false;
451}
452
/*
 * Start a transaction.  If this thread already has a transaction on this
 * file system increment the reference counter.
 *
 * Returns 0 once the transaction is started.  Returns EBUSY if the lock
 * is not granted in the current state and wait is zero; otherwise sleeps
 * until it is granted.
 */
static inline int
_fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
{
	int s;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

#ifndef FSTRANS_DEAD_ENABLED
	/* No transactions on the dead mount. */
	if (mp == dead_rootmount)
		return 0;
#endif

	ASSERT_SLEEPABLE();

	fli = fstrans_get_lwp_info(mp, true);
	fmi = fli->fli_mountinfo;

	/* Recursive start: just count it. */
	if (fli->fli_trans_cnt > 0) {
		fli->fli_trans_cnt += 1;

		return 0;
	}

	/*
	 * Fast path without taking fstrans_lock: check the state inside a
	 * pserialize read section.  fstrans_setstate() publishes a new state
	 * with pserialize_perform().
	 */
	s = pserialize_read_enter();
	if (__predict_true(grant_lock(fmi, lock_type))) {
		fli->fli_trans_cnt = 1;
		fli->fli_lock_type = lock_type;
		pserialize_read_exit(s);

		return 0;
	}
	pserialize_read_exit(s);

	if (! wait)
		return EBUSY;

	/* Slow path: sleep until a state change grants the lock. */
	mutex_enter(&fstrans_lock);
	while (! grant_lock(fmi, lock_type))
		cv_wait(&fstrans_state_cv, &fstrans_lock);
	fli->fli_trans_cnt = 1;
	fli->fli_lock_type = lock_type;
	mutex_exit(&fstrans_lock);

	return 0;
}
502
503void
504fstrans_start(struct mount *mp)
505{
506	int error __diagused;
507
508	error = _fstrans_start(mp, FSTRANS_SHARED, 1);
509	KASSERT(error == 0);
510}
511
512int
513fstrans_start_nowait(struct mount *mp)
514{
515
516	return _fstrans_start(mp, FSTRANS_SHARED, 0);
517}
518
519void
520fstrans_start_lazy(struct mount *mp)
521{
522	int error __diagused;
523
524	error = _fstrans_start(mp, FSTRANS_LAZY, 1);
525	KASSERT(error == 0);
526}
527
/*
 * Finish a transaction.
 *
 * Undoes one fstrans_start*() on mp by the current lwp.  When the last
 * nested transaction ends while the mount is not in normal state, a
 * thread waiting in fstrans_setstate() is signalled.
 */
void
fstrans_done(struct mount *mp)
{
	int s;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

#ifndef FSTRANS_DEAD_ENABLED
	/* No transactions on the dead mount. */
	if (mp == dead_rootmount)
		return;
#endif

	fli = fstrans_get_lwp_info(mp, false);
	fmi = fli->fli_mountinfo;
	KASSERT(fli->fli_trans_cnt > 0);

	/* Leaving a nested transaction: just count it. */
	if (fli->fli_trans_cnt > 1) {
		fli->fli_trans_cnt -= 1;

		return;
	}

	/* Some mount is gone: release our idle entries that refer to one. */
	if (__predict_false(fstrans_gone_count > 0))
		fstrans_clear_lwp_info();

	/*
	 * Fast path without taking fstrans_lock: in normal state nobody
	 * waits for the transaction count to change.
	 */
	s = pserialize_read_enter();
	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
		fli->fli_trans_cnt = 0;
		pserialize_read_exit(s);

		return;
	}
	pserialize_read_exit(s);

	/* Slow path: drop the count under the lock and signal the waiter. */
	mutex_enter(&fstrans_lock);
	fli->fli_trans_cnt = 0;
	cv_signal(&fstrans_count_cv);
	mutex_exit(&fstrans_lock);
}
570
571/*
572 * Check if we hold an lock.
573 */
574int
575fstrans_held(struct mount *mp)
576{
577	struct fstrans_lwp_info *fli;
578	struct fstrans_mount_info *fmi;
579
580	KASSERT(mp != dead_rootmount);
581
582	fli = fstrans_get_lwp_info(mp, true);
583	fmi = fli->fli_mountinfo;
584
585	return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp);
586}
587
588/*
589 * Check if this thread has an exclusive lock.
590 */
591int
592fstrans_is_owner(struct mount *mp)
593{
594	struct fstrans_lwp_info *fli;
595	struct fstrans_mount_info *fmi;
596
597	KASSERT(mp != dead_rootmount);
598
599	fli = fstrans_get_lwp_info(mp, true);
600	fmi = fli->fli_mountinfo;
601
602	return (fmi->fmi_owner == curlwp);
603}
604
605/*
606 * True, if no thread is in a transaction not granted at the current state.
607 */
608static bool
609state_change_done(const struct fstrans_mount_info *fmi)
610{
611	struct fstrans_lwp_info *fli;
612
613	KASSERT(mutex_owned(&fstrans_lock));
614
615	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
616		if (fli->fli_mountinfo != fmi)
617			continue;
618		if (fli->fli_trans_cnt == 0)
619			continue;
620		if (fli->fli_self == curlwp)
621			continue;
622		if (grant_lock(fmi, fli->fli_lock_type))
623			continue;
624
625		return false;
626	}
627
628	return true;
629}
630
/*
 * Set new file system state.
 *
 * Publishes new_state and waits until every transaction that is not
 * granted at that state has finished.  Returns 0 on success.  If the
 * wait is interrupted the error from cv_wait_sig() is returned and the
 * state is reset to FSTRANS_NORMAL.
 */
int
fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
{
	int error;
	enum fstrans_state old_state;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

	KASSERT(mp != dead_rootmount);

	fli = fstrans_get_lwp_info(mp, true);
	fmi = fli->fli_mountinfo;
	old_state = fmi->fmi_state;
	/* Nothing to do if the state does not change. */
	if (old_state == new_state)
		return 0;

	mutex_enter(&fstrans_lock);
	fmi->fmi_state = new_state;
	pserialize_perform(fstrans_psz);

	/*
	 * All threads see the new state now.
	 * Wait for transactions invalid at this state to leave.
	 */
	error = 0;
	while (! state_change_done(fmi)) {
		error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
		if (error) {
			/* Interrupted: give up and fall back to normal. */
			new_state = fmi->fmi_state = FSTRANS_NORMAL;
			break;
		}
	}
	/*
	 * Hand over ownership: the lwp leaving normal state becomes the
	 * owner, returning to normal state releases ownership.
	 */
	if (old_state != new_state) {
		if (old_state == FSTRANS_NORMAL) {
			KASSERT(fmi->fmi_owner == NULL);
			fmi->fmi_owner = curlwp;
		}
		if (new_state == FSTRANS_NORMAL) {
			KASSERT(fmi->fmi_owner == curlwp);
			fmi->fmi_owner = NULL;
		}
	}
	/* Wake threads sleeping in _fstrans_start() to recheck the state. */
	cv_broadcast(&fstrans_state_cv);
	mutex_exit(&fstrans_lock);

	return error;
}
681
682/*
683 * Get current file system state.
684 */
685enum fstrans_state
686fstrans_getstate(struct mount *mp)
687{
688	struct fstrans_lwp_info *fli;
689	struct fstrans_mount_info *fmi;
690
691	KASSERT(mp != dead_rootmount);
692
693	fli = fstrans_get_lwp_info(mp, true);
694	fmi = fli->fli_mountinfo;
695
696	return fmi->fmi_state;
697}
698
699/*
700 * Request a filesystem to suspend all operations.
701 */
702int
703vfs_suspend(struct mount *mp, int nowait)
704{
705	struct fstrans_lwp_info *fli;
706	int error;
707
708	if (mp == dead_rootmount)
709		return EOPNOTSUPP;
710
711	fli = fstrans_get_lwp_info(mp, true);
712	mp = fli->fli_mount;
713
714	if (nowait) {
715		if (!mutex_tryenter(&vfs_suspend_lock))
716			return EWOULDBLOCK;
717	} else
718		mutex_enter(&vfs_suspend_lock);
719
720	if ((error = VFS_SUSPENDCTL(mp, SUSPEND_SUSPEND)) != 0)
721		mutex_exit(&vfs_suspend_lock);
722
723	return error;
724}
725
726/*
727 * Request a filesystem to resume all operations.
728 */
729void
730vfs_resume(struct mount *mp)
731{
732	struct fstrans_lwp_info *fli;
733
734	KASSERT(mp != dead_rootmount);
735
736	fli = fstrans_get_lwp_info(mp, false);
737	mp = fli->fli_mount;
738
739	VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
740	mutex_exit(&vfs_suspend_lock);
741}
742
743
744/*
745 * True, if no thread is running a cow handler.
746 */
747static bool
748cow_state_change_done(const struct fstrans_mount_info *fmi)
749{
750	struct fstrans_lwp_info *fli;
751
752	KASSERT(mutex_owned(&fstrans_lock));
753	KASSERT(fmi->fmi_cow_change);
754
755	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
756		if (fli->fli_mount != fmi->fmi_mount)
757			continue;
758		if (fli->fli_cow_cnt == 0)
759			continue;
760
761		return false;
762	}
763
764	return true;
765}
766
767/*
768 * Prepare for changing this mounts cow list.
769 * Returns with fstrans_lock locked.
770 */
771static void
772cow_change_enter(struct fstrans_mount_info *fmi)
773{
774
775	mutex_enter(&fstrans_lock);
776
777	/*
778	 * Wait for other threads changing the list.
779	 */
780	while (fmi->fmi_cow_change)
781		cv_wait(&fstrans_state_cv, &fstrans_lock);
782
783	/*
784	 * Wait until all threads are aware of a state change.
785	 */
786	fmi->fmi_cow_change = true;
787	pserialize_perform(fstrans_psz);
788
789	while (! cow_state_change_done(fmi))
790		cv_wait(&fstrans_count_cv, &fstrans_lock);
791}
792
/*
 * Done changing this mount's cow list.
 * Called with fstrans_lock held as left by cow_change_enter();
 * returns with it released.
 */
static void
cow_change_done(struct fstrans_mount_info *fmi)
{

	KASSERT(mutex_owned(&fstrans_lock));

	/* Withdraw the change announcement and let all threads see it. */
	fmi->fmi_cow_change = false;
	pserialize_perform(fstrans_psz);

	/* Wake threads waiting for the change to finish. */
	cv_broadcast(&fstrans_state_cv);

	mutex_exit(&fstrans_lock);
}
809
810/*
811 * Add a handler to this mount.
812 */
813int
814fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
815    void *arg)
816{
817	struct fstrans_mount_info *fmi;
818	struct fscow_handler *newch;
819
820	KASSERT(mp != dead_rootmount);
821
822	mutex_enter(&fstrans_mount_lock);
823	fmi = mp->mnt_transinfo;
824	KASSERT(fmi != NULL);
825	fmi->fmi_ref_cnt += 1;
826	mutex_exit(&fstrans_mount_lock);
827
828	newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
829	newch->ch_func = func;
830	newch->ch_arg = arg;
831
832	cow_change_enter(fmi);
833	LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
834	cow_change_done(fmi);
835
836	return 0;
837}
838
839/*
840 * Remove a handler from this mount.
841 */
842int
843fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
844    void *arg)
845{
846	struct fstrans_mount_info *fmi;
847	struct fscow_handler *hp = NULL;
848
849	KASSERT(mp != dead_rootmount);
850
851	fmi = mp->mnt_transinfo;
852	KASSERT(fmi != NULL);
853
854	cow_change_enter(fmi);
855	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
856		if (hp->ch_func == func && hp->ch_arg == arg)
857			break;
858	if (hp != NULL) {
859		LIST_REMOVE(hp, ch_list);
860		kmem_free(hp, sizeof(*hp));
861	}
862	cow_change_done(fmi);
863
864	fstrans_mount_dtor(fmi);
865
866	return hp ? 0 : EINVAL;
867}
868
/*
 * Check for need to copy block that is about to be written.
 *
 * Runs the copy-on-write handlers of the mount the buffer belongs to and
 * marks the buffer B_COWDONE once all of them succeeded.  Returns 0 on
 * success or the error of the first failing handler.
 */
int
fscow_run(struct buf *bp, bool data_valid)
{
	int error, s;
	struct mount *mp;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;
	struct fscow_handler *hp;

	/*
	 * First check if we need run the copy-on-write handler.
	 */
	if ((bp->b_flags & B_COWDONE))
		return 0;
	if (bp->b_vp == NULL) {
		bp->b_flags |= B_COWDONE;
		return 0;
	}
	/* For a block device use the file system mounted from it. */
	if (bp->b_vp->v_type == VBLK)
		mp = spec_node_getmountedfs(bp->b_vp);
	else
		mp = bp->b_vp->v_mount;
	if (mp == NULL || mp == dead_rootmount) {
		bp->b_flags |= B_COWDONE;
		return 0;
	}

	fli = fstrans_get_lwp_info(mp, true);
	fmi = fli->fli_mountinfo;

	/*
	 * On non-recursed run check if other threads
	 * want to change the list.
	 */
	if (fli->fli_cow_cnt == 0) {
		s = pserialize_read_enter();
		if (__predict_false(fmi->fmi_cow_change)) {
			/* A change is pending: wait for it under the lock. */
			pserialize_read_exit(s);
			mutex_enter(&fstrans_lock);
			while (fmi->fmi_cow_change)
				cv_wait(&fstrans_state_cv, &fstrans_lock);
			fli->fli_cow_cnt = 1;
			mutex_exit(&fstrans_lock);
		} else {
			fli->fli_cow_cnt = 1;
			pserialize_read_exit(s);
		}
	} else
		fli->fli_cow_cnt += 1;

	/*
	 * Run all copy-on-write handlers, stop on error.
	 */
	error = 0;
	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
		if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
			break;
	if (error == 0)
		bp->b_flags |= B_COWDONE;

	/*
	 * Check if other threads want to change the list.
	 */
	if (fli->fli_cow_cnt > 1) {
		fli->fli_cow_cnt -= 1;
	} else {
		s = pserialize_read_enter();
		if (__predict_false(fmi->fmi_cow_change)) {
			/* Tell the thread waiting in cow_change_enter(). */
			pserialize_read_exit(s);
			mutex_enter(&fstrans_lock);
			fli->fli_cow_cnt = 0;
			cv_signal(&fstrans_count_cv);
			mutex_exit(&fstrans_lock);
		} else {
			fli->fli_cow_cnt = 0;
			pserialize_read_exit(s);
		}
	}

	return error;
}
953
954#if defined(DDB)
/* DDB only: prototype for the non-static dump routine defined below. */
void fstrans_dump(int);
956
957static void
958fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
959{
960	char prefix[9];
961	struct fstrans_lwp_info *fli;
962
963	snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
964	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
965		if (fli->fli_self != l)
966			continue;
967		if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
968			if (! verbose)
969				continue;
970		}
971		printf("%-8s", prefix);
972		if (verbose)
973			printf(" @%p", fli);
974		if (fli->fli_mount == dead_rootmount)
975			printf(" <dead>");
976		else if (fli->fli_mount != NULL)
977			printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
978		else
979			printf(" NULL");
980		if (fli->fli_alias != NULL) {
981			struct mount *amp = fli->fli_alias->fli_mount;
982
983			printf(" alias");
984			if (verbose)
985				printf(" @%p", fli->fli_alias);
986			if (amp == NULL)
987				printf(" NULL");
988			else
989				printf(" (%s)", amp->mnt_stat.f_mntonname);
990		}
991		if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
992			printf(" gone");
993		if (fli->fli_trans_cnt == 0) {
994			printf(" -");
995		} else {
996			switch (fli->fli_lock_type) {
997			case FSTRANS_LAZY:
998				printf(" lazy");
999				break;
1000			case FSTRANS_SHARED:
1001				printf(" shared");
1002				break;
1003			default:
1004				printf(" %#x", fli->fli_lock_type);
1005				break;
1006			}
1007		}
1008		printf(" %d cow %d alias %d\n",
1009		    fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
1010		prefix[0] = '\0';
1011	}
1012}
1013
1014static void
1015fstrans_print_mount(struct mount *mp, int verbose)
1016{
1017	struct fstrans_mount_info *fmi;
1018
1019	fmi = mp->mnt_transinfo;
1020	if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
1021		return;
1022
1023	printf("%-16s ", mp->mnt_stat.f_mntonname);
1024	if (fmi == NULL) {
1025		printf("(null)\n");
1026		return;
1027	}
1028	printf("owner %p ", fmi->fmi_owner);
1029	switch (fmi->fmi_state) {
1030	case FSTRANS_NORMAL:
1031		printf("state normal\n");
1032		break;
1033	case FSTRANS_SUSPENDING:
1034		printf("state suspending\n");
1035		break;
1036	case FSTRANS_SUSPENDED:
1037		printf("state suspended\n");
1038		break;
1039	default:
1040		printf("state %#x\n", fmi->fmi_state);
1041		break;
1042	}
1043}
1044
1045void
1046fstrans_dump(int full)
1047{
1048	const struct proclist_desc *pd;
1049	struct proc *p;
1050	struct lwp *l;
1051	struct mount *mp;
1052
1053	printf("Fstrans locks by lwp:\n");
1054	for (pd = proclists; pd->pd_list != NULL; pd++)
1055		PROCLIST_FOREACH(p, pd->pd_list)
1056			LIST_FOREACH(l, &p->p_lwps, l_sibling)
1057				fstrans_print_lwp(p, l, full == 1);
1058
1059	printf("Fstrans state by mount:\n");
1060	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
1061		fstrans_print_mount(mp, full == 1);
1062}
1063#endif /* defined(DDB) */
1064