vfs_trans.c revision 1.70
1/*	$NetBSD: vfs_trans.c,v 1.70 2022/11/04 11:20:39 hannken Exp $	*/
2
3/*-
4 * Copyright (c) 2007, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Juergen Hannken-Illjes.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.70 2022/11/04 11:20:39 hannken Exp $");
34
35/*
36 * File system transaction operations.
37 */
38
39#ifdef _KERNEL_OPT
40#include "opt_ddb.h"
41#endif
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/atomic.h>
46#include <sys/buf.h>
47#include <sys/hash.h>
48#include <sys/kmem.h>
49#include <sys/mount.h>
50#include <sys/pserialize.h>
51#include <sys/vnode.h>
52#include <sys/fstrans.h>
53#include <sys/proc.h>
54#include <sys/pool.h>
55
56#include <miscfs/deadfs/deadfs.h>
57#include <miscfs/specfs/specdev.h>
58
/* Size passed to hashinit() for the mount info hash table. */
#define FSTRANS_MOUNT_HASHSIZE	32

/*
 * Kind of transaction lock a thread asks for, see grant_lock().
 */
enum fstrans_lock_type {
	FSTRANS_LAZY,		/* Still granted while suspending */
	FSTRANS_SHARED		/* No longer granted once suspending */
};
65
66struct fscow_handler {
67	LIST_ENTRY(fscow_handler) ch_list;
68	int (*ch_func)(void *, struct buf *, bool);
69	void *ch_arg;
70};
71struct fstrans_lwp_info {
72	struct fstrans_lwp_info *fli_succ;
73	struct lwp *fli_self;
74	struct mount *fli_mount;
75	struct fstrans_lwp_info *fli_alias;
76	struct fstrans_mount_info *fli_mountinfo;
77	int fli_trans_cnt;
78	int fli_alias_cnt;
79	int fli_cow_cnt;
80	enum fstrans_lock_type fli_lock_type;
81	LIST_ENTRY(fstrans_lwp_info) fli_list;
82};
83struct fstrans_mount_info {
84	enum fstrans_state fmi_state;
85	unsigned int fmi_ref_cnt;
86	bool fmi_gone;
87	bool fmi_cow_change;
88	SLIST_ENTRY(fstrans_mount_info) fmi_hash;
89	LIST_HEAD(, fscow_handler) fmi_cow_handler;
90	struct mount *fmi_mount;
91	struct fstrans_mount_info *fmi_lower_info;
92	struct lwp *fmi_owner;
93};
94SLIST_HEAD(fstrans_mount_hashhead, fstrans_mount_info);
95
96static kmutex_t vfs_suspend_lock	/* Serialize suspensions. */
97    __cacheline_aligned;
98static kmutex_t fstrans_lock		/* Fstrans big lock. */
99    __cacheline_aligned;
100static kcondvar_t fstrans_state_cv;	/* Fstrans or cow state changed. */
101static kcondvar_t fstrans_count_cv;	/* Fstrans or cow count changed. */
102static pserialize_t fstrans_psz;	/* Pserialize state. */
103static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
104					/* List of all fstrans_lwp_info. */
105static pool_cache_t fstrans_lwp_cache;	/* Cache of fstrans_lwp_info. */
106
107static u_long fstrans_mount_hashmask;
108static struct fstrans_mount_hashhead *fstrans_mount_hashtab;
109static int fstrans_gone_count;		/* Number of fstrans_mount_info gone. */
110
111static inline uint32_t fstrans_mount_hash(struct mount *);
112static inline struct fstrans_mount_info *fstrans_mount_get(struct mount *);
113static void fstrans_mount_dtor(struct fstrans_mount_info *);
114static void fstrans_clear_lwp_info(void);
115static inline struct fstrans_lwp_info *
116    fstrans_get_lwp_info(struct mount *, bool);
117static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
118static int fstrans_lwp_pcc(void *, void *, int);
119static void fstrans_lwp_pcd(void *, void *);
120static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
121static bool grant_lock(const struct fstrans_mount_info *,
122    const enum fstrans_lock_type);
123static bool state_change_done(const struct fstrans_mount_info *);
124static bool cow_state_change_done(const struct fstrans_mount_info *);
125static void cow_change_enter(struct fstrans_mount_info *);
126static void cow_change_done(struct fstrans_mount_info *);
127
128/*
129 * Initialize.
130 */
131void
132fstrans_init(void)
133{
134
135	mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
136	mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
137	cv_init(&fstrans_state_cv, "fstchg");
138	cv_init(&fstrans_count_cv, "fstcnt");
139	fstrans_psz = pserialize_create();
140	LIST_INIT(&fstrans_fli_head);
141	fstrans_lwp_cache = pool_cache_init(sizeof(struct fstrans_lwp_info),
142	    coherency_unit, 0, 0, "fstlwp", NULL, IPL_NONE,
143	    fstrans_lwp_pcc, fstrans_lwp_pcd, NULL);
144	KASSERT(fstrans_lwp_cache != NULL);
145	fstrans_mount_hashtab = hashinit(FSTRANS_MOUNT_HASHSIZE, HASH_SLIST,
146	    true, &fstrans_mount_hashmask);
147}
148
149/*
150 * pool_cache constructor for fstrans_lwp_info.  Updating the global list
151 * produces cache misses on MP.  Minimise by keeping free entries on list.
152 */
153int
154fstrans_lwp_pcc(void *arg, void *obj, int flags)
155{
156	struct fstrans_lwp_info *fli = obj;
157
158	memset(fli, 0, sizeof(*fli));
159
160	mutex_enter(&fstrans_lock);
161	LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
162	mutex_exit(&fstrans_lock);
163
164	return 0;
165}
166
167/*
168 * pool_cache destructor
169 */
170void
171fstrans_lwp_pcd(void *arg, void *obj)
172{
173	struct fstrans_lwp_info *fli = obj;
174
175	mutex_enter(&fstrans_lock);
176	LIST_REMOVE(fli, fli_list);
177	mutex_exit(&fstrans_lock);
178}
179
180/*
181 * Deallocate lwp state.
182 */
183void
184fstrans_lwp_dtor(lwp_t *l)
185{
186	struct fstrans_lwp_info *fli, *fli_next;
187
188	if (l->l_fstrans == NULL)
189		return;
190
191	mutex_enter(&fstrans_lock);
192	for (fli = l->l_fstrans; fli; fli = fli_next) {
193		KASSERT(fli->fli_trans_cnt == 0);
194		KASSERT(fli->fli_cow_cnt == 0);
195		KASSERT(fli->fli_self == l);
196		if (fli->fli_mount != NULL)
197			fstrans_mount_dtor(fli->fli_mountinfo);
198		fli_next = fli->fli_succ;
199		fli->fli_alias_cnt = 0;
200		fli->fli_mount = NULL;
201		fli->fli_alias = NULL;
202		fli->fli_mountinfo = NULL;
203		fli->fli_self = NULL;
204	}
205	mutex_exit(&fstrans_lock);
206
207	for (fli = l->l_fstrans; fli; fli = fli_next) {
208		fli_next = fli->fli_succ;
209		pool_cache_put(fstrans_lwp_cache, fli);
210	}
211	l->l_fstrans = NULL;
212}
213
214/*
215 * mount pointer to hash
216 */
217static inline uint32_t
218fstrans_mount_hash(struct mount *mp)
219{
220
221	return hash32_buf(&mp, sizeof(mp), HASH32_BUF_INIT) &
222	    fstrans_mount_hashmask;
223}
224
225/*
226 * retrieve fstrans_mount_info by mount or NULL
227 */
228static inline struct fstrans_mount_info *
229fstrans_mount_get(struct mount *mp)
230{
231	uint32_t indx;
232	struct fstrans_mount_info *fmi, *fmi_lower;
233
234	KASSERT(mutex_owned(&fstrans_lock));
235
236	indx = fstrans_mount_hash(mp);
237	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash) {
238		if (fmi->fmi_mount == mp) {
239			if (__predict_false(mp->mnt_lower != NULL &&
240			    fmi->fmi_lower_info == NULL)) {
241				/*
242				 * Intern the lower/lowest mount into
243				 * this mount info on first lookup.
244				 */
245				KASSERT(fmi->fmi_ref_cnt == 1);
246
247				fmi_lower = fstrans_mount_get(mp->mnt_lower);
248				if (fmi_lower && fmi_lower->fmi_lower_info)
249					fmi_lower = fmi_lower->fmi_lower_info;
250				if (fmi_lower == NULL)
251					return NULL;
252				fmi->fmi_lower_info = fmi_lower;
253				fmi->fmi_lower_info->fmi_ref_cnt += 1;
254			}
255			return fmi;
256		}
257	}
258
259	return NULL;
260}
261
262/*
263 * Dereference mount state.
264 */
265static void
266fstrans_mount_dtor(struct fstrans_mount_info *fmi)
267{
268
269	KASSERT(mutex_owned(&fstrans_lock));
270
271	KASSERT(fmi != NULL);
272	fmi->fmi_ref_cnt -= 1;
273	if (__predict_true(fmi->fmi_ref_cnt > 0)) {
274		return;
275	}
276
277	KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
278	KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
279	KASSERT(fmi->fmi_owner == NULL);
280
281	if (fmi->fmi_lower_info)
282		fstrans_mount_dtor(fmi->fmi_lower_info);
283
284	KASSERT(fstrans_gone_count > 0);
285	fstrans_gone_count -= 1;
286
287	KASSERT(fmi->fmi_mount->mnt_lower == NULL);
288
289	kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
290	kmem_free(fmi, sizeof(*fmi));
291}
292
293/*
294 * Allocate mount state.
295 */
296int
297fstrans_mount(struct mount *mp)
298{
299	uint32_t indx;
300	struct fstrans_mount_info *newfmi;
301
302	indx = fstrans_mount_hash(mp);
303
304	newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
305	newfmi->fmi_state = FSTRANS_NORMAL;
306	newfmi->fmi_ref_cnt = 1;
307	newfmi->fmi_gone = false;
308	LIST_INIT(&newfmi->fmi_cow_handler);
309	newfmi->fmi_cow_change = false;
310	newfmi->fmi_mount = mp;
311	newfmi->fmi_lower_info = NULL;
312	newfmi->fmi_owner = NULL;
313
314	mutex_enter(&fstrans_lock);
315	SLIST_INSERT_HEAD(&fstrans_mount_hashtab[indx], newfmi, fmi_hash);
316	mutex_exit(&fstrans_lock);
317
318	return 0;
319}
320
321/*
322 * Deallocate mount state.
323 */
324void
325fstrans_unmount(struct mount *mp)
326{
327	uint32_t indx;
328	struct fstrans_mount_info *fmi;
329
330	indx = fstrans_mount_hash(mp);
331
332	mutex_enter(&fstrans_lock);
333	fmi = fstrans_mount_get(mp);
334	KASSERT(fmi != NULL);
335	fmi->fmi_gone = true;
336	SLIST_REMOVE(&fstrans_mount_hashtab[indx],
337	    fmi, fstrans_mount_info, fmi_hash);
338	fstrans_gone_count += 1;
339	fstrans_mount_dtor(fmi);
340	mutex_exit(&fstrans_lock);
341}
342
343/*
344 * Clear mount entries whose mount is gone.
345 */
346static void
347fstrans_clear_lwp_info(void)
348{
349	struct fstrans_lwp_info **p, *fli, *tofree = NULL;
350
351	/*
352	 * Scan our list clearing entries whose mount is gone.
353	 */
354	mutex_enter(&fstrans_lock);
355	for (p = &curlwp->l_fstrans; *p; ) {
356		fli = *p;
357		if (fli->fli_mount != NULL &&
358		    fli->fli_mountinfo->fmi_gone &&
359		    fli->fli_trans_cnt == 0 &&
360		    fli->fli_cow_cnt == 0 &&
361		    fli->fli_alias_cnt == 0) {
362			*p = (*p)->fli_succ;
363			fstrans_mount_dtor(fli->fli_mountinfo);
364			if (fli->fli_alias) {
365				KASSERT(fli->fli_alias->fli_alias_cnt > 0);
366				fli->fli_alias->fli_alias_cnt--;
367			}
368			fli->fli_mount = NULL;
369			fli->fli_alias = NULL;
370			fli->fli_mountinfo = NULL;
371			fli->fli_self = NULL;
372			p = &curlwp->l_fstrans;
373			fli->fli_succ = tofree;
374			tofree = fli;
375		} else {
376			p = &(*p)->fli_succ;
377		}
378	}
379#ifdef DIAGNOSTIC
380	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
381		if (fli->fli_alias != NULL)
382			KASSERT(fli->fli_alias->fli_self == curlwp);
383#endif /* DIAGNOSTIC */
384	mutex_exit(&fstrans_lock);
385
386	while (tofree != NULL) {
387		fli = tofree;
388		tofree = fli->fli_succ;
389		pool_cache_put(fstrans_lwp_cache, fli);
390	}
391}
392
393/*
394 * Allocate and return per lwp info for this mount.
395 */
396static struct fstrans_lwp_info *
397fstrans_alloc_lwp_info(struct mount *mp)
398{
399	struct fstrans_lwp_info *fli, *fli_lower;
400	struct fstrans_mount_info *fmi;
401
402	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
403		if (fli->fli_mount == mp)
404			return fli;
405	}
406
407	/*
408	 * Lookup mount info and get lower mount per lwp info.
409	 */
410	mutex_enter(&fstrans_lock);
411	fmi = fstrans_mount_get(mp);
412	if (fmi == NULL) {
413		mutex_exit(&fstrans_lock);
414		return NULL;
415	}
416	fmi->fmi_ref_cnt += 1;
417	mutex_exit(&fstrans_lock);
418
419	if (fmi->fmi_lower_info) {
420		fli_lower =
421		    fstrans_alloc_lwp_info(fmi->fmi_lower_info->fmi_mount);
422		if (fli_lower == NULL) {
423			mutex_enter(&fstrans_lock);
424			fstrans_mount_dtor(fmi);
425			mutex_exit(&fstrans_lock);
426
427			return NULL;
428		}
429	} else {
430		fli_lower = NULL;
431	}
432
433	/*
434	 * Allocate a new entry.
435	 */
436	fli = pool_cache_get(fstrans_lwp_cache, PR_WAITOK);
437	KASSERT(fli->fli_trans_cnt == 0);
438	KASSERT(fli->fli_cow_cnt == 0);
439	KASSERT(fli->fli_alias_cnt == 0);
440	KASSERT(fli->fli_mount == NULL);
441	KASSERT(fli->fli_alias == NULL);
442	KASSERT(fli->fli_mountinfo == NULL);
443	KASSERT(fli->fli_self == NULL);
444
445	/*
446	 * Attach the mount info and alias.
447	 */
448
449	fli->fli_self = curlwp;
450	fli->fli_mount = mp;
451	fli->fli_mountinfo = fmi;
452
453	fli->fli_succ = curlwp->l_fstrans;
454	curlwp->l_fstrans = fli;
455
456	if (fli_lower) {
457		fli->fli_alias = fli_lower;
458		fli->fli_alias->fli_alias_cnt++;
459		fli = fli->fli_alias;
460	}
461
462	return fli;
463}
464
465/*
466 * Retrieve the per lwp info for this mount allocating if necessary.
467 */
468static inline struct fstrans_lwp_info *
469fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
470{
471	struct fstrans_lwp_info *fli;
472
473	/*
474	 * Scan our list for a match.
475	 */
476	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
477		if (fli->fli_mount == mp) {
478			KASSERT(mp->mnt_lower == NULL ||
479			    fli->fli_alias != NULL);
480			if (fli->fli_alias != NULL)
481				fli = fli->fli_alias;
482			break;
483		}
484	}
485
486	if (do_alloc) {
487		if (__predict_false(fli == NULL))
488			fli = fstrans_alloc_lwp_info(mp);
489	}
490
491	return fli;
492}
493
494/*
495 * Check if this lock type is granted at this state.
496 */
497static bool
498grant_lock(const struct fstrans_mount_info *fmi,
499    const enum fstrans_lock_type type)
500{
501
502	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL))
503		return true;
504	if (fmi->fmi_owner == curlwp)
505		return true;
506	if  (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
507		return true;
508
509	return false;
510}
511
512/*
513 * Start a transaction.  If this thread already has a transaction on this
514 * file system increment the reference counter.
515 */
516static inline int
517_fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
518{
519	int s;
520	struct fstrans_lwp_info *fli;
521	struct fstrans_mount_info *fmi;
522
523	ASSERT_SLEEPABLE();
524
525	fli = fstrans_get_lwp_info(mp, true);
526	if (fli == NULL)
527		return 0;
528	fmi = fli->fli_mountinfo;
529
530	if (fli->fli_trans_cnt > 0) {
531		fli->fli_trans_cnt += 1;
532
533		return 0;
534	}
535
536	s = pserialize_read_enter();
537	if (__predict_true(grant_lock(fmi, lock_type))) {
538		fli->fli_trans_cnt = 1;
539		fli->fli_lock_type = lock_type;
540		pserialize_read_exit(s);
541
542		return 0;
543	}
544	pserialize_read_exit(s);
545
546	if (! wait)
547		return EBUSY;
548
549	mutex_enter(&fstrans_lock);
550	while (! grant_lock(fmi, lock_type))
551		cv_wait(&fstrans_state_cv, &fstrans_lock);
552	fli->fli_trans_cnt = 1;
553	fli->fli_lock_type = lock_type;
554	mutex_exit(&fstrans_lock);
555
556	return 0;
557}
558
559void
560fstrans_start(struct mount *mp)
561{
562	int error __diagused;
563
564	error = _fstrans_start(mp, FSTRANS_SHARED, 1);
565	KASSERT(error == 0);
566}
567
568int
569fstrans_start_nowait(struct mount *mp)
570{
571
572	return _fstrans_start(mp, FSTRANS_SHARED, 0);
573}
574
575void
576fstrans_start_lazy(struct mount *mp)
577{
578	int error __diagused;
579
580	error = _fstrans_start(mp, FSTRANS_LAZY, 1);
581	KASSERT(error == 0);
582}
583
584/*
585 * Finish a transaction.
586 */
587void
588fstrans_done(struct mount *mp)
589{
590	int s;
591	struct fstrans_lwp_info *fli;
592	struct fstrans_mount_info *fmi;
593
594	fli = fstrans_get_lwp_info(mp, false);
595	if (fli == NULL)
596		return;
597	fmi = fli->fli_mountinfo;
598	KASSERT(fli->fli_trans_cnt > 0);
599
600	if (fli->fli_trans_cnt > 1) {
601		fli->fli_trans_cnt -= 1;
602
603		return;
604	}
605
606	if (__predict_false(fstrans_gone_count > 0))
607		fstrans_clear_lwp_info();
608
609	s = pserialize_read_enter();
610	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
611		fli->fli_trans_cnt = 0;
612		pserialize_read_exit(s);
613
614		return;
615	}
616	pserialize_read_exit(s);
617
618	mutex_enter(&fstrans_lock);
619	fli->fli_trans_cnt = 0;
620	cv_signal(&fstrans_count_cv);
621	mutex_exit(&fstrans_lock);
622}
623
624/*
625 * Check if we hold an lock.
626 */
627int
628fstrans_held(struct mount *mp)
629{
630	struct fstrans_lwp_info *fli;
631	struct fstrans_mount_info *fmi;
632
633	KASSERT(mp != dead_rootmount);
634
635	fli = fstrans_get_lwp_info(mp, false);
636	if (fli == NULL)
637		return 0;
638	fmi = fli->fli_mountinfo;
639
640	return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp);
641}
642
643/*
644 * Check if this thread has an exclusive lock.
645 */
646int
647fstrans_is_owner(struct mount *mp)
648{
649	struct fstrans_lwp_info *fli;
650	struct fstrans_mount_info *fmi;
651
652	KASSERT(mp != dead_rootmount);
653
654	fli = fstrans_get_lwp_info(mp, false);
655	if (fli == NULL)
656		return 0;
657	fmi = fli->fli_mountinfo;
658
659	return (fmi->fmi_owner == curlwp);
660}
661
662/*
663 * True, if no thread is in a transaction not granted at the current state.
664 */
665static bool
666state_change_done(const struct fstrans_mount_info *fmi)
667{
668	struct fstrans_lwp_info *fli;
669
670	KASSERT(mutex_owned(&fstrans_lock));
671
672	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
673		if (fli->fli_mountinfo != fmi)
674			continue;
675		if (fli->fli_trans_cnt == 0)
676			continue;
677		if (fli->fli_self == curlwp)
678			continue;
679		if (grant_lock(fmi, fli->fli_lock_type))
680			continue;
681
682		return false;
683	}
684
685	return true;
686}
687
688/*
689 * Set new file system state.
690 */
691int
692fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
693{
694	int error;
695	enum fstrans_state old_state;
696	struct fstrans_lwp_info *fli;
697	struct fstrans_mount_info *fmi;
698
699	KASSERT(mp != dead_rootmount);
700
701	fli = fstrans_get_lwp_info(mp, true);
702	if (fli == NULL)
703		return ENOENT;
704	fmi = fli->fli_mountinfo;
705	old_state = fmi->fmi_state;
706	if (old_state == new_state)
707		return 0;
708
709	mutex_enter(&fstrans_lock);
710	fmi->fmi_state = new_state;
711	pserialize_perform(fstrans_psz);
712
713	/*
714	 * All threads see the new state now.
715	 * Wait for transactions invalid at this state to leave.
716	 */
717	error = 0;
718	while (! state_change_done(fmi)) {
719		error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
720		if (error) {
721			new_state = fmi->fmi_state = FSTRANS_NORMAL;
722			break;
723		}
724	}
725	if (old_state != new_state) {
726		if (old_state == FSTRANS_NORMAL) {
727			KASSERT(fmi->fmi_owner == NULL);
728			fmi->fmi_owner = curlwp;
729		}
730		if (new_state == FSTRANS_NORMAL) {
731			KASSERT(fmi->fmi_owner == curlwp);
732			fmi->fmi_owner = NULL;
733		}
734	}
735	cv_broadcast(&fstrans_state_cv);
736	mutex_exit(&fstrans_lock);
737
738	return error;
739}
740
741/*
742 * Get current file system state.
743 */
744enum fstrans_state
745fstrans_getstate(struct mount *mp)
746{
747	struct fstrans_lwp_info *fli;
748	struct fstrans_mount_info *fmi;
749
750	KASSERT(mp != dead_rootmount);
751
752	fli = fstrans_get_lwp_info(mp, true);
753	KASSERT(fli != NULL);
754	fmi = fli->fli_mountinfo;
755
756	return fmi->fmi_state;
757}
758
759/*
760 * Request a filesystem to suspend all operations.
761 */
762int
763vfs_suspend(struct mount *mp, int nowait)
764{
765	struct fstrans_lwp_info *fli;
766	int error;
767
768	if (mp == dead_rootmount)
769		return EOPNOTSUPP;
770
771	fli = fstrans_get_lwp_info(mp, true);
772	if (fli == NULL)
773		return ENOENT;
774
775	if (nowait) {
776		if (!mutex_tryenter(&vfs_suspend_lock))
777			return EWOULDBLOCK;
778	} else
779		mutex_enter(&vfs_suspend_lock);
780
781	if ((error = VFS_SUSPENDCTL(fli->fli_mount, SUSPEND_SUSPEND)) != 0) {
782		mutex_exit(&vfs_suspend_lock);
783		return error;
784	}
785
786	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
787		vfs_resume(mp);
788		return ENOENT;
789	}
790
791	return 0;
792}
793
794/*
795 * Request a filesystem to resume all operations.
796 */
797void
798vfs_resume(struct mount *mp)
799{
800	struct fstrans_lwp_info *fli;
801
802	KASSERT(mp != dead_rootmount);
803
804	fli = fstrans_get_lwp_info(mp, false);
805	mp = fli->fli_mount;
806
807	VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
808	mutex_exit(&vfs_suspend_lock);
809}
810
811
812/*
813 * True, if no thread is running a cow handler.
814 */
815static bool
816cow_state_change_done(const struct fstrans_mount_info *fmi)
817{
818	struct fstrans_lwp_info *fli;
819
820	KASSERT(mutex_owned(&fstrans_lock));
821	KASSERT(fmi->fmi_cow_change);
822
823	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
824		if (fli->fli_mount != fmi->fmi_mount)
825			continue;
826		if (fli->fli_cow_cnt == 0)
827			continue;
828
829		return false;
830	}
831
832	return true;
833}
834
835/*
836 * Prepare for changing this mounts cow list.
837 * Returns with fstrans_lock locked.
838 */
839static void
840cow_change_enter(struct fstrans_mount_info *fmi)
841{
842
843	mutex_enter(&fstrans_lock);
844
845	/*
846	 * Wait for other threads changing the list.
847	 */
848	while (fmi->fmi_cow_change)
849		cv_wait(&fstrans_state_cv, &fstrans_lock);
850
851	/*
852	 * Wait until all threads are aware of a state change.
853	 */
854	fmi->fmi_cow_change = true;
855	pserialize_perform(fstrans_psz);
856
857	while (! cow_state_change_done(fmi))
858		cv_wait(&fstrans_count_cv, &fstrans_lock);
859}
860
861/*
862 * Done changing this mounts cow list.
863 */
864static void
865cow_change_done(struct fstrans_mount_info *fmi)
866{
867
868	KASSERT(mutex_owned(&fstrans_lock));
869
870	fmi->fmi_cow_change = false;
871	pserialize_perform(fstrans_psz);
872
873	cv_broadcast(&fstrans_state_cv);
874
875	mutex_exit(&fstrans_lock);
876}
877
878/*
879 * Add a handler to this mount.
880 */
881int
882fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
883    void *arg)
884{
885	struct fstrans_mount_info *fmi;
886	struct fscow_handler *newch;
887
888	KASSERT(mp != dead_rootmount);
889
890	mutex_enter(&fstrans_lock);
891	fmi = fstrans_mount_get(mp);
892	KASSERT(fmi != NULL);
893	fmi->fmi_ref_cnt += 1;
894	mutex_exit(&fstrans_lock);
895
896	newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
897	newch->ch_func = func;
898	newch->ch_arg = arg;
899
900	cow_change_enter(fmi);
901	LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
902	cow_change_done(fmi);
903
904	return 0;
905}
906
907/*
908 * Remove a handler from this mount.
909 */
910int
911fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
912    void *arg)
913{
914	struct fstrans_mount_info *fmi;
915	struct fscow_handler *hp = NULL;
916
917	KASSERT(mp != dead_rootmount);
918
919	mutex_enter(&fstrans_lock);
920	fmi = fstrans_mount_get(mp);
921	KASSERT(fmi != NULL);
922	mutex_exit(&fstrans_lock);
923
924	cow_change_enter(fmi);
925	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
926		if (hp->ch_func == func && hp->ch_arg == arg)
927			break;
928	if (hp != NULL) {
929		LIST_REMOVE(hp, ch_list);
930		kmem_free(hp, sizeof(*hp));
931	}
932	fstrans_mount_dtor(fmi);
933	cow_change_done(fmi);
934
935	return hp ? 0 : EINVAL;
936}
937
938/*
939 * Check for need to copy block that is about to be written.
940 */
941int
942fscow_run(struct buf *bp, bool data_valid)
943{
944	int error, s;
945	struct mount *mp;
946	struct fstrans_lwp_info *fli;
947	struct fstrans_mount_info *fmi;
948	struct fscow_handler *hp;
949
950	/*
951	 * First check if we need run the copy-on-write handler.
952	 */
953	if ((bp->b_flags & B_COWDONE))
954		return 0;
955	if (bp->b_vp == NULL) {
956		bp->b_flags |= B_COWDONE;
957		return 0;
958	}
959	if (bp->b_vp->v_type == VBLK)
960		mp = spec_node_getmountedfs(bp->b_vp);
961	else
962		mp = bp->b_vp->v_mount;
963	if (mp == NULL || mp == dead_rootmount) {
964		bp->b_flags |= B_COWDONE;
965		return 0;
966	}
967
968	fli = fstrans_get_lwp_info(mp, true);
969	KASSERT(fli != NULL);
970	fmi = fli->fli_mountinfo;
971
972	/*
973	 * On non-recursed run check if other threads
974	 * want to change the list.
975	 */
976	if (fli->fli_cow_cnt == 0) {
977		s = pserialize_read_enter();
978		if (__predict_false(fmi->fmi_cow_change)) {
979			pserialize_read_exit(s);
980			mutex_enter(&fstrans_lock);
981			while (fmi->fmi_cow_change)
982				cv_wait(&fstrans_state_cv, &fstrans_lock);
983			fli->fli_cow_cnt = 1;
984			mutex_exit(&fstrans_lock);
985		} else {
986			fli->fli_cow_cnt = 1;
987			pserialize_read_exit(s);
988		}
989	} else
990		fli->fli_cow_cnt += 1;
991
992	/*
993	 * Run all copy-on-write handlers, stop on error.
994	 */
995	error = 0;
996	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
997		if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
998			break;
999 	if (error == 0)
1000 		bp->b_flags |= B_COWDONE;
1001
1002	/*
1003	 * Check if other threads want to change the list.
1004	 */
1005	if (fli->fli_cow_cnt > 1) {
1006		fli->fli_cow_cnt -= 1;
1007	} else {
1008		s = pserialize_read_enter();
1009		if (__predict_false(fmi->fmi_cow_change)) {
1010			pserialize_read_exit(s);
1011			mutex_enter(&fstrans_lock);
1012			fli->fli_cow_cnt = 0;
1013			cv_signal(&fstrans_count_cv);
1014			mutex_exit(&fstrans_lock);
1015		} else {
1016			fli->fli_cow_cnt = 0;
1017			pserialize_read_exit(s);
1018		}
1019	}
1020
1021	return error;
1022}
1023
1024#if defined(DDB)
/* Debugger entry point, defined below. */
void fstrans_dump(int full);
1026
1027static void
1028fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
1029{
1030	char prefix[9];
1031	struct fstrans_lwp_info *fli;
1032
1033	snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
1034	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
1035		if (fli->fli_self != l)
1036			continue;
1037		if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
1038			if (! verbose)
1039				continue;
1040		}
1041		printf("%-8s", prefix);
1042		if (verbose)
1043			printf(" @%p", fli);
1044		if (fli->fli_mount == dead_rootmount)
1045			printf(" <dead>");
1046		else if (fli->fli_mount != NULL)
1047			printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
1048		else
1049			printf(" NULL");
1050		if (fli->fli_alias != NULL) {
1051			struct mount *amp = fli->fli_alias->fli_mount;
1052
1053			printf(" alias");
1054			if (verbose)
1055				printf(" @%p", fli->fli_alias);
1056			if (amp == NULL)
1057				printf(" NULL");
1058			else
1059				printf(" (%s)", amp->mnt_stat.f_mntonname);
1060		}
1061		if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
1062			printf(" gone");
1063		if (fli->fli_trans_cnt == 0) {
1064			printf(" -");
1065		} else {
1066			switch (fli->fli_lock_type) {
1067			case FSTRANS_LAZY:
1068				printf(" lazy");
1069				break;
1070			case FSTRANS_SHARED:
1071				printf(" shared");
1072				break;
1073			default:
1074				printf(" %#x", fli->fli_lock_type);
1075				break;
1076			}
1077		}
1078		printf(" %d cow %d alias %d\n",
1079		    fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
1080		prefix[0] = '\0';
1081	}
1082}
1083
1084static void
1085fstrans_print_mount(struct mount *mp, int verbose)
1086{
1087	uint32_t indx;
1088	struct fstrans_mount_info *fmi;
1089
1090	indx = fstrans_mount_hash(mp);
1091	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash)
1092		if (fmi->fmi_mount == mp)
1093			break;
1094
1095	if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
1096		return;
1097
1098	printf("%-16s ", mp->mnt_stat.f_mntonname);
1099	if (fmi == NULL) {
1100		printf("(null)\n");
1101		return;
1102	}
1103	printf("owner %p ", fmi->fmi_owner);
1104	switch (fmi->fmi_state) {
1105	case FSTRANS_NORMAL:
1106		printf("state normal\n");
1107		break;
1108	case FSTRANS_SUSPENDING:
1109		printf("state suspending\n");
1110		break;
1111	case FSTRANS_SUSPENDED:
1112		printf("state suspended\n");
1113		break;
1114	default:
1115		printf("state %#x\n", fmi->fmi_state);
1116		break;
1117	}
1118}
1119
1120void
1121fstrans_dump(int full)
1122{
1123	const struct proclist_desc *pd;
1124	struct proc *p;
1125	struct lwp *l;
1126	struct mount *mp;
1127
1128	printf("Fstrans locks by lwp:\n");
1129	for (pd = proclists; pd->pd_list != NULL; pd++)
1130		PROCLIST_FOREACH(p, pd->pd_list)
1131			LIST_FOREACH(l, &p->p_lwps, l_sibling)
1132				fstrans_print_lwp(p, l, full == 1);
1133
1134	printf("Fstrans state by mount:\n");
1135	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
1136		fstrans_print_mount(mp, full == 1);
1137}
1138#endif /* defined(DDB) */
1139