vfs_trans.c revision 1.64
1/*	$NetBSD: vfs_trans.c,v 1.64 2022/06/28 00:13:48 riastradh Exp $	*/
2
3/*-
4 * Copyright (c) 2007, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Juergen Hannken-Illjes.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.64 2022/06/28 00:13:48 riastradh Exp $");
34
35/*
36 * File system transaction operations.
37 */
38
39#ifdef _KERNEL_OPT
40#include "opt_ddb.h"
41#endif
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/atomic.h>
46#include <sys/buf.h>
47#include <sys/kmem.h>
48#include <sys/mount.h>
49#include <sys/pserialize.h>
50#include <sys/vnode.h>
51#include <sys/fstrans.h>
52#include <sys/proc.h>
53#include <sys/pool.h>
54
55#include <miscfs/specfs/specdev.h>
56
/*
 * Kind of transaction an lwp asks for.  grant_lock() decides whether a
 * given type may proceed in the mount's current fstrans state.
 */
enum fstrans_lock_type {
	FSTRANS_LAZY,			/* Granted while not suspended */
	FSTRANS_SHARED			/* Granted while not suspending */
};
61
/*
 * One copy-on-write handler registered on a mount by fscow_establish().
 * fscow_run() invokes it as ch_func(ch_arg, bp, data_valid).
 */
struct fscow_handler {
	LIST_ENTRY(fscow_handler) ch_list;	/* Link on fmi_cow_handler */
	int (*ch_func)(void *, struct buf *, bool);	/* Handler callback */
	void *ch_arg;				/* First argument for ch_func */
};
/*
 * Per-lwp, per-mount transaction state.  An lwp's entries are chained
 * through fli_succ starting at its l_fstrans; every entry constructed
 * by the pool cache is also linked on the global fstrans_fli_head list.
 */
struct fstrans_lwp_info {
	struct fstrans_lwp_info *fli_succ;	/* Next entry of the same lwp */
	struct lwp *fli_self;			/* Owning lwp, NULL when unused */
	struct mount *fli_mount;		/* Mount this entry belongs to */
	struct fstrans_lwp_info *fli_alias;	/* Entry of the lowest mount in
						 * the mnt_lower chain, or NULL */
	struct fstrans_mount_info *fli_mountinfo; /* Referenced mount state */
	int fli_trans_cnt;			/* Transaction nesting depth */
	int fli_alias_cnt;			/* Entries whose fli_alias is us */
	int fli_cow_cnt;			/* fscow_run() nesting depth */
	enum fstrans_lock_type fli_lock_type;	/* Type of the held transaction */
	LIST_ENTRY(fstrans_lwp_info) fli_list;	/* Link on fstrans_fli_head */
};
/*
 * Per-mount transaction state.  It is reference counted: fstrans_mount()
 * starts it at one, every attached fstrans_lwp_info and every handler
 * registered by fscow_establish() adds one, and fstrans_mount_dtor()
 * frees it (together with fmi_mount) when the last reference goes.
 */
struct fstrans_mount_info {
	enum fstrans_state fmi_state;	/* Current suspension state */
	unsigned int fmi_ref_cnt;	/* References held on this state */
	bool fmi_gone;			/* Set by fstrans_unmount() */
	bool fmi_cow_change;		/* Handler list is being changed */
	LIST_HEAD(, fscow_handler) fmi_cow_handler; /* Registered handlers */
	struct mount *fmi_mount;	/* Mount this state describes */
	struct lwp *fmi_owner;		/* Lwp that left FSTRANS_NORMAL */
};
88
/*
 * File-scope state shared by all mounts; set up in fstrans_init().
 */
static kmutex_t vfs_suspend_lock	/* Serialize suspensions. */
    __cacheline_aligned;
static kmutex_t fstrans_lock		/* Fstrans big lock. */
    __cacheline_aligned;
static kcondvar_t fstrans_state_cv;	/* Fstrans or cow state changed. */
static kcondvar_t fstrans_count_cv;	/* Fstrans or cow count changed. */
static pserialize_t fstrans_psz;	/* Pserialize state. */
static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
					/* List of all fstrans_lwp_info. */
static pool_cache_t fstrans_lwp_cache;	/* Cache of fstrans_lwp_info. */

/*
 * Incremented by fstrans_unmount() and decremented when the last
 * reference on that mount's state is dropped; while non-zero,
 * fstrans_done() scrubs the calling lwp's stale entries.
 */
static int fstrans_gone_count;		/* Number of fstrans_mount_info gone. */
101
/* Forward declarations of the file-local helpers defined below. */
static void fstrans_mount_dtor(struct fstrans_mount_info *);
static void fstrans_clear_lwp_info(void);
static inline struct fstrans_lwp_info *
    fstrans_get_lwp_info(struct mount *, bool);
static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
/* Pool cache constructor and destructor for struct fstrans_lwp_info. */
static int fstrans_lwp_pcc(void *, void *, int);
static void fstrans_lwp_pcd(void *, void *);
static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
static bool grant_lock(const struct fstrans_mount_info *,
    const enum fstrans_lock_type);
static bool state_change_done(const struct fstrans_mount_info *);
static bool cow_state_change_done(const struct fstrans_mount_info *);
static void cow_change_enter(struct fstrans_mount_info *);
static void cow_change_done(struct fstrans_mount_info *);

/*
 * Defined outside this file.  Transactions on it are skipped unless
 * FSTRANS_DEAD_ENABLED is defined; the other entry points assert or
 * reject it.
 */
extern struct mount *dead_rootmount;
118
119#if defined(DIAGNOSTIC)
120
/*
 * DIAGNOSTIC only: record of one mount that currently has fstrans state,
 * so that use of an unknown mount can be asserted against.
 */
struct fstrans_debug_mount {
	struct mount *fdm_mount;		/* The registered mount */
	SLIST_ENTRY(fstrans_debug_mount) fdm_list; /* Link on the list below */
};

/* All registered mounts; accessed with fstrans_lock held. */
static SLIST_HEAD(, fstrans_debug_mount) fstrans_debug_mount_head =
    SLIST_HEAD_INITIALIZER(fstrans_debug_mount_head);
128
129static void
130fstrans_debug_mount(struct mount *mp)
131{
132	struct fstrans_debug_mount *fdm, *new;
133
134	KASSERT(mutex_owned(&fstrans_lock));
135
136	mutex_exit(&fstrans_lock);
137	new = kmem_alloc(sizeof(*new), KM_SLEEP);
138	new->fdm_mount = mp;
139	mutex_enter(&fstrans_lock);
140
141	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
142		KASSERT(fdm->fdm_mount != mp);
143	SLIST_INSERT_HEAD(&fstrans_debug_mount_head, new, fdm_list);
144}
145
146static void
147fstrans_debug_unmount(struct mount *mp)
148{
149	struct fstrans_debug_mount *fdm;
150
151	KASSERT(mutex_owned(&fstrans_lock));
152
153	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
154		if (fdm->fdm_mount == mp)
155			break;
156	KASSERT(fdm != NULL);
157	SLIST_REMOVE(&fstrans_debug_mount_head, fdm,
158	    fstrans_debug_mount, fdm_list);
159	kmem_free(fdm, sizeof(*fdm));
160}
161
162static void
163fstrans_debug_validate_mount(struct mount *mp)
164{
165	struct fstrans_debug_mount *fdm;
166
167	KASSERT(mutex_owned(&fstrans_lock));
168
169	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
170		if (fdm->fdm_mount == mp)
171			break;
172	KASSERTMSG(fdm != NULL, "mount %p invalid", mp);
173}
174
175#else /* defined(DIAGNOSTIC) */
176
/* Without DIAGNOSTIC the mount bookkeeping hooks expand to nothing. */
#define fstrans_debug_mount(mp)
#define fstrans_debug_unmount(mp)
#define fstrans_debug_validate_mount(mp)
180
181#endif  /* defined(DIAGNOSTIC) */
182
183/*
184 * Initialize.
185 */
186void
187fstrans_init(void)
188{
189
190	mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
191	mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
192	cv_init(&fstrans_state_cv, "fstchg");
193	cv_init(&fstrans_count_cv, "fstcnt");
194	fstrans_psz = pserialize_create();
195	LIST_INIT(&fstrans_fli_head);
196	fstrans_lwp_cache = pool_cache_init(sizeof(struct fstrans_lwp_info),
197	    coherency_unit, 0, 0, "fstlwp", NULL, IPL_NONE,
198	    fstrans_lwp_pcc, fstrans_lwp_pcd, NULL);
199	KASSERT(fstrans_lwp_cache != NULL);
200}
201
202/*
203 * pool_cache constructor for fstrans_lwp_info.  Updating the global list
204 * produces cache misses on MP.  Minimise by keeping free entries on list.
205 */
206int
207fstrans_lwp_pcc(void *arg, void *obj, int flags)
208{
209	struct fstrans_lwp_info *fli = obj;
210
211	memset(fli, 0, sizeof(*fli));
212
213	mutex_enter(&fstrans_lock);
214	LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
215	mutex_exit(&fstrans_lock);
216
217	return 0;
218}
219
220/*
221 * pool_cache destructor
222 */
223void
224fstrans_lwp_pcd(void *arg, void *obj)
225{
226	struct fstrans_lwp_info *fli = obj;
227
228	mutex_enter(&fstrans_lock);
229	LIST_REMOVE(fli, fli_list);
230	mutex_exit(&fstrans_lock);
231}
232
233/*
234 * Deallocate lwp state.
235 */
236void
237fstrans_lwp_dtor(lwp_t *l)
238{
239	struct fstrans_lwp_info *fli, *fli_next;
240
241	if (l->l_fstrans == NULL)
242		return;
243
244	mutex_enter(&fstrans_lock);
245	for (fli = l->l_fstrans; fli; fli = fli_next) {
246		KASSERT(fli->fli_trans_cnt == 0);
247		KASSERT(fli->fli_cow_cnt == 0);
248		KASSERT(fli->fli_self == l);
249		if (fli->fli_mount != NULL)
250			fstrans_mount_dtor(fli->fli_mountinfo);
251		fli_next = fli->fli_succ;
252		fli->fli_alias_cnt = 0;
253		fli->fli_mount = NULL;
254		fli->fli_alias = NULL;
255		fli->fli_mountinfo = NULL;
256		fli->fli_self = NULL;
257	}
258	mutex_exit(&fstrans_lock);
259
260	for (fli = l->l_fstrans; fli; fli = fli_next) {
261		fli_next = fli->fli_succ;
262		pool_cache_put(fstrans_lwp_cache, fli);
263	}
264	l->l_fstrans = NULL;
265}
266
267/*
268 * Dereference mount state.
269 */
270static void
271fstrans_mount_dtor(struct fstrans_mount_info *fmi)
272{
273
274	KASSERT(mutex_owned(&fstrans_lock));
275
276	KASSERT(fmi != NULL);
277	fmi->fmi_ref_cnt -= 1;
278	if (__predict_true(fmi->fmi_ref_cnt > 0)) {
279		return;
280	}
281
282	KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
283	KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
284	KASSERT(fmi->fmi_owner == NULL);
285
286	KASSERT(fstrans_gone_count > 0);
287	fstrans_gone_count -= 1;
288
289	kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
290	kmem_free(fmi, sizeof(*fmi));
291}
292
293/*
294 * Allocate mount state.
295 */
296int
297fstrans_mount(struct mount *mp)
298{
299	struct fstrans_mount_info *newfmi;
300
301	newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
302	newfmi->fmi_state = FSTRANS_NORMAL;
303	newfmi->fmi_ref_cnt = 1;
304	newfmi->fmi_gone = false;
305	LIST_INIT(&newfmi->fmi_cow_handler);
306	newfmi->fmi_cow_change = false;
307	newfmi->fmi_mount = mp;
308	newfmi->fmi_owner = NULL;
309
310	mutex_enter(&fstrans_lock);
311	mp->mnt_transinfo = newfmi;
312	fstrans_debug_mount(mp);
313	mutex_exit(&fstrans_lock);
314
315	return 0;
316}
317
318/*
319 * Deallocate mount state.
320 */
321void
322fstrans_unmount(struct mount *mp)
323{
324	struct fstrans_mount_info *fmi = mp->mnt_transinfo;
325
326	KASSERT(fmi != NULL);
327
328	mutex_enter(&fstrans_lock);
329	fstrans_debug_unmount(mp);
330	fmi->fmi_gone = true;
331	mp->mnt_transinfo = NULL;
332	fstrans_gone_count += 1;
333	fstrans_mount_dtor(fmi);
334	mutex_exit(&fstrans_lock);
335}
336
/*
 * Clear mount entries whose mount is gone.
 *
 * Walks the calling lwp's entry list and removes every idle entry
 * (no transaction, no cow run, not aliased) whose mount state has been
 * marked gone by fstrans_unmount().  The mount reference of each removed
 * entry is dropped under fstrans_lock; the entries themselves are
 * returned to the pool cache after the lock is released.
 */
static void
fstrans_clear_lwp_info(void)
{
	/* p points at the link that leads to the entry under inspection. */
	struct fstrans_lwp_info **p, *fli, *tofree = NULL;

	/*
	 * Scan our list clearing entries whose mount is gone.
	 */
	mutex_enter(&fstrans_lock);
	for (p = &curlwp->l_fstrans; *p; ) {
		fli = *p;
		if (fli->fli_mount != NULL &&
		    fli->fli_mountinfo->fmi_gone &&
		    fli->fli_trans_cnt == 0 &&
		    fli->fli_cow_cnt == 0 &&
		    fli->fli_alias_cnt == 0) {
			/* Unlink from the lwp's list, then drop the mount ref. */
			*p = (*p)->fli_succ;
			fstrans_mount_dtor(fli->fli_mountinfo);
			if (fli->fli_alias) {
				KASSERT(fli->fli_alias->fli_alias_cnt > 0);
				fli->fli_alias->fli_alias_cnt--;
			}
			fli->fli_mount = NULL;
			fli->fli_alias = NULL;
			fli->fli_mountinfo = NULL;
			fli->fli_self = NULL;
			/*
			 * Restart from the head: the alias count decrement
			 * above may have made an entry we already passed
			 * eligible for removal.
			 */
			p = &curlwp->l_fstrans;
			/* Reuse fli_succ to chain the entries to be freed. */
			fli->fli_succ = tofree;
			tofree = fli;
		} else {
			p = &(*p)->fli_succ;
		}
	}
#ifdef DIAGNOSTIC
	/* Every surviving alias must still point at one of our own entries. */
	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
		if (fli->fli_alias != NULL)
			KASSERT(fli->fli_alias->fli_self == curlwp);
#endif /* DIAGNOSTIC */
	mutex_exit(&fstrans_lock);

	/* Hand the removed entries back to the cache, lock no longer held. */
	while (tofree != NULL) {
		fli = tofree;
		tofree = fli->fli_succ;
		pool_cache_put(fstrans_lwp_cache, fli);
	}
}
386
387/*
388 * Allocate and return per lwp info for this mount.
389 */
390static struct fstrans_lwp_info *
391fstrans_alloc_lwp_info(struct mount *mp)
392{
393	struct fstrans_lwp_info *fli;
394	struct fstrans_mount_info *fmi;
395
396	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
397		if (fli->fli_mount == mp)
398			return fli;
399	}
400
401	/*
402	 * Allocate a new entry.
403	 */
404	fli = pool_cache_get(fstrans_lwp_cache, PR_WAITOK);
405	KASSERT(fli->fli_trans_cnt == 0);
406	KASSERT(fli->fli_cow_cnt == 0);
407	KASSERT(fli->fli_alias_cnt == 0);
408	KASSERT(fli->fli_mount == NULL);
409	KASSERT(fli->fli_alias == NULL);
410	KASSERT(fli->fli_mountinfo == NULL);
411	KASSERT(fli->fli_self == NULL);
412	fli->fli_succ = curlwp->l_fstrans;
413	curlwp->l_fstrans = fli;
414
415	/*
416	 * Attach the entry to the mount if its mnt_transinfo is valid.
417	 */
418
419	mutex_enter(&fstrans_lock);
420	fli->fli_self = curlwp;
421	fstrans_debug_validate_mount(mp);
422	fmi = mp->mnt_transinfo;
423	KASSERT(fmi != NULL);
424	fli->fli_mount = mp;
425	fli->fli_mountinfo = fmi;
426	fmi->fmi_ref_cnt += 1;
427	do {
428		mp = mp->mnt_lower;
429	} while (mp && mp->mnt_lower);
430	mutex_exit(&fstrans_lock);
431
432	if (mp) {
433		fli->fli_alias = fstrans_alloc_lwp_info(mp);
434		fli->fli_alias->fli_alias_cnt++;
435		fli = fli->fli_alias;
436	}
437
438	return fli;
439}
440
441/*
442 * Retrieve the per lwp info for this mount allocating if necessary.
443 */
444static inline struct fstrans_lwp_info *
445fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
446{
447	struct fstrans_lwp_info *fli;
448
449	/*
450	 * Scan our list for a match.
451	 */
452	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
453		if (fli->fli_mount == mp) {
454			KASSERT((mp->mnt_lower == NULL) ==
455			    (fli->fli_alias == NULL));
456			if (fli->fli_alias != NULL)
457				fli = fli->fli_alias;
458			break;
459		}
460	}
461
462	if (do_alloc) {
463		if (__predict_false(fli == NULL))
464			fli = fstrans_alloc_lwp_info(mp);
465		KASSERT(fli != NULL);
466		KASSERT(!fli->fli_mountinfo->fmi_gone);
467	} else {
468		KASSERT(fli != NULL);
469	}
470
471	return fli;
472}
473
474/*
475 * Check if this lock type is granted at this state.
476 */
477static bool
478grant_lock(const struct fstrans_mount_info *fmi,
479    const enum fstrans_lock_type type)
480{
481
482	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL))
483		return true;
484	if (fmi->fmi_owner == curlwp)
485		return true;
486	if  (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
487		return true;
488
489	return false;
490}
491
/*
 * Start a transaction.  If this thread already has a transaction on this
 * file system increment the reference counter.
 *
 * mp is the mount to enter; dead_rootmount is accepted and ignored
 * unless FSTRANS_DEAD_ENABLED is defined.  lock_type is checked against
 * the mount state by grant_lock().  If wait is zero and the transaction
 * cannot be granted right away EBUSY is returned, otherwise the caller
 * sleeps until it can.  Returns 0 once the transaction is started.
 */
static inline int
_fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
{
	int s;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

#ifndef FSTRANS_DEAD_ENABLED
	if (mp == dead_rootmount)
		return 0;
#endif

	ASSERT_SLEEPABLE();

	/* For a layered mount this is the alias entry of the lowest mount. */
	fli = fstrans_get_lwp_info(mp, true);
	fmi = fli->fli_mountinfo;

	/* Nested start: only the nesting depth changes. */
	if (fli->fli_trans_cnt > 0) {
		fli->fli_trans_cnt += 1;

		return 0;
	}

	/*
	 * Fast path: test the state and record the transaction inside one
	 * pserialize read section, without taking fstrans_lock.
	 */
	s = pserialize_read_enter();
	if (__predict_true(grant_lock(fmi, lock_type))) {
		fli->fli_trans_cnt = 1;
		fli->fli_lock_type = lock_type;
		pserialize_read_exit(s);

		return 0;
	}
	pserialize_read_exit(s);

	if (! wait)
		return EBUSY;

	/*
	 * Slow path: sleep under fstrans_lock until the state allows this
	 * lock type; fstrans_setstate() broadcasts fstrans_state_cv.
	 */
	mutex_enter(&fstrans_lock);
	while (! grant_lock(fmi, lock_type))
		cv_wait(&fstrans_state_cv, &fstrans_lock);
	fli->fli_trans_cnt = 1;
	fli->fli_lock_type = lock_type;
	mutex_exit(&fstrans_lock);

	return 0;
}
541
542void
543fstrans_start(struct mount *mp)
544{
545	int error __diagused;
546
547	error = _fstrans_start(mp, FSTRANS_SHARED, 1);
548	KASSERT(error == 0);
549}
550
551int
552fstrans_start_nowait(struct mount *mp)
553{
554
555	return _fstrans_start(mp, FSTRANS_SHARED, 0);
556}
557
558void
559fstrans_start_lazy(struct mount *mp)
560{
561	int error __diagused;
562
563	error = _fstrans_start(mp, FSTRANS_LAZY, 1);
564	KASSERT(error == 0);
565}
566
567/*
568 * Finish a transaction.
569 */
570void
571fstrans_done(struct mount *mp)
572{
573	int s;
574	struct fstrans_lwp_info *fli;
575	struct fstrans_mount_info *fmi;
576
577#ifndef FSTRANS_DEAD_ENABLED
578	if (mp == dead_rootmount)
579		return;
580#endif
581
582	fli = fstrans_get_lwp_info(mp, false);
583	fmi = fli->fli_mountinfo;
584	KASSERT(fli->fli_trans_cnt > 0);
585
586	if (fli->fli_trans_cnt > 1) {
587		fli->fli_trans_cnt -= 1;
588
589		return;
590	}
591
592	if (__predict_false(fstrans_gone_count > 0))
593		fstrans_clear_lwp_info();
594
595	s = pserialize_read_enter();
596	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
597		fli->fli_trans_cnt = 0;
598		pserialize_read_exit(s);
599
600		return;
601	}
602	pserialize_read_exit(s);
603
604	mutex_enter(&fstrans_lock);
605	fli->fli_trans_cnt = 0;
606	cv_signal(&fstrans_count_cv);
607	mutex_exit(&fstrans_lock);
608}
609
610/*
611 * Check if we hold an lock.
612 */
613int
614fstrans_held(struct mount *mp)
615{
616	struct fstrans_lwp_info *fli;
617	struct fstrans_mount_info *fmi;
618
619	KASSERT(mp != dead_rootmount);
620
621	fli = fstrans_get_lwp_info(mp, true);
622	fmi = fli->fli_mountinfo;
623
624	return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp);
625}
626
627/*
628 * Check if this thread has an exclusive lock.
629 */
630int
631fstrans_is_owner(struct mount *mp)
632{
633	struct fstrans_lwp_info *fli;
634	struct fstrans_mount_info *fmi;
635
636	KASSERT(mp != dead_rootmount);
637
638	fli = fstrans_get_lwp_info(mp, true);
639	fmi = fli->fli_mountinfo;
640
641	return (fmi->fmi_owner == curlwp);
642}
643
644/*
645 * True, if no thread is in a transaction not granted at the current state.
646 */
647static bool
648state_change_done(const struct fstrans_mount_info *fmi)
649{
650	struct fstrans_lwp_info *fli;
651
652	KASSERT(mutex_owned(&fstrans_lock));
653
654	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
655		if (fli->fli_mountinfo != fmi)
656			continue;
657		if (fli->fli_trans_cnt == 0)
658			continue;
659		if (fli->fli_self == curlwp)
660			continue;
661		if (grant_lock(fmi, fli->fli_lock_type))
662			continue;
663
664		return false;
665	}
666
667	return true;
668}
669
/*
 * Set new file system state.
 *
 * Publishes new_state, then waits until every transaction that is not
 * permitted in that state has finished.  The calling lwp becomes owner
 * of the mount state when leaving FSTRANS_NORMAL and gives ownership up
 * when returning to it.
 *
 * Returns 0 on success (also if the state is unchanged).  If the wait
 * is interrupted the error from cv_wait_sig() is returned and the state
 * is set back to FSTRANS_NORMAL.
 */
int
fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
{
	int error;
	enum fstrans_state old_state;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;

	KASSERT(mp != dead_rootmount);

	fli = fstrans_get_lwp_info(mp, true);
	fmi = fli->fli_mountinfo;
	/*
	 * NOTE(review): the old state is sampled before fstrans_lock is
	 * taken; presumably callers are serialized by other means
	 * (e.g. vfs_suspend_lock) -- confirm.
	 */
	old_state = fmi->fmi_state;
	if (old_state == new_state)
		return 0;

	mutex_enter(&fstrans_lock);
	fmi->fmi_state = new_state;
	pserialize_perform(fstrans_psz);

	/*
	 * All threads see the new state now.
	 * Wait for transactions invalid at this state to leave.
	 */
	error = 0;
	while (! state_change_done(fmi)) {
		error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
		if (error) {
			/*
			 * Interrupted: give up and fall back to normal
			 * operation.  new_state is updated too so the
			 * ownership handling below sees the final state.
			 */
			new_state = fmi->fmi_state = FSTRANS_NORMAL;
			break;
		}
	}
	if (old_state != new_state) {
		/* Leaving NORMAL: this lwp now owns the state change. */
		if (old_state == FSTRANS_NORMAL) {
			KASSERT(fmi->fmi_owner == NULL);
			fmi->fmi_owner = curlwp;
		}
		/* Back to NORMAL: only the owner may get here. */
		if (new_state == FSTRANS_NORMAL) {
			KASSERT(fmi->fmi_owner == curlwp);
			fmi->fmi_owner = NULL;
		}
	}
	/* Wake threads sleeping in _fstrans_start() so they re-check. */
	cv_broadcast(&fstrans_state_cv);
	mutex_exit(&fstrans_lock);

	return error;
}
720
721/*
722 * Get current file system state.
723 */
724enum fstrans_state
725fstrans_getstate(struct mount *mp)
726{
727	struct fstrans_lwp_info *fli;
728	struct fstrans_mount_info *fmi;
729
730	KASSERT(mp != dead_rootmount);
731
732	fli = fstrans_get_lwp_info(mp, true);
733	fmi = fli->fli_mountinfo;
734
735	return fmi->fmi_state;
736}
737
738/*
739 * Request a filesystem to suspend all operations.
740 */
741int
742vfs_suspend(struct mount *mp, int nowait)
743{
744	struct fstrans_lwp_info *fli;
745	int error;
746
747	if (mp == dead_rootmount)
748		return EOPNOTSUPP;
749
750	fli = fstrans_get_lwp_info(mp, true);
751	mp = fli->fli_mount;
752
753	if (nowait) {
754		if (!mutex_tryenter(&vfs_suspend_lock))
755			return EWOULDBLOCK;
756	} else
757		mutex_enter(&vfs_suspend_lock);
758
759	if ((error = VFS_SUSPENDCTL(mp, SUSPEND_SUSPEND)) != 0)
760		mutex_exit(&vfs_suspend_lock);
761
762	return error;
763}
764
765/*
766 * Request a filesystem to resume all operations.
767 */
768void
769vfs_resume(struct mount *mp)
770{
771	struct fstrans_lwp_info *fli;
772
773	KASSERT(mp != dead_rootmount);
774
775	fli = fstrans_get_lwp_info(mp, false);
776	mp = fli->fli_mount;
777
778	VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
779	mutex_exit(&vfs_suspend_lock);
780}
781
782
783/*
784 * True, if no thread is running a cow handler.
785 */
786static bool
787cow_state_change_done(const struct fstrans_mount_info *fmi)
788{
789	struct fstrans_lwp_info *fli;
790
791	KASSERT(mutex_owned(&fstrans_lock));
792	KASSERT(fmi->fmi_cow_change);
793
794	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
795		if (fli->fli_mount != fmi->fmi_mount)
796			continue;
797		if (fli->fli_cow_cnt == 0)
798			continue;
799
800		return false;
801	}
802
803	return true;
804}
805
806/*
807 * Prepare for changing this mounts cow list.
808 * Returns with fstrans_lock locked.
809 */
810static void
811cow_change_enter(struct fstrans_mount_info *fmi)
812{
813
814	mutex_enter(&fstrans_lock);
815
816	/*
817	 * Wait for other threads changing the list.
818	 */
819	while (fmi->fmi_cow_change)
820		cv_wait(&fstrans_state_cv, &fstrans_lock);
821
822	/*
823	 * Wait until all threads are aware of a state change.
824	 */
825	fmi->fmi_cow_change = true;
826	pserialize_perform(fstrans_psz);
827
828	while (! cow_state_change_done(fmi))
829		cv_wait(&fstrans_count_cv, &fstrans_lock);
830}
831
832/*
833 * Done changing this mounts cow list.
834 */
835static void
836cow_change_done(struct fstrans_mount_info *fmi)
837{
838
839	KASSERT(mutex_owned(&fstrans_lock));
840
841	fmi->fmi_cow_change = false;
842	pserialize_perform(fstrans_psz);
843
844	cv_broadcast(&fstrans_state_cv);
845
846	mutex_exit(&fstrans_lock);
847}
848
849/*
850 * Add a handler to this mount.
851 */
852int
853fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
854    void *arg)
855{
856	struct fstrans_mount_info *fmi;
857	struct fscow_handler *newch;
858
859	KASSERT(mp != dead_rootmount);
860
861	mutex_enter(&fstrans_lock);
862	fmi = mp->mnt_transinfo;
863	KASSERT(fmi != NULL);
864	fmi->fmi_ref_cnt += 1;
865	mutex_exit(&fstrans_lock);
866
867	newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
868	newch->ch_func = func;
869	newch->ch_arg = arg;
870
871	cow_change_enter(fmi);
872	LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
873	cow_change_done(fmi);
874
875	return 0;
876}
877
878/*
879 * Remove a handler from this mount.
880 */
881int
882fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
883    void *arg)
884{
885	struct fstrans_mount_info *fmi;
886	struct fscow_handler *hp = NULL;
887
888	KASSERT(mp != dead_rootmount);
889
890	fmi = mp->mnt_transinfo;
891	KASSERT(fmi != NULL);
892
893	cow_change_enter(fmi);
894	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
895		if (hp->ch_func == func && hp->ch_arg == arg)
896			break;
897	if (hp != NULL) {
898		LIST_REMOVE(hp, ch_list);
899		kmem_free(hp, sizeof(*hp));
900	}
901	fstrans_mount_dtor(fmi);
902	cow_change_done(fmi);
903
904	return hp ? 0 : EINVAL;
905}
906
/*
 * Check for need to copy block that is about to be written.
 *
 * Runs the copy-on-write handlers registered on the mount that buffer
 * bp belongs to, passing data_valid through to each of them.  Buffers
 * that already carry B_COWDONE, have no vnode, or belong to no mount
 * or to dead_rootmount are not processed.  B_COWDONE is set once all
 * handlers succeeded.
 *
 * Returns 0 on success or the first non-zero value returned by a
 * handler.
 */
int
fscow_run(struct buf *bp, bool data_valid)
{
	int error, s;
	struct mount *mp;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;
	struct fscow_handler *hp;

	/*
	 * First check if we need to run the copy-on-write handler.
	 */
	if ((bp->b_flags & B_COWDONE))
		return 0;
	if (bp->b_vp == NULL) {
		bp->b_flags |= B_COWDONE;
		return 0;
	}
	/* For a block device use the file system mounted on it. */
	if (bp->b_vp->v_type == VBLK)
		mp = spec_node_getmountedfs(bp->b_vp);
	else
		mp = bp->b_vp->v_mount;
	if (mp == NULL || mp == dead_rootmount) {
		bp->b_flags |= B_COWDONE;
		return 0;
	}

	fli = fstrans_get_lwp_info(mp, true);
	fmi = fli->fli_mountinfo;

	/*
	 * On non-recursed run check if other threads
	 * want to change the list.
	 */
	if (fli->fli_cow_cnt == 0) {
		s = pserialize_read_enter();
		if (__predict_false(fmi->fmi_cow_change)) {
			/* A list change is pending: wait for it under the lock. */
			pserialize_read_exit(s);
			mutex_enter(&fstrans_lock);
			while (fmi->fmi_cow_change)
				cv_wait(&fstrans_state_cv, &fstrans_lock);
			fli->fli_cow_cnt = 1;
			mutex_exit(&fstrans_lock);
		} else {
			/* Mark ourselves running inside the read section. */
			fli->fli_cow_cnt = 1;
			pserialize_read_exit(s);
		}
	} else
		fli->fli_cow_cnt += 1;

	/*
	 * Run all copy-on-write handlers, stop on error.
	 */
	error = 0;
	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
		if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
			break;
 	if (error == 0)
 		bp->b_flags |= B_COWDONE;

	/*
	 * Check if other threads want to change the list.
	 */
	if (fli->fli_cow_cnt > 1) {
		/* Recursed run: only unwind one level. */
		fli->fli_cow_cnt -= 1;
	} else {
		s = pserialize_read_enter();
		if (__predict_false(fmi->fmi_cow_change)) {
			/* Tell the thread waiting in cow_change_enter(). */
			pserialize_read_exit(s);
			mutex_enter(&fstrans_lock);
			fli->fli_cow_cnt = 0;
			cv_signal(&fstrans_count_cv);
			mutex_exit(&fstrans_lock);
		} else {
			fli->fli_cow_cnt = 0;
			pserialize_read_exit(s);
		}
	}

	return error;
}
991
992#if defined(DDB)
/* Dump entry point, defined at the end of this DDB-only section. */
void fstrans_dump(int);
994
995static void
996fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
997{
998	char prefix[9];
999	struct fstrans_lwp_info *fli;
1000
1001	snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
1002	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
1003		if (fli->fli_self != l)
1004			continue;
1005		if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
1006			if (! verbose)
1007				continue;
1008		}
1009		printf("%-8s", prefix);
1010		if (verbose)
1011			printf(" @%p", fli);
1012		if (fli->fli_mount == dead_rootmount)
1013			printf(" <dead>");
1014		else if (fli->fli_mount != NULL)
1015			printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
1016		else
1017			printf(" NULL");
1018		if (fli->fli_alias != NULL) {
1019			struct mount *amp = fli->fli_alias->fli_mount;
1020
1021			printf(" alias");
1022			if (verbose)
1023				printf(" @%p", fli->fli_alias);
1024			if (amp == NULL)
1025				printf(" NULL");
1026			else
1027				printf(" (%s)", amp->mnt_stat.f_mntonname);
1028		}
1029		if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
1030			printf(" gone");
1031		if (fli->fli_trans_cnt == 0) {
1032			printf(" -");
1033		} else {
1034			switch (fli->fli_lock_type) {
1035			case FSTRANS_LAZY:
1036				printf(" lazy");
1037				break;
1038			case FSTRANS_SHARED:
1039				printf(" shared");
1040				break;
1041			default:
1042				printf(" %#x", fli->fli_lock_type);
1043				break;
1044			}
1045		}
1046		printf(" %d cow %d alias %d\n",
1047		    fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
1048		prefix[0] = '\0';
1049	}
1050}
1051
1052static void
1053fstrans_print_mount(struct mount *mp, int verbose)
1054{
1055	struct fstrans_mount_info *fmi;
1056
1057	fmi = mp->mnt_transinfo;
1058	if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
1059		return;
1060
1061	printf("%-16s ", mp->mnt_stat.f_mntonname);
1062	if (fmi == NULL) {
1063		printf("(null)\n");
1064		return;
1065	}
1066	printf("owner %p ", fmi->fmi_owner);
1067	switch (fmi->fmi_state) {
1068	case FSTRANS_NORMAL:
1069		printf("state normal\n");
1070		break;
1071	case FSTRANS_SUSPENDING:
1072		printf("state suspending\n");
1073		break;
1074	case FSTRANS_SUSPENDED:
1075		printf("state suspended\n");
1076		break;
1077	default:
1078		printf("state %#x\n", fmi->fmi_state);
1079		break;
1080	}
1081}
1082
1083void
1084fstrans_dump(int full)
1085{
1086	const struct proclist_desc *pd;
1087	struct proc *p;
1088	struct lwp *l;
1089	struct mount *mp;
1090
1091	printf("Fstrans locks by lwp:\n");
1092	for (pd = proclists; pd->pd_list != NULL; pd++)
1093		PROCLIST_FOREACH(p, pd->pd_list)
1094			LIST_FOREACH(l, &p->p_lwps, l_sibling)
1095				fstrans_print_lwp(p, l, full == 1);
1096
1097	printf("Fstrans state by mount:\n");
1098	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
1099		fstrans_print_mount(mp, full == 1);
1100}
1101#endif /* defined(DDB) */
1102