vfs_trans.c revision 1.69
1/*	$NetBSD: vfs_trans.c,v 1.69 2022/10/26 23:39:43 riastradh Exp $	*/
2
3/*-
4 * Copyright (c) 2007, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Juergen Hannken-Illjes.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.69 2022/10/26 23:39:43 riastradh Exp $");
34
35/*
36 * File system transaction operations.
37 */
38
39#ifdef _KERNEL_OPT
40#include "opt_ddb.h"
41#endif
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/atomic.h>
46#include <sys/buf.h>
47#include <sys/hash.h>
48#include <sys/kmem.h>
49#include <sys/mount.h>
50#include <sys/pserialize.h>
51#include <sys/vnode.h>
52#include <sys/fstrans.h>
53#include <sys/proc.h>
54#include <sys/pool.h>
55
56#include <miscfs/deadfs/deadfs.h>
57#include <miscfs/specfs/specdev.h>
58
/* Element count handed to hashinit() for the mount info hash table. */
59#define FSTRANS_MOUNT_HASHSIZE	32
60
/*
 * Kind of transaction a thread asks for.  grant_lock() decides from the
 * mount's state whether a request of the given kind may proceed.
 */
61enum fstrans_lock_type {
62	FSTRANS_LAZY,			/* Granted while not suspended */
63	FSTRANS_SHARED			/* Granted while not suspending */
64};
65
/*
 * One registered copy-on-write handler, linked on the fmi_cow_handler
 * list of its mount info.
 */
66struct fscow_handler {
67	LIST_ENTRY(fscow_handler) ch_list;	/* Link on fmi_cow_handler. */
68	int (*ch_func)(void *, struct buf *, bool); /* Handler function. */
69	void *ch_arg;			/* First argument passed to ch_func. */
70};
/*
 * Per lwp, per mount transaction state.  Entries live on the owning
 * lwp's l_fstrans list (via fli_succ) and on the global
 * fstrans_fli_head list (via fli_list).
 */
71struct fstrans_lwp_info {
72	struct fstrans_lwp_info *fli_succ;	/* Next entry of this lwp. */
73	struct lwp *fli_self;		/* Owning lwp, NULL while unused. */
74	struct mount *fli_mount;	/* Mount, NULL while unused. */
75	struct fstrans_lwp_info *fli_alias; /* Lower mount's entry or NULL. */
76	struct fstrans_mount_info *fli_mountinfo; /* Referenced mount info. */
77	int fli_trans_cnt;		/* Transaction nesting depth. */
78	int fli_alias_cnt;		/* Entries whose fli_alias is this. */
79	int fli_cow_cnt;		/* fscow_run() nesting depth. */
80	enum fstrans_lock_type fli_lock_type; /* Type of held transaction. */
81	LIST_ENTRY(fstrans_lwp_info) fli_list;	/* Link on fstrans_fli_head. */
82};
/*
 * Per mount state.  Reference counted; fstrans_mount_dtor() frees it
 * when the last reference is dropped.
 */
83struct fstrans_mount_info {
84	enum fstrans_state fmi_state;	/* Current file system state. */
85	unsigned int fmi_ref_cnt;	/* Number of references held. */
86	bool fmi_gone;			/* Set by fstrans_unmount(). */
87	bool fmi_cow_change;		/* Cow handler list being changed. */
88	SLIST_ENTRY(fstrans_mount_info) fmi_hash; /* Hash chain link. */
89	LIST_HEAD(, fscow_handler) fmi_cow_handler; /* Cow handlers. */
90	struct mount *fmi_mount;	/* The mount this info is for. */
91	struct fstrans_mount_info *fmi_lower_info; /* Lower mount's info. */
92	struct lwp *fmi_owner;		/* Lwp that left FSTRANS_NORMAL. */
93};
/* Head type for one bucket of the mount info hash table. */
94SLIST_HEAD(fstrans_mount_hashhead, fstrans_mount_info);
95
96static kmutex_t vfs_suspend_lock	/* Serialize suspensions. */
97    __cacheline_aligned;
98static kmutex_t fstrans_lock		/* Fstrans big lock. */
99    __cacheline_aligned;
100static kcondvar_t fstrans_state_cv;	/* Fstrans or cow state changed. */
101static kcondvar_t fstrans_count_cv;	/* Fstrans or cow count changed. */
102static pserialize_t fstrans_psz;	/* Pserialize state. */
103static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
104					/* List of all fstrans_lwp_info. */
105static pool_cache_t fstrans_lwp_cache;	/* Cache of fstrans_lwp_info. */
106
107static u_long fstrans_mount_hashmask;	/* Mask filled in by hashinit(). */
108static struct fstrans_mount_hashhead *fstrans_mount_hashtab;
					/* Hash table of fstrans_mount_info. */
109static int fstrans_gone_count;		/* Number of fstrans_mount_info gone. */
110
/* Forward declarations of the file-local helpers defined below. */
111static inline uint32_t fstrans_mount_hash(struct mount *);
112static inline struct fstrans_mount_info *fstrans_mount_get(struct mount *);
113static void fstrans_mount_dtor(struct fstrans_mount_info *);
114static void fstrans_clear_lwp_info(void);
115static inline struct fstrans_lwp_info *
116    fstrans_get_lwp_info(struct mount *, bool);
117static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
118static int fstrans_lwp_pcc(void *, void *, int);
119static void fstrans_lwp_pcd(void *, void *);
120static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
121static bool grant_lock(const struct fstrans_mount_info *,
122    const enum fstrans_lock_type);
123static bool state_change_done(const struct fstrans_mount_info *);
124static bool cow_state_change_done(const struct fstrans_mount_info *);
125static void cow_change_enter(struct fstrans_mount_info *);
126static void cow_change_done(struct fstrans_mount_info *);
127
128/*
129 * Initialize.
130 */
131void
132fstrans_init(void)
133{
134
135	mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
136	mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
137	cv_init(&fstrans_state_cv, "fstchg");
138	cv_init(&fstrans_count_cv, "fstcnt");
139	fstrans_psz = pserialize_create();
140	LIST_INIT(&fstrans_fli_head);
141	fstrans_lwp_cache = pool_cache_init(sizeof(struct fstrans_lwp_info),
142	    coherency_unit, 0, 0, "fstlwp", NULL, IPL_NONE,
143	    fstrans_lwp_pcc, fstrans_lwp_pcd, NULL);
144	KASSERT(fstrans_lwp_cache != NULL);
145	fstrans_mount_hashtab = hashinit(FSTRANS_MOUNT_HASHSIZE, HASH_SLIST,
146	    true, &fstrans_mount_hashmask);
147}
148
149/*
150 * pool_cache constructor for fstrans_lwp_info.  Updating the global list
151 * produces cache misses on MP.  Minimise by keeping free entries on list.
152 */
153int
154fstrans_lwp_pcc(void *arg, void *obj, int flags)
155{
156	struct fstrans_lwp_info *fli = obj;
157
158	memset(fli, 0, sizeof(*fli));
159
160	mutex_enter(&fstrans_lock);
161	LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
162	mutex_exit(&fstrans_lock);
163
164	return 0;
165}
166
167/*
168 * pool_cache destructor
169 */
170void
171fstrans_lwp_pcd(void *arg, void *obj)
172{
173	struct fstrans_lwp_info *fli = obj;
174
175	mutex_enter(&fstrans_lock);
176	LIST_REMOVE(fli, fli_list);
177	mutex_exit(&fstrans_lock);
178}
179
180/*
181 * Deallocate lwp state.
182 */
183void
184fstrans_lwp_dtor(lwp_t *l)
185{
186	struct fstrans_lwp_info *fli, *fli_next;
187
188	if (l->l_fstrans == NULL)
189		return;
190
191	mutex_enter(&fstrans_lock);
192	for (fli = l->l_fstrans; fli; fli = fli_next) {
193		KASSERT(fli->fli_trans_cnt == 0);
194		KASSERT(fli->fli_cow_cnt == 0);
195		KASSERT(fli->fli_self == l);
196		if (fli->fli_mount != NULL)
197			fstrans_mount_dtor(fli->fli_mountinfo);
198		fli_next = fli->fli_succ;
199		fli->fli_alias_cnt = 0;
200		fli->fli_mount = NULL;
201		fli->fli_alias = NULL;
202		fli->fli_mountinfo = NULL;
203		fli->fli_self = NULL;
204	}
205	mutex_exit(&fstrans_lock);
206
207	for (fli = l->l_fstrans; fli; fli = fli_next) {
208		fli_next = fli->fli_succ;
209		pool_cache_put(fstrans_lwp_cache, fli);
210	}
211	l->l_fstrans = NULL;
212}
213
214/*
215 * mount pointer to hash
216 */
217static inline uint32_t
218fstrans_mount_hash(struct mount *mp)
219{
220
221	return hash32_buf(&mp, sizeof(mp), HASH32_BUF_INIT) &
222	    fstrans_mount_hashmask;
223}
224
225/*
226 * retrieve fstrans_mount_info by mount or NULL
227 */
228static inline struct fstrans_mount_info *
229fstrans_mount_get(struct mount *mp)
230{
231	uint32_t indx;
232	struct fstrans_mount_info *fmi, *fmi_lower;
233
234	KASSERT(mutex_owned(&fstrans_lock));
235
236	indx = fstrans_mount_hash(mp);
237	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash) {
238		if (fmi->fmi_mount == mp) {
239			if (__predict_false(mp->mnt_lower != NULL &&
240			    fmi->fmi_lower_info == NULL)) {
241				/*
242				 * Intern the lower/lowest mount into
243				 * this mount info on first lookup.
244				 */
245				KASSERT(fmi->fmi_ref_cnt == 1);
246
247				fmi_lower = fstrans_mount_get(mp->mnt_lower);
248				if (fmi_lower && fmi_lower->fmi_lower_info)
249					fmi_lower = fmi_lower->fmi_lower_info;
250				if (fmi_lower == NULL)
251					return NULL;
252				fmi->fmi_lower_info = fmi_lower;
253				fmi->fmi_lower_info->fmi_ref_cnt += 1;
254			}
255			return fmi;
256		}
257	}
258
259	return NULL;
260}
261
262/*
263 * Dereference mount state.
264 */
265static void
266fstrans_mount_dtor(struct fstrans_mount_info *fmi)
267{
268
269	KASSERT(mutex_owned(&fstrans_lock));
270
271	KASSERT(fmi != NULL);
272	fmi->fmi_ref_cnt -= 1;
273	if (__predict_true(fmi->fmi_ref_cnt > 0)) {
274		return;
275	}
276
277	KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
278	KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
279	KASSERT(fmi->fmi_owner == NULL);
280
281	if (fmi->fmi_lower_info)
282		fstrans_mount_dtor(fmi->fmi_lower_info);
283
284	KASSERT(fstrans_gone_count > 0);
285	fstrans_gone_count -= 1;
286
287	kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
288	kmem_free(fmi, sizeof(*fmi));
289}
290
291/*
292 * Allocate mount state.
293 */
294int
295fstrans_mount(struct mount *mp)
296{
297	uint32_t indx;
298	struct fstrans_mount_info *newfmi;
299
300	indx = fstrans_mount_hash(mp);
301
302	newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
303	newfmi->fmi_state = FSTRANS_NORMAL;
304	newfmi->fmi_ref_cnt = 1;
305	newfmi->fmi_gone = false;
306	LIST_INIT(&newfmi->fmi_cow_handler);
307	newfmi->fmi_cow_change = false;
308	newfmi->fmi_mount = mp;
309	newfmi->fmi_lower_info = NULL;
310	newfmi->fmi_owner = NULL;
311
312	mutex_enter(&fstrans_lock);
313	SLIST_INSERT_HEAD(&fstrans_mount_hashtab[indx], newfmi, fmi_hash);
314	mutex_exit(&fstrans_lock);
315
316	return 0;
317}
318
319/*
320 * Deallocate mount state.
321 */
322void
323fstrans_unmount(struct mount *mp)
324{
325	uint32_t indx;
326	struct fstrans_mount_info *fmi;
327
328	indx = fstrans_mount_hash(mp);
329
330	mutex_enter(&fstrans_lock);
331	fmi = fstrans_mount_get(mp);
332	KASSERT(fmi != NULL);
333	fmi->fmi_gone = true;
334	SLIST_REMOVE(&fstrans_mount_hashtab[indx],
335	    fmi, fstrans_mount_info, fmi_hash);
336	fstrans_gone_count += 1;
337	fstrans_mount_dtor(fmi);
338	mutex_exit(&fstrans_lock);
339}
340
341/*
342 * Clear mount entries whose mount is gone.
343 */
344static void
345fstrans_clear_lwp_info(void)
346{
347	struct fstrans_lwp_info **p, *fli, *tofree = NULL;
348
349	/*
350	 * Scan our list clearing entries whose mount is gone.
351	 */
352	mutex_enter(&fstrans_lock);
353	for (p = &curlwp->l_fstrans; *p; ) {
354		fli = *p;
355		if (fli->fli_mount != NULL &&
356		    fli->fli_mountinfo->fmi_gone &&
357		    fli->fli_trans_cnt == 0 &&
358		    fli->fli_cow_cnt == 0 &&
359		    fli->fli_alias_cnt == 0) {
360			*p = (*p)->fli_succ;
361			fstrans_mount_dtor(fli->fli_mountinfo);
362			if (fli->fli_alias) {
363				KASSERT(fli->fli_alias->fli_alias_cnt > 0);
364				fli->fli_alias->fli_alias_cnt--;
365			}
366			fli->fli_mount = NULL;
367			fli->fli_alias = NULL;
368			fli->fli_mountinfo = NULL;
369			fli->fli_self = NULL;
370			p = &curlwp->l_fstrans;
371			fli->fli_succ = tofree;
372			tofree = fli;
373		} else {
374			p = &(*p)->fli_succ;
375		}
376	}
377#ifdef DIAGNOSTIC
378	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
379		if (fli->fli_alias != NULL)
380			KASSERT(fli->fli_alias->fli_self == curlwp);
381#endif /* DIAGNOSTIC */
382	mutex_exit(&fstrans_lock);
383
384	while (tofree != NULL) {
385		fli = tofree;
386		tofree = fli->fli_succ;
387		pool_cache_put(fstrans_lwp_cache, fli);
388	}
389}
390
391/*
392 * Allocate and return per lwp info for this mount.
393 */
394static struct fstrans_lwp_info *
395fstrans_alloc_lwp_info(struct mount *mp)
396{
397	struct fstrans_lwp_info *fli, *fli_lower;
398	struct fstrans_mount_info *fmi;
399
400	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
401		if (fli->fli_mount == mp)
402			return fli;
403	}
404
405	/*
406	 * Lookup mount info and get lower mount per lwp info.
407	 */
408	mutex_enter(&fstrans_lock);
409	fmi = fstrans_mount_get(mp);
410	if (fmi == NULL) {
411		mutex_exit(&fstrans_lock);
412		return NULL;
413	}
414	fmi->fmi_ref_cnt += 1;
415	mutex_exit(&fstrans_lock);
416
417	if (fmi->fmi_lower_info) {
418		fli_lower =
419		    fstrans_alloc_lwp_info(fmi->fmi_lower_info->fmi_mount);
420		if (fli_lower == NULL) {
421			mutex_enter(&fstrans_lock);
422			fstrans_mount_dtor(fmi);
423			mutex_exit(&fstrans_lock);
424
425			return NULL;
426		}
427	} else {
428		fli_lower = NULL;
429	}
430
431	/*
432	 * Allocate a new entry.
433	 */
434	fli = pool_cache_get(fstrans_lwp_cache, PR_WAITOK);
435	KASSERT(fli->fli_trans_cnt == 0);
436	KASSERT(fli->fli_cow_cnt == 0);
437	KASSERT(fli->fli_alias_cnt == 0);
438	KASSERT(fli->fli_mount == NULL);
439	KASSERT(fli->fli_alias == NULL);
440	KASSERT(fli->fli_mountinfo == NULL);
441	KASSERT(fli->fli_self == NULL);
442
443	/*
444	 * Attach the mount info and alias.
445	 */
446
447	fli->fli_self = curlwp;
448	fli->fli_mount = mp;
449	fli->fli_mountinfo = fmi;
450
451	fli->fli_succ = curlwp->l_fstrans;
452	curlwp->l_fstrans = fli;
453
454	if (fli_lower) {
455		fli->fli_alias = fli_lower;
456		fli->fli_alias->fli_alias_cnt++;
457		fli = fli->fli_alias;
458	}
459
460	return fli;
461}
462
463/*
464 * Retrieve the per lwp info for this mount allocating if necessary.
465 */
466static inline struct fstrans_lwp_info *
467fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
468{
469	struct fstrans_lwp_info *fli;
470
471	/*
472	 * Scan our list for a match.
473	 */
474	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
475		if (fli->fli_mount == mp) {
476			KASSERT((mp->mnt_lower == NULL) ==
477			    (fli->fli_alias == NULL));
478			if (fli->fli_alias != NULL)
479				fli = fli->fli_alias;
480			break;
481		}
482	}
483
484	if (do_alloc) {
485		if (__predict_false(fli == NULL))
486			fli = fstrans_alloc_lwp_info(mp);
487	}
488
489	return fli;
490}
491
492/*
493 * Check if this lock type is granted at this state.
494 */
495static bool
496grant_lock(const struct fstrans_mount_info *fmi,
497    const enum fstrans_lock_type type)
498{
499
500	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL))
501		return true;
502	if (fmi->fmi_owner == curlwp)
503		return true;
504	if  (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
505		return true;
506
507	return false;
508}
509
510/*
511 * Start a transaction.  If this thread already has a transaction on this
512 * file system increment the reference counter.
513 */
514static inline int
515_fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
516{
517	int s;
518	struct fstrans_lwp_info *fli;
519	struct fstrans_mount_info *fmi;
520
521	ASSERT_SLEEPABLE();
522
523	fli = fstrans_get_lwp_info(mp, true);
524	if (fli == NULL)
525		return 0;
526	fmi = fli->fli_mountinfo;
527
528	if (fli->fli_trans_cnt > 0) {
529		fli->fli_trans_cnt += 1;
530
531		return 0;
532	}
533
534	s = pserialize_read_enter();
535	if (__predict_true(grant_lock(fmi, lock_type))) {
536		fli->fli_trans_cnt = 1;
537		fli->fli_lock_type = lock_type;
538		pserialize_read_exit(s);
539
540		return 0;
541	}
542	pserialize_read_exit(s);
543
544	if (! wait)
545		return EBUSY;
546
547	mutex_enter(&fstrans_lock);
548	while (! grant_lock(fmi, lock_type))
549		cv_wait(&fstrans_state_cv, &fstrans_lock);
550	fli->fli_trans_cnt = 1;
551	fli->fli_lock_type = lock_type;
552	mutex_exit(&fstrans_lock);
553
554	return 0;
555}
556
557void
558fstrans_start(struct mount *mp)
559{
560	int error __diagused;
561
562	error = _fstrans_start(mp, FSTRANS_SHARED, 1);
563	KASSERT(error == 0);
564}
565
566int
567fstrans_start_nowait(struct mount *mp)
568{
569
570	return _fstrans_start(mp, FSTRANS_SHARED, 0);
571}
572
573void
574fstrans_start_lazy(struct mount *mp)
575{
576	int error __diagused;
577
578	error = _fstrans_start(mp, FSTRANS_LAZY, 1);
579	KASSERT(error == 0);
580}
581
582/*
583 * Finish a transaction.
584 */
585void
586fstrans_done(struct mount *mp)
587{
588	int s;
589	struct fstrans_lwp_info *fli;
590	struct fstrans_mount_info *fmi;
591
592	fli = fstrans_get_lwp_info(mp, false);
593	if (fli == NULL)
594		return;
595	fmi = fli->fli_mountinfo;
596	KASSERT(fli->fli_trans_cnt > 0);
597
598	if (fli->fli_trans_cnt > 1) {
599		fli->fli_trans_cnt -= 1;
600
601		return;
602	}
603
604	if (__predict_false(fstrans_gone_count > 0))
605		fstrans_clear_lwp_info();
606
607	s = pserialize_read_enter();
608	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
609		fli->fli_trans_cnt = 0;
610		pserialize_read_exit(s);
611
612		return;
613	}
614	pserialize_read_exit(s);
615
616	mutex_enter(&fstrans_lock);
617	fli->fli_trans_cnt = 0;
618	cv_signal(&fstrans_count_cv);
619	mutex_exit(&fstrans_lock);
620}
621
622/*
623 * Check if we hold an lock.
624 */
625int
626fstrans_held(struct mount *mp)
627{
628	struct fstrans_lwp_info *fli;
629	struct fstrans_mount_info *fmi;
630
631	KASSERT(mp != dead_rootmount);
632
633	fli = fstrans_get_lwp_info(mp, false);
634	if (fli == NULL)
635		return 0;
636	fmi = fli->fli_mountinfo;
637
638	return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp);
639}
640
641/*
642 * Check if this thread has an exclusive lock.
643 */
644int
645fstrans_is_owner(struct mount *mp)
646{
647	struct fstrans_lwp_info *fli;
648	struct fstrans_mount_info *fmi;
649
650	KASSERT(mp != dead_rootmount);
651
652	fli = fstrans_get_lwp_info(mp, false);
653	if (fli == NULL)
654		return 0;
655	fmi = fli->fli_mountinfo;
656
657	return (fmi->fmi_owner == curlwp);
658}
659
660/*
661 * True, if no thread is in a transaction not granted at the current state.
662 */
663static bool
664state_change_done(const struct fstrans_mount_info *fmi)
665{
666	struct fstrans_lwp_info *fli;
667
668	KASSERT(mutex_owned(&fstrans_lock));
669
670	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
671		if (fli->fli_mountinfo != fmi)
672			continue;
673		if (fli->fli_trans_cnt == 0)
674			continue;
675		if (fli->fli_self == curlwp)
676			continue;
677		if (grant_lock(fmi, fli->fli_lock_type))
678			continue;
679
680		return false;
681	}
682
683	return true;
684}
685
686/*
687 * Set new file system state.
688 */
689int
690fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
691{
692	int error;
693	enum fstrans_state old_state;
694	struct fstrans_lwp_info *fli;
695	struct fstrans_mount_info *fmi;
696
697	KASSERT(mp != dead_rootmount);
698
699	fli = fstrans_get_lwp_info(mp, true);
700	if (fli == NULL)
701		return ENOENT;
702	fmi = fli->fli_mountinfo;
703	old_state = fmi->fmi_state;
704	if (old_state == new_state)
705		return 0;
706
707	mutex_enter(&fstrans_lock);
708	fmi->fmi_state = new_state;
709	pserialize_perform(fstrans_psz);
710
711	/*
712	 * All threads see the new state now.
713	 * Wait for transactions invalid at this state to leave.
714	 */
715	error = 0;
716	while (! state_change_done(fmi)) {
717		error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
718		if (error) {
719			new_state = fmi->fmi_state = FSTRANS_NORMAL;
720			break;
721		}
722	}
723	if (old_state != new_state) {
724		if (old_state == FSTRANS_NORMAL) {
725			KASSERT(fmi->fmi_owner == NULL);
726			fmi->fmi_owner = curlwp;
727		}
728		if (new_state == FSTRANS_NORMAL) {
729			KASSERT(fmi->fmi_owner == curlwp);
730			fmi->fmi_owner = NULL;
731		}
732	}
733	cv_broadcast(&fstrans_state_cv);
734	mutex_exit(&fstrans_lock);
735
736	return error;
737}
738
739/*
740 * Get current file system state.
741 */
742enum fstrans_state
743fstrans_getstate(struct mount *mp)
744{
745	struct fstrans_lwp_info *fli;
746	struct fstrans_mount_info *fmi;
747
748	KASSERT(mp != dead_rootmount);
749
750	fli = fstrans_get_lwp_info(mp, true);
751	KASSERT(fli != NULL);
752	fmi = fli->fli_mountinfo;
753
754	return fmi->fmi_state;
755}
756
757/*
758 * Request a filesystem to suspend all operations.
759 */
760int
761vfs_suspend(struct mount *mp, int nowait)
762{
763	struct fstrans_lwp_info *fli;
764	int error;
765
766	if (mp == dead_rootmount)
767		return EOPNOTSUPP;
768
769	fli = fstrans_get_lwp_info(mp, true);
770	if (fli == NULL)
771		return ENOENT;
772
773	if (nowait) {
774		if (!mutex_tryenter(&vfs_suspend_lock))
775			return EWOULDBLOCK;
776	} else
777		mutex_enter(&vfs_suspend_lock);
778
779	if ((error = VFS_SUSPENDCTL(fli->fli_mount, SUSPEND_SUSPEND)) != 0) {
780		mutex_exit(&vfs_suspend_lock);
781		return error;
782	}
783
784	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
785		vfs_resume(mp);
786		return ENOENT;
787	}
788
789	return 0;
790}
791
792/*
793 * Request a filesystem to resume all operations.
794 */
795void
796vfs_resume(struct mount *mp)
797{
798	struct fstrans_lwp_info *fli;
799
800	KASSERT(mp != dead_rootmount);
801
802	fli = fstrans_get_lwp_info(mp, false);
803	mp = fli->fli_mount;
804
805	VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
806	mutex_exit(&vfs_suspend_lock);
807}
808
809
810/*
811 * True, if no thread is running a cow handler.
812 */
813static bool
814cow_state_change_done(const struct fstrans_mount_info *fmi)
815{
816	struct fstrans_lwp_info *fli;
817
818	KASSERT(mutex_owned(&fstrans_lock));
819	KASSERT(fmi->fmi_cow_change);
820
821	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
822		if (fli->fli_mount != fmi->fmi_mount)
823			continue;
824		if (fli->fli_cow_cnt == 0)
825			continue;
826
827		return false;
828	}
829
830	return true;
831}
832
833/*
834 * Prepare for changing this mounts cow list.
835 * Returns with fstrans_lock locked.
836 */
837static void
838cow_change_enter(struct fstrans_mount_info *fmi)
839{
840
841	mutex_enter(&fstrans_lock);
842
843	/*
844	 * Wait for other threads changing the list.
845	 */
846	while (fmi->fmi_cow_change)
847		cv_wait(&fstrans_state_cv, &fstrans_lock);
848
849	/*
850	 * Wait until all threads are aware of a state change.
851	 */
852	fmi->fmi_cow_change = true;
853	pserialize_perform(fstrans_psz);
854
855	while (! cow_state_change_done(fmi))
856		cv_wait(&fstrans_count_cv, &fstrans_lock);
857}
858
859/*
860 * Done changing this mounts cow list.
861 */
862static void
863cow_change_done(struct fstrans_mount_info *fmi)
864{
865
866	KASSERT(mutex_owned(&fstrans_lock));
867
868	fmi->fmi_cow_change = false;
869	pserialize_perform(fstrans_psz);
870
871	cv_broadcast(&fstrans_state_cv);
872
873	mutex_exit(&fstrans_lock);
874}
875
876/*
877 * Add a handler to this mount.
878 */
879int
880fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
881    void *arg)
882{
883	struct fstrans_mount_info *fmi;
884	struct fscow_handler *newch;
885
886	KASSERT(mp != dead_rootmount);
887
888	mutex_enter(&fstrans_lock);
889	fmi = fstrans_mount_get(mp);
890	KASSERT(fmi != NULL);
891	fmi->fmi_ref_cnt += 1;
892	mutex_exit(&fstrans_lock);
893
894	newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
895	newch->ch_func = func;
896	newch->ch_arg = arg;
897
898	cow_change_enter(fmi);
899	LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
900	cow_change_done(fmi);
901
902	return 0;
903}
904
905/*
906 * Remove a handler from this mount.
907 */
908int
909fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
910    void *arg)
911{
912	struct fstrans_mount_info *fmi;
913	struct fscow_handler *hp = NULL;
914
915	KASSERT(mp != dead_rootmount);
916
917	mutex_enter(&fstrans_lock);
918	fmi = fstrans_mount_get(mp);
919	KASSERT(fmi != NULL);
920	mutex_exit(&fstrans_lock);
921
922	cow_change_enter(fmi);
923	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
924		if (hp->ch_func == func && hp->ch_arg == arg)
925			break;
926	if (hp != NULL) {
927		LIST_REMOVE(hp, ch_list);
928		kmem_free(hp, sizeof(*hp));
929	}
930	fstrans_mount_dtor(fmi);
931	cow_change_done(fmi);
932
933	return hp ? 0 : EINVAL;
934}
935
936/*
937 * Check for need to copy block that is about to be written.
938 */
939int
940fscow_run(struct buf *bp, bool data_valid)
941{
942	int error, s;
943	struct mount *mp;
944	struct fstrans_lwp_info *fli;
945	struct fstrans_mount_info *fmi;
946	struct fscow_handler *hp;
947
948	/*
949	 * First check if we need run the copy-on-write handler.
950	 */
951	if ((bp->b_flags & B_COWDONE))
952		return 0;
953	if (bp->b_vp == NULL) {
954		bp->b_flags |= B_COWDONE;
955		return 0;
956	}
957	if (bp->b_vp->v_type == VBLK)
958		mp = spec_node_getmountedfs(bp->b_vp);
959	else
960		mp = bp->b_vp->v_mount;
961	if (mp == NULL || mp == dead_rootmount) {
962		bp->b_flags |= B_COWDONE;
963		return 0;
964	}
965
966	fli = fstrans_get_lwp_info(mp, true);
967	KASSERT(fli != NULL);
968	fmi = fli->fli_mountinfo;
969
970	/*
971	 * On non-recursed run check if other threads
972	 * want to change the list.
973	 */
974	if (fli->fli_cow_cnt == 0) {
975		s = pserialize_read_enter();
976		if (__predict_false(fmi->fmi_cow_change)) {
977			pserialize_read_exit(s);
978			mutex_enter(&fstrans_lock);
979			while (fmi->fmi_cow_change)
980				cv_wait(&fstrans_state_cv, &fstrans_lock);
981			fli->fli_cow_cnt = 1;
982			mutex_exit(&fstrans_lock);
983		} else {
984			fli->fli_cow_cnt = 1;
985			pserialize_read_exit(s);
986		}
987	} else
988		fli->fli_cow_cnt += 1;
989
990	/*
991	 * Run all copy-on-write handlers, stop on error.
992	 */
993	error = 0;
994	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
995		if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
996			break;
997 	if (error == 0)
998 		bp->b_flags |= B_COWDONE;
999
1000	/*
1001	 * Check if other threads want to change the list.
1002	 */
1003	if (fli->fli_cow_cnt > 1) {
1004		fli->fli_cow_cnt -= 1;
1005	} else {
1006		s = pserialize_read_enter();
1007		if (__predict_false(fmi->fmi_cow_change)) {
1008			pserialize_read_exit(s);
1009			mutex_enter(&fstrans_lock);
1010			fli->fli_cow_cnt = 0;
1011			cv_signal(&fstrans_count_cv);
1012			mutex_exit(&fstrans_lock);
1013		} else {
1014			fli->fli_cow_cnt = 0;
1015			pserialize_read_exit(s);
1016		}
1017	}
1018
1019	return error;
1020}
1021
1022#if defined(DDB)
1023void fstrans_dump(int);
1024
1025static void
1026fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
1027{
1028	char prefix[9];
1029	struct fstrans_lwp_info *fli;
1030
1031	snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
1032	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
1033		if (fli->fli_self != l)
1034			continue;
1035		if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
1036			if (! verbose)
1037				continue;
1038		}
1039		printf("%-8s", prefix);
1040		if (verbose)
1041			printf(" @%p", fli);
1042		if (fli->fli_mount == dead_rootmount)
1043			printf(" <dead>");
1044		else if (fli->fli_mount != NULL)
1045			printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
1046		else
1047			printf(" NULL");
1048		if (fli->fli_alias != NULL) {
1049			struct mount *amp = fli->fli_alias->fli_mount;
1050
1051			printf(" alias");
1052			if (verbose)
1053				printf(" @%p", fli->fli_alias);
1054			if (amp == NULL)
1055				printf(" NULL");
1056			else
1057				printf(" (%s)", amp->mnt_stat.f_mntonname);
1058		}
1059		if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
1060			printf(" gone");
1061		if (fli->fli_trans_cnt == 0) {
1062			printf(" -");
1063		} else {
1064			switch (fli->fli_lock_type) {
1065			case FSTRANS_LAZY:
1066				printf(" lazy");
1067				break;
1068			case FSTRANS_SHARED:
1069				printf(" shared");
1070				break;
1071			default:
1072				printf(" %#x", fli->fli_lock_type);
1073				break;
1074			}
1075		}
1076		printf(" %d cow %d alias %d\n",
1077		    fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
1078		prefix[0] = '\0';
1079	}
1080}
1081
1082static void
1083fstrans_print_mount(struct mount *mp, int verbose)
1084{
1085	uint32_t indx;
1086	struct fstrans_mount_info *fmi;
1087
1088	indx = fstrans_mount_hash(mp);
1089	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash)
1090		if (fmi->fmi_mount == mp)
1091			break;
1092
1093	if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
1094		return;
1095
1096	printf("%-16s ", mp->mnt_stat.f_mntonname);
1097	if (fmi == NULL) {
1098		printf("(null)\n");
1099		return;
1100	}
1101	printf("owner %p ", fmi->fmi_owner);
1102	switch (fmi->fmi_state) {
1103	case FSTRANS_NORMAL:
1104		printf("state normal\n");
1105		break;
1106	case FSTRANS_SUSPENDING:
1107		printf("state suspending\n");
1108		break;
1109	case FSTRANS_SUSPENDED:
1110		printf("state suspended\n");
1111		break;
1112	default:
1113		printf("state %#x\n", fmi->fmi_state);
1114		break;
1115	}
1116}
1117
1118void
1119fstrans_dump(int full)
1120{
1121	const struct proclist_desc *pd;
1122	struct proc *p;
1123	struct lwp *l;
1124	struct mount *mp;
1125
1126	printf("Fstrans locks by lwp:\n");
1127	for (pd = proclists; pd->pd_list != NULL; pd++)
1128		PROCLIST_FOREACH(p, pd->pd_list)
1129			LIST_FOREACH(l, &p->p_lwps, l_sibling)
1130				fstrans_print_lwp(p, l, full == 1);
1131
1132	printf("Fstrans state by mount:\n");
1133	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
1134		fstrans_print_mount(mp, full == 1);
1135}
1136#endif /* defined(DDB) */
1137