/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/acct.h>
#include <sys/dnlc.h>
#include <sys/swap.h>

#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_mount.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_quota.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>
#include <sys/sysinfo.h>

#include <vm/hat.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/anon.h>
#include <sys/swap.h>
#include <sys/dnlc.h>

extern struct vnode *common_specvp(struct vnode *vp);

/* error lock status */
#define	UN_ERRLCK	(-1)
#define	SET_ERRLCK	1
#define	RE_ERRLCK	2
#define	NO_ERRLCK	0

/*
 * Index to be used in TSD for storing lockfs data
 */
uint_t ufs_lockfs_key;

typedef struct _ulockfs_info {
	struct _ulockfs_info *next;
	struct ulockfs *ulp;
	uint_t flags;
} ulockfs_info_t;

#define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */

/*
 * Check in TSD whether we are already doing any VOP on this filesystem
 */
#define	IS_REC_VOP(found, head, ulp, free)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (found = 0, free = NULL, _curr = head;	\
	    _curr != NULL; _curr = _curr->next) {	\
		if ((free == NULL) &&			\
		    (_curr->ulp == NULL))		\
			free = _curr;			\
		if (_curr->ulp == ulp) {		\
			found = 1;			\
			break;				\
		}					\
	}						\
}
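
/*
 * Illustrative sketch (not additional kernel code): the macro above is
 * driven from the per-thread TSD list roughly as ufs_lockfs_begin() does
 * below; variable names follow that caller:
 *
 *	int rec_vop;
 *	ulockfs_info_t *head, *free_slot;
 *
 *	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
 *	IS_REC_VOP(rec_vop, head, ulp, free_slot);
 *	if (rec_vop)
 *		... recursive VOP; skip the lockfs protocol ...
 */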

/*
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly
 */
#define	SEARCH_ULOCKFSP(head, ulp, info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = head; _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == ulp) {		\
			break;				\
		}					\
	}						\
							\
	info = _curr;					\
}

/*
 * Validate lockfs request
 */
static int
ufs_getlfd(
	struct lockfs *lockfsp,		/* new lock request */
	struct lockfs *ul_lockfsp)	/* old lock state */
{
	int	error = 0;

	/*
	 * no input flags defined
	 */
	if (lockfsp->lf_flags != 0) {
		error = EINVAL;
		goto errout;
	}

	/*
	 * check key
	 */
	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
			error = EINVAL;
			goto errout;
		}

	lockfsp->lf_key = ul_lockfsp->lf_key + 1;

errout:
	return (error);
}

/*
 * ufs_checkaccton
 *	check if accounting is turned on for this fs
 */

int
ufs_checkaccton(struct vnode *vp)
{
	if (acct_fs_in_use(vp))
		return (EDEADLK);
	return (0);
}

/*
 * ufs_checkswapon
 *	check if local swapping is to a file on this fs
 */
int
ufs_checkswapon(struct vnode *vp)
{
	struct swapinfo	*sip;

	mutex_enter(&swapinfo_lock);
	for (sip = swapinfo; sip; sip = sip->si_next)
		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
			mutex_exit(&swapinfo_lock);
			return (EDEADLK);
		}
	mutex_exit(&swapinfo_lock);
	return (0);
}

/*
 * ufs_freeze
 *	pend future accesses for current lock and desired lock
 */
void
ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
{
	/*
	 * set to new lock type
	 */
	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;

	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
}
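
/*
 * For example (a sketch, assuming the usual ULOCKFS_* bit definitions in
 * sys/fs/ufs_lockfs.h mirror the LOCKFS_* lock numbers): a request with
 * lf_lock == LOCKFS_WLOCK leaves ul_fs_lock == (1 << LOCKFS_WLOCK), which
 * is exactly the bit that predicates such as ULOCKFS_IS_WLOCK() test.
 */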

/*
 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
 * starting the ufs_quiesce() protocol and decrement it only when the file
 * system no longer has to be in the quiescent state. This allows
 * ufs_pageio() to detect that another thread wants to quiesce a file
 * system. See more comments in ufs_pageio().
 */
ulong_t ufs_quiesce_pend = 0;
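
/*
 * A minimal sketch of the caller protocol described above, in the shape
 * used by ufs__fiolfs() later in this file (error handling elided):
 *
 *	mutex_enter(&ulp->ul_lock);
 *	atomic_add_long(&ufs_quiesce_pend, 1);
 *	...freeze the fs, then...
 *	error = ufs_quiesce(ulp);
 *	...
 *	atomic_add_long(&ufs_quiesce_pend, -1);
 *	mutex_exit(&ulp->ul_lock);
 */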

/*
 * ufs_quiesce
 *	wait for outstanding accesses to finish
 */
int
ufs_quiesce(struct ulockfs *ulp)
{
	int error = 0;
	ulockfs_info_t *head;
	ulockfs_info_t *info;
	klwp_t *lwp = ttolwp(curthread);

	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * We have to keep /proc away from stopping us after we applied
	 * the softlock but before we got a chance to clear it again.
	 * prstop() may pagefault and become stuck on the softlock still
	 * pending.
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	/*
	 * Set a softlock to suspend future ufs_vnops so that
	 * this lockfs request will not be starved
	 */
	ULOCKFS_SET_SLOCK(ulp);
	ASSERT(ufs_quiesce_pend);

	/* check if there are any outstanding ufs vnodeop calls */
	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
		/*
		 * use the timed version of cv_wait_sig() to make sure we
		 * don't miss a wake-up call from ufs_pageio() when it doesn't
		 * use ul_lock.
		 *
		 * when a fallocate thread comes in, the only way it returns
		 * from this function is if there are no other vnode operations
		 * going on (remember fallocate threads are tracked using
		 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
		 * hasn't already grabbed the fs write lock.
		 */
		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
				goto out;
		}
		if (!cv_reltimedwait_sig(&ulp->ul_cv, &ulp->ul_lock, hz,
		    TR_CLOCK_TICK)) {
			error = EINTR;
			goto out;
		}
	}

out:
	/*
	 * unlock the soft lock
	 */
	ULOCKFS_CLR_SLOCK(ulp);

	if (lwp != NULL)
		lwp->lwp_nostop--;

	return (error);
}

/*
 * ufs_flush_inode
 */
int
ufs_flush_inode(struct inode *ip, void *arg)
{
	int	error;
	int	saverror	= 0;

	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * asynchronously push all the dirty pages
	 */
	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
	    (error != EAGAIN))
		saverror = error;
	/*
	 * wait for io and discard all mappings
	 */
	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
		saverror = error;

	if (ITOV(ip)->v_type == VDIR) {
		dnlc_dir_purge(&ip->i_danchor);
	}

	return (saverror);
}

/*
 * ufs_flush
 *	Flush everything that is currently dirty; this includes invalidating
 *	any mappings.
 */
int
ufs_flush(struct vfs *vfsp)
{
	int		error;
	int		saverror = 0;
	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
	struct fs	*fs		= ufsvfsp->vfs_fs;
	int		tdontblock = 0;

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * purge dnlc
	 */
	(void) dnlc_purge_vfsp(vfsp, 0);

	/*
	 * drain the delete and idle threads
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * flush and invalidate quota records
	 */
	(void) qsync(ufsvfsp);

	/*
	 * flush w/invalidate the inodes for vfsp
	 */
	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
		saverror = error;

	/*
	 * synchronously flush superblock and summary info
	 */
	if (fs->fs_ronly == 0 && fs->fs_fmod) {
		fs->fs_fmod = 0;
		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
	}
	/*
	 * flush w/invalidate block device pages and buf cache
	 */
	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
	    (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0)
		saverror = error;

	(void) bflush((dev_t)vfsp->vfs_dev);
	(void) bfinval((dev_t)vfsp->vfs_dev, 0);

	/*
	 * drain the delete and idle threads again
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * play with the clean flag
	 */
	if (saverror == 0)
		ufs_checkclean(vfsp);

	/*
	 * Flush any outstanding transactions and roll the log only if we are
	 * supposed to, i.e. LDL_NOROLL is not set. We cannot simply check
	 * fs_ronly here since fsck may also use this code to roll the log on
	 * a read-only filesystem, e.g. root during early stages of boot; if
	 * it does more than a sanity check, it will have cleared LDL_NOROLL
	 * beforehand. In addition we assert that the deltamap does not
	 * contain any deltas in case LDL_NOROLL is set since this is not
	 * supposed to happen.
	 */
	if (TRANS_ISTRANS(ufsvfsp)) {
		ml_unit_t	*ul	= ufsvfsp->vfs_log;
		mt_map_t	*mtm	= ul->un_deltamap;

		if (ul->un_flags & LDL_NOROLL) {
			ASSERT(mtm->mtm_nme == 0);
		} else {
			/*
			 * Do not set T_DONTBLOCK if there is a
			 * transaction opened by the caller.
			 */
			if (curthread->t_flag & T_DONTBLOCK)
				tdontblock = 1;
			else
				curthread->t_flag |= T_DONTBLOCK;

			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
			    TOP_COMMIT_SIZE, error);

			if (!error) {
				TRANS_END_SYNC(ufsvfsp, saverror,
				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
			}

			if (tdontblock == 0)
				curthread->t_flag &= ~T_DONTBLOCK;

			logmap_roll_dev(ufsvfsp->vfs_log);
		}
	}

	return (saverror);
}

/*
 * ufs_thaw_wlock
 *	special processing when thawing down to wlock
 */
static int
ufs_thaw_wlock(struct inode *ip, void *arg)
{
	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * iupdat refuses to clear flags if the fs is read only.  The fs
	 * may become read/write during the lock and we wouldn't want
	 * these inodes being written to disk.  So clear the flags.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
	rw_exit(&ip->i_contents);

	/*
	 * pages are mlocked -- fail wlock
	 */
	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
		return (EBUSY);

	return (0);
}

/*
 * ufs_thaw_hlock
 *	special processing when thawing down to hlock or elock
 */
static int
ufs_thaw_hlock(struct inode *ip, void *arg)
{
	struct vnode	*vp	= ITOV(ip);

	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * blow away all pages - even if they are mlocked
	 */
	do {
		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
	rw_enter(&ip->i_contents, RW_WRITER);
	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * ufs_thaw
 *	thaw file system lock down to current value
 */
int
ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
{
	int		error	= 0;
	int		noidel	= (int)(ulp->ul_flag & ULOCKFS_NOIDEL);

	/*
	 * if wlock or hlock or elock
	 */
	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
	    ULOCKFS_IS_ELOCK(ulp)) {

		/*
		 * don't keep access times
		 * don't free deleted files
		 * if superblock writes are allowed, limit them to me for now
		 */
		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		if (ulp->ul_sbowner != (kthread_id_t)-1)
			ulp->ul_sbowner = curthread;

		/*
		 * wait for writes for deleted files and superblock updates
		 */
		(void) ufs_flush(vfsp);

		/*
		 * now make sure the quota file is up-to-date
		 *	expensive; but effective
		 */
		error = ufs_flush(vfsp);
		/*
		 * no one can write the superblock
		 */
		ulp->ul_sbowner = (kthread_id_t)-1;

		/*
		 * special processing for wlock/hlock/elock
		 */
		if (ULOCKFS_IS_WLOCK(ulp)) {
			if (error)
				goto errout;
			error = bfinval(ufsvfsp->vfs_dev, 0);
			if (error)
				goto errout;
			error = ufs_scan_inodes(0, ufs_thaw_wlock,
			    (void *)ufsvfsp, ufsvfsp);
			if (error)
				goto errout;
		}
		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
			error = 0;
			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
			    (void *)ufsvfsp, ufsvfsp);
			(void) bfinval(ufsvfsp->vfs_dev, 1);
		}
	} else {

		/*
		 * okay to keep access times
		 * okay to free deleted files
		 * okay to write the superblock
		 */
		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		ulp->ul_sbowner = NULL;

		/*
		 * flush in case deleted files are in memory
		 */
		if (noidel) {
			if (error = ufs_flush(vfsp))
				goto errout;
		}
	}

errout:
	cv_broadcast(&ulp->ul_cv);
	return (error);
}

/*
 * ufs_reconcile_fs
 *	reconcile incore superblock with ondisk superblock
 */
int
ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
	struct fs	*mfs;	/* in-memory superblock */
	struct fs	*dfs;	/* on-disk   superblock */
	struct buf	*bp;	/* on-disk   superblock buf */
	int		 needs_unlock;
	char		 finished_fsclean;

	mfs = ufsvfsp->vfs_fs;

	/*
	 * get the on-disk copy of the superblock
	 */
	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
	bp->b_flags |= (B_STALE|B_AGE);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dfs = bp->b_un.b_fs;

	/* error locks may only unlock after the fs has been made consistent */
	if (errlck == UN_ERRLCK) {
		if (dfs->fs_clean == FSFIX) {	/* being repaired */
			brelse(bp);
			return (EAGAIN);
		}
		/* repair not yet started? */
		finished_fsclean = TRANS_ISTRANS(ufsvfsp) ? FSLOG : FSCLEAN;
		if (dfs->fs_clean != finished_fsclean) {
			brelse(bp);
			return (EBUSY);
		}
	}

	/*
	 * if the superblock has changed too much, abort
	 */
	if ((mfs->fs_sblkno		!= dfs->fs_sblkno) ||
	    (mfs->fs_cblkno		!= dfs->fs_cblkno) ||
	    (mfs->fs_iblkno		!= dfs->fs_iblkno) ||
	    (mfs->fs_dblkno		!= dfs->fs_dblkno) ||
	    (mfs->fs_cgoffset		!= dfs->fs_cgoffset) ||
	    (mfs->fs_cgmask		!= dfs->fs_cgmask) ||
	    (mfs->fs_bsize		!= dfs->fs_bsize) ||
	    (mfs->fs_fsize		!= dfs->fs_fsize) ||
	    (mfs->fs_frag		!= dfs->fs_frag) ||
	    (mfs->fs_bmask		!= dfs->fs_bmask) ||
	    (mfs->fs_fmask		!= dfs->fs_fmask) ||
	    (mfs->fs_bshift		!= dfs->fs_bshift) ||
	    (mfs->fs_fshift		!= dfs->fs_fshift) ||
	    (mfs->fs_fragshift		!= dfs->fs_fragshift) ||
	    (mfs->fs_fsbtodb		!= dfs->fs_fsbtodb) ||
	    (mfs->fs_sbsize		!= dfs->fs_sbsize) ||
	    (mfs->fs_nindir		!= dfs->fs_nindir) ||
	    (mfs->fs_nspf		!= dfs->fs_nspf) ||
	    (mfs->fs_trackskew		!= dfs->fs_trackskew) ||
	    (mfs->fs_cgsize		!= dfs->fs_cgsize) ||
	    (mfs->fs_ntrak		!= dfs->fs_ntrak) ||
	    (mfs->fs_nsect		!= dfs->fs_nsect) ||
	    (mfs->fs_spc		!= dfs->fs_spc) ||
	    (mfs->fs_cpg		!= dfs->fs_cpg) ||
	    (mfs->fs_ipg		!= dfs->fs_ipg) ||
	    (mfs->fs_fpg		!= dfs->fs_fpg) ||
	    (mfs->fs_postblformat	!= dfs->fs_postblformat) ||
	    (mfs->fs_magic		!= dfs->fs_magic)) {
		brelse(bp);
		return (EACCES);
	}
	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
		if (mfs->fs_clean == FSLOG) {
			brelse(bp);
			return (EACCES);
		}

	/*
	 * get new summary info
	 */
	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
		brelse(bp);
		return (EIO);
	}

	/*
	 * release old summary info and update in-memory superblock
	 */
	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */

	/*
	 * update fields allowed to change
	 */
	mfs->fs_size		= dfs->fs_size;
	mfs->fs_dsize		= dfs->fs_dsize;
	mfs->fs_ncg		= dfs->fs_ncg;
	mfs->fs_minfree		= dfs->fs_minfree;
	mfs->fs_rotdelay	= dfs->fs_rotdelay;
	mfs->fs_rps		= dfs->fs_rps;
	mfs->fs_maxcontig	= dfs->fs_maxcontig;
	mfs->fs_maxbpg		= dfs->fs_maxbpg;
	mfs->fs_csmask		= dfs->fs_csmask;
	mfs->fs_csshift		= dfs->fs_csshift;
	mfs->fs_optim		= dfs->fs_optim;
	mfs->fs_csaddr		= dfs->fs_csaddr;
	mfs->fs_cssize		= dfs->fs_cssize;
	mfs->fs_ncyl		= dfs->fs_ncyl;
	mfs->fs_cstotal		= dfs->fs_cstotal;
	mfs->fs_reclaim		= dfs->fs_reclaim;

	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
		mfs->fs_reclaim &= ~FS_RECLAIM;
		mfs->fs_reclaim |=  FS_RECLAIMING;
		ufs_thread_start(&ufsvfsp->vfs_reclaim,
		    ufs_thread_reclaim, vfsp);
	}

	/* XXX What to do about sparecon? */

	/* XXX need to copy volume label */

	/*
	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
	 * or if error-locked and ondisk is now clean
	 */
	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
	if (needs_unlock)
		mutex_enter(&ufsvfsp->vfs_lock);

	if (errlck == UN_ERRLCK) {
		if (finished_fsclean == dfs->fs_clean)
			mfs->fs_clean = finished_fsclean;
		else
			mfs->fs_clean = FSBAD;
		mfs->fs_state = FSOKAY - dfs->fs_time;
	}

	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
	    (dfs->fs_clean == FSBAD))
		mfs->fs_clean = FSBAD;

	if (needs_unlock)
		mutex_exit(&ufsvfsp->vfs_lock);

	brelse(bp);

	return (0);
}

/*
 * ufs_reconcile_inode
 *	reconcile ondisk inode with incore inode
 */
static int
ufs_reconcile_inode(struct inode *ip, void *arg)
{
	int		i;
	int		ndaddr;
	int		niaddr;
	struct dinode	*dp;		/* ondisk inode */
	struct buf	*bp	= NULL;
	uid_t		d_uid;
	gid_t		d_gid;
	int		error = 0;
	struct fs	*fs;

	/*
	 * not an inode we care about
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	fs = ip->i_fs;

	/*
	 * Inode reconciliation fails only on a logic error: we made the
	 * filesystem quiescent and we did a ufs_flush() before calling
	 * ufs_reconcile_inode(), so the inode should not have changed in
	 * between. Any discrepancy indicates a pretty significant run-state
	 * inconsistency we should complain about.
	 */
	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
		cmn_err(CE_WARN, "%s: Inode reconciliation failed for "
		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
		return (EINVAL);
	}

	/*
	 * get the dinode
	 */
	bp = UFS_BREAD(ip->i_ufsvfs,
	    ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
	    (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dp  = bp->b_un.b_dino;
	dp += itoo(fs, ip->i_number);

	/*
	 * handle Sun's implementation of EFT
	 */
	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (gid_t)dp->di_sgid;

	rw_enter(&ip->i_contents, RW_WRITER);

	/*
	 * some fields are not allowed to change
	 */
	if ((ip->i_mode  != dp->di_mode) ||
	    (ip->i_gen   != dp->di_gen) ||
	    (ip->i_uid   != d_uid) ||
	    (ip->i_gid   != d_gid)) {
		error = EACCES;
		goto out;
	}

	/*
	 * and some are allowed to change
	 */
	ip->i_size		= dp->di_size;
	ip->i_ic.ic_flags	= dp->di_ic.ic_flags;
	ip->i_blocks		= dp->di_blocks;
	ip->i_nlink		= dp->di_nlink;
	if (ip->i_flag & IFASTSYMLNK) {
		ndaddr = 1;
		niaddr = 0;
	} else {
		ndaddr = NDADDR;
		niaddr = NIADDR;
	}
	for (i = 0; i < ndaddr; ++i)
		ip->i_db[i] = dp->di_db[i];
	for (i = 0; i < niaddr; ++i)
		ip->i_ib[i] = dp->di_ib[i];

out:
	rw_exit(&ip->i_contents);
	brelse(bp);
	return (error);
}

/*
 * ufs_reconcile
 *	reconcile ondisk superblock/inodes with any incore
 */
static int
ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
	int	error = 0;

	/*
	 * get rid of as much inmemory data as possible
	 */
	(void) ufs_flush(vfsp);

	/*
	 * reconcile the superblock and inodes
	 */
	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
		return (error);
	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
		return (error);
	/*
	 * allocation blocks may be incorrect; get rid of them
	 */
	(void) ufs_flush(vfsp);

	return (error);
}

/*
 * File system locking
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	return (ufs__fiolfs(vp, lockfsp, /* from_user */ 1, from_log));
}

/* kernel-internal interface, also used by fix-on-panic */
int
ufs__fiolfs(
	struct vnode *vp,
	struct lockfs *lockfsp,
	int from_user,
	int from_log)
{
	struct ulockfs	*ulp;
	struct lockfs	lfs;
	int		error;
	struct vfs	*vfsp;
	struct ufsvfs	*ufsvfsp;
	int		 errlck		= NO_ERRLCK;
	int		 poll_events	= POLLPRI;
	extern struct pollhead ufs_pollhd;
	ulockfs_info_t *head;
	ulockfs_info_t *info;
	int signal = 0;

	/* check valid lock type */
	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
		return (EINVAL);

	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
		return (EIO);

	vfsp = vp->v_vfsp;

	if (vfsp->vfs_flag & VFS_UNMOUNTED) /* has been unmounted */
		return (EIO);

	/* take the lock and check again */
	vfs_lock_wait(vfsp);
	if (vfsp->vfs_flag & VFS_UNMOUNTED) {
		vfs_unlock(vfsp);
		return (EIO);
	}

	/*
	 * Can't wlock or ro/elock fs with accounting or local swap file.
	 * We need to check for this before we grab the ul_lock to avoid
	 * deadlocks with the accounting framework.
	 */
	if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) ||
	    LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) {
		if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) {
			vfs_unlock(vfsp);
			return (EDEADLK);
		}
	}

	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	ulp = &ufsvfsp->vfs_ulockfs;
	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * Suspend both the reclaim thread and the delete thread.
	 * This must be done outside the lockfs locking protocol.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	mutex_enter(&ulp->ul_lock);
	atomic_add_long(&ufs_quiesce_pend, 1);

	/*
	 * Quit if there is another lockfs request in progress
	 * that is waiting for existing ufs_vnops to complete.
	 */
	if (ULOCKFS_IS_BUSY(ulp)) {
		error = EBUSY;
		goto errexit;
	}

	/* cannot unlock or downgrade a hard-lock */
	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto errexit;
	}

	/* an error lock may only be unlocked or relocked */
	if (ULOCKFS_IS_ELOCK(ulp)) {
		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * a read-only error lock may only be upgraded to an
	 * error lock or hard lock
	 */
	if (ULOCKFS_IS_ROELOCK(ulp)) {
		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * until read-only error locks are fully implemented
	 * just return EINVAL
	 */
	if (LOCKFS_IS_ROELOCK(lockfsp)) {
		error = EINVAL;
		goto errexit;
	}

	/*
	 * an error lock may only be applied if the file system is
	 * unlocked or already error locked.
	 * (this is to prevent the case where a fs gets changed out from
	 * underneath a fs that is locked for backup,
	 * that is, name/delete/write-locked.)
	 */
	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
	    !ULOCKFS_IS_ROELOCK(ulp)) &&
	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
		error = EBUSY;
		goto errexit;
	}

	/* get and validate the input lockfs request */
	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
		goto errexit;

	/*
	 * save current ulockfs struct
	 */
	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));

	/*
	 * Freeze the file system (pend future accesses)
	 */
	ufs_freeze(ulp, lockfsp);

	/*
	 * Set locking in progress because ufs_quiesce may free the
	 * ul_lock mutex.
	 */
	ULOCKFS_SET_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_SET_BUSY(&ulp->ul_lockfs);

	/*
	 * We need to unset FWLOCK status before we call ufs_quiesce
	 * so that the thread doesn't get suspended. We do this only if
	 * this (fallocate) thread requested an unlock operation.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (!ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_CLR_FWLOCK(ulp);
	}

	/*
	 * Quiesce (wait for outstanding accesses to finish)
	 */
	if (error = ufs_quiesce(ulp)) {
		/*
		 * Interrupted due to signal. There could still be
		 * pending vnops.
		 */
		signal = 1;

		/*
		 * We do broadcast because lock-status
		 * could be reverted to old status.
		 */
		cv_broadcast(&ulp->ul_cv);
		goto errout;
	}

	/*
	 * If the fallocate thread requested a write fs lock operation
	 * then we set fwlock status in the ulp.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_SET_FWLOCK(ulp);
	}

	/*
	 * save error lock status to pass down to reconciliation
	 * routines and for later cleanup
	 */
	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
		errlck = UN_ERRLCK;

	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
		int needs_unlock;
		int needs_sbwrite;

		poll_events |= POLLERR;
		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
		    RE_ERRLCK : SET_ERRLCK;

		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
		if (needs_unlock)
			mutex_enter(&ufsvfsp->vfs_lock);

		/* disable delayed i/o */
		needs_sbwrite = 0;

		if (errlck == SET_ERRLCK) {
			ufsvfsp->vfs_fs->fs_clean = FSBAD;
			needs_sbwrite = 1;
		}

		needs_sbwrite |= ufsvfsp->vfs_dio;
		ufsvfsp->vfs_dio = 0;

		if (needs_unlock)
			mutex_exit(&ufsvfsp->vfs_lock);

		if (needs_sbwrite) {
			ulp->ul_sbowner = curthread;
			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);

			if (needs_unlock)
				mutex_enter(&ufsvfsp->vfs_lock);

			ufsvfsp->vfs_fs->fs_fmod = 0;

			if (needs_unlock)
				mutex_exit(&ufsvfsp->vfs_lock);
		}
	}

	/*
	 * reconcile superblock and inodes if it was wlocked
	 */
	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
			goto errout;
		/*
		 * in case the fs grew, reset the metadata map for logging tests
		 */
		TRANS_MATA_UMOUNT(ufsvfsp);
		TRANS_MATA_MOUNT(ufsvfsp);
		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
	}

	/*
	 * At least everything *currently* dirty goes out.
	 */

	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
	    !ULOCKFS_IS_ELOCK(ulp))
		goto errout;

	/*
	 * thaw file system and wakeup pended processes
	 */
	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
			goto errout;

	/*
	 * reset modified flag if not already write locked
	 */
	if (!LOCKFS_IS_WLOCK(&lfs))
		ULOCKFS_CLR_MOD(ulp);

	/*
	 * idle the lock struct
	 */
	ULOCKFS_CLR_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

	/*
	 * free current comment
	 */
	if (lfs.lf_comment && lfs.lf_comlen != 0) {
		kmem_free(lfs.lf_comment, lfs.lf_comlen);
		lfs.lf_comment = NULL;
		lfs.lf_comlen = 0;
	}

	/* do error lock cleanup */
	if (errlck == UN_ERRLCK)
		ufsfx_unlockfs(ufsvfsp);

	else if (errlck == RE_ERRLCK)
		ufsfx_lockfs(ufsvfsp);

	/* don't allow error lock from user to invoke panic */
	else if (from_user && errlck == SET_ERRLCK &&
	    !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
		(void) ufs_fault(ufsvfsp->vfs_root,
		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
		    ulp->ul_lockfs.lf_comment : "user-applied error lock");

	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
		poll_events |= POLLERR;

	pollwakeup(&ufs_pollhd, poll_events);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (0);

errout:
	/*
	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
	 */
	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
		ulp->ul_fs_lock = (1 << lfs.lf_lock);
	}

	/*
	 * Don't call ufs_thaw() when there's a signal during
	 * ufs quiesce operation as it can lead to deadlock
	 * with getpage.
	 */
	if (signal == 0)
		(void) ufs_thaw(vfsp, ufsvfsp, ulp);

	ULOCKFS_CLR_BUSY(ulp);
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

errexit:
	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (error);
}

/*
 * fiolfss
 *	return the current file system locking state info
 */
int
ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
{
	struct ulockfs	*ulp;

	if (!vp || !vp->v_vfsp || !VTOI(vp))
		return (EINVAL);

	/* file system has been forcibly unmounted */
	if (VTOI(vp)->i_ufsvfs == NULL)
		return (EIO);

	ulp = VTOUL(vp);

	if (ULOCKFS_IS_HLOCK(ulp)) {
		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
		return (0);
	}

	mutex_enter(&ulp->ul_lock);

	*lockfsp = ulp->ul_lockfs;	/* structure assignment */

	if (ULOCKFS_IS_MOD(ulp))
		lockfsp->lf_flags |= LOCKFS_MOD;

	mutex_exit(&ulp->ul_lock);

	return (0);
}

/*
 * ufs_check_lockfs
 *	check whether a ufs_vnops call conflicts with the file system lock
 */
int
ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
{
	k_sigset_t	smask;
	int		sig, slock;

	ASSERT(MUTEX_HELD(&ulp->ul_lock));

	while (ulp->ul_fs_lock & mask) {
		slock = (int)ULOCKFS_IS_SLOCK(ulp);
		if ((curthread->t_flag & T_DONTPEND) && !slock) {
			curthread->t_flag |= T_WOULDBLOCK;
			return (EAGAIN);
		}
		curthread->t_flag &= ~T_WOULDBLOCK;

		/*
		 * In the case of an onerr umount of the fs, threads could
		 * have blocked before coming into ufs_check_lockfs and
		 * need to check for the special case of ELOCK and
		 * vfs_dontblock being set, which would indicate that the fs
		 * is on its way out and will not return, therefore making
		 * EIO the appropriate response.
		 */
		if (ULOCKFS_IS_HLOCK(ulp) ||
		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
			return (EIO);

		/*
		 * wait for lock status to change
		 */
		if (slock || ufsvfsp->vfs_nointr) {
			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
		} else {
			sigintr(&smask, 1);
			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
			sigunintr(&smask);
			if ((!sig && (ulp->ul_fs_lock & mask)) ||
			    ufsvfsp->vfs_dontblock)
				return (EINTR);
		}
	}

	if (mask & ULOCKFS_FWLOCK) {
		atomic_add_long(&ulp->ul_falloc_cnt, 1);
		ULOCKFS_SET_FALLOC(ulp);
	} else {
		atomic_add_long(&ulp->ul_vnops_cnt, 1);
	}

	return (0);
}
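
/*
 * Sketch of a typical ufs_check_lockfs() call, as made from
 * ufs_lockfs_begin() below: the caller holds ul_lock, passes the per-VOP
 * mask, and on success owns one reference on either ul_vnops_cnt or
 * ul_falloc_cnt (which ufs_lockfs_end() later drops):
 *
 *	mutex_enter(&ulp->ul_lock);
 *	error = ufs_check_lockfs(ufsvfsp, ulp, mask);
 *	mutex_exit(&ulp->ul_lock);
 */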

/*
 * Check whether we came in via the handcrafted lockfs protocol path. We
 * can't simply check for T_DONTBLOCK here as one might assume, since that
 * can also falsely catch recursive VOPs going to a different filesystem;
 * instead we check whether we already hold the ulockfs->ul_lock mutex.
 */
static int
ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
{
	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
}

/*
 * ufs_lockfs_begin - start the lockfs locking protocol
 */
int
ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
{
	int		error;
	int		rec_vop;
	ushort_t	op_cnt_incremented = 0;
	ulong_t		*ctr;
	struct ulockfs *ulp;
	ulockfs_info_t	*ulockfs_info;
	ulockfs_info_t	*ulockfs_info_free;
	ulockfs_info_t	*ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect recursive VOP call or handcrafted internal lockfs protocol
	 * path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 *
	 * Increment the ctr irrespective of the lockfs state. If the lockfs
	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
	 * before incrementing we need to check if there is a pending quiesce
	 * request, because if we have a continuous stream of ufs_lockfs_begin
	 * requests pounding on a few CPUs then the ufs_quiesce thread might
	 * never see the value of zero for ctr - a livelock kind of scenario.
	 */
	ctr = (mask & ULOCKFS_FWLOCK) ?
	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
	if (!ULOCKFS_IS_SLOCK(ulp)) {
		atomic_add_long(ctr, 1);
		op_cnt_incremented++;
	}

	/*
	 * If the lockfs state (indicated by ul_fs_lock) is not just
	 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
	 * where there is a check with an appropriate mask to selectively allow
	 * operations permitted for that kind of lockfs state.
	 *
	 * Even these selective operations should not be allowed to go through
	 * if a lockfs request is in progress, because that could result in
	 * inode modifications during a quiesce and could hence result in inode
	 * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient,
	 * so make use of ufs_quiesce_pend to disallow vnode operations when a
	 * quiesce is in progress.
	 */
	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		if (op_cnt_incremented)
			if (!atomic_add_long_nv(ctr, -1))
				cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
		mutex_exit(&ulp->ul_lock);
		if (error) {
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
	} else {
		/*
		 * This is the common case of a file system in an unlocked
		 * state.
		 *
		 * If a file system is unlocked, we would expect the ctr to
		 * have been incremented by now. But this will not be true when
		 * a quiesce is winding up - SLOCK was set when we checked
		 * before incrementing the ctr, but by the time we checked for
		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is
		 * okay to take ul_lock and go through the slow path in this
		 * uncommon case.
		 */
		if (op_cnt_incremented == 0) {
			mutex_enter(&ulp->ul_lock);
			error = ufs_check_lockfs(ufsvfsp, ulp, mask);
			if (error) {
				mutex_exit(&ulp->ul_lock);
				if (ulockfs_info_free == NULL)
					kmem_free(ulockfs_info_temp,
					    sizeof (ulockfs_info_t));
				return (error);
			}
			if (mask & ULOCKFS_FWLOCK)
				ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		} else if (mask & ULOCKFS_FWLOCK) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}
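
/*
 * Hedged usage sketch: a UFS vnode operation brackets its work with the
 * begin/end pair roughly as below. The mask (e.g. ULOCKFS_WRITE_MASK from
 * sys/fs/ufs_lockfs.h) is per-VOP; treat the exact mask here as
 * illustrative only:
 *
 *	struct ulockfs *ulp;
 *	int error;
 *
 *	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 *	if (error)
 *		return (error);
 *	... perform the vnode operation ...
 *	if (ulp)
 *		ufs_lockfs_end(ulp);
 */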

/*
 * Check whether we are returning from the top level VOP.
 */
static int
ufs_lockfs_top_vop_return(ulockfs_info_t *head)
{
	ulockfs_info_t *info;
	int result = 1;

	for (info = head; info != NULL; info = info->next) {
		if (info->ulp != NULL) {
			result = 0;
			break;
		}
	}

	return (result);
}

/*
 * ufs_lockfs_end - terminate the lockfs locking protocol
 */
void
ufs_lockfs_end(struct ulockfs *ulp)
{
	ulockfs_info_t *info;
	ulockfs_info_t *head;

	/*
	 * end-of-VOP protocol
	 */
	if (ulp == NULL)
		return;

	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * If we're called from a first level VOP, we have to have a
	 * valid ulockfs record in the TSD.
	 */
	ASSERT(info != NULL);

	/*
	 * Invalidate the ulockfs record.
	 */
	info->ulp = NULL;

	if (ufs_lockfs_top_vop_return(head))
		curthread->t_flag &= ~T_DONTBLOCK;

	/* fallocate thread */
	if (ULOCKFS_IS_FALLOC(ulp) && (info->flags & ULOCK_INFO_FALLOCATE)) {
		/* Clear the thread's fallocate state */
		info->flags &= ~ULOCK_INFO_FALLOCATE;
		if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_CLR_FALLOC(ulp);
			cv_broadcast(&ulp->ul_cv);
			mutex_exit(&ulp->ul_lock);
		}
	} else { /* normal thread */
		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
			cv_broadcast(&ulp->ul_cv);
	}
}

/*
 * ufs_lockfs_trybegin - try to start the lockfs locking protocol without
 * blocking.
 */
int
ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
{
	int		error = 0;
	int		rec_vop;
	ushort_t	op_cnt_incremented = 0;
	ulong_t		*ctr;
	struct ulockfs *ulp;
	ulockfs_info_t	*ulockfs_info;
	ulockfs_info_t	*ulockfs_info_free;
	ulockfs_info_t	*ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect recursive VOP call or handcrafted internal lockfs protocol
	 * path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 *
	 * Increment the ctr irrespective of the lockfs state. If the lockfs
	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
	 * before incrementing we need to check if there is a pending quiesce
	 * request, because if we have a continuous stream of ufs_lockfs_begin
	 * requests pounding on a few CPUs then the ufs_quiesce thread might
	 * never see the value of zero for ctr - a livelock kind of scenario.
	 */
	ctr = (mask & ULOCKFS_FWLOCK) ?
	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
	if (!ULOCKFS_IS_SLOCK(ulp)) {
		atomic_add_long(ctr, 1);
		op_cnt_incremented++;
	}

	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		/*
		 * Non-blocking version of the ufs_check_lockfs() code.
		 *
		 * If the file system is not hard locked or error locked
		 * and if ulp->ul_fs_lock allows this operation, increment
		 * the appropriate counter and proceed (for example, if the
		 * file system is delete locked, an mmap can still go through).
		 */
		if (op_cnt_incremented)
			if (!atomic_add_long_nv(ctr, -1))
				cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		if (ULOCKFS_IS_HLOCK(ulp) ||
		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
			error = EIO;
		else if (ulp->ul_fs_lock & mask)
			error = EAGAIN;

		if (error) {
			mutex_exit(&ulp->ul_lock);
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
		atomic_add_long(ctr, 1);
		if (mask & ULOCKFS_FWLOCK)
			ULOCKFS_SET_FALLOC(ulp);
		mutex_exit(&ulp->ul_lock);
	} else {
		/*
		 * This is the common case of a file system in an unlocked
		 * state.
		 *
		 * If a file system is unlocked, we would expect the ctr to
		 * have been incremented by now. But this will not be true when
		 * a quiesce is winding up - SLOCK was set when we checked
		 * before incrementing the ctr, but by the time we checked for
		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take
		 * ul_lock and go through the non-blocking version of the
		 * ufs_check_lockfs() code.
		 */
		if (op_cnt_incremented == 0) {
			mutex_enter(&ulp->ul_lock);
			if (ULOCKFS_IS_HLOCK(ulp) ||
			    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
				error = EIO;
			else if (ulp->ul_fs_lock & mask)
				error = EAGAIN;

			if (error) {
				mutex_exit(&ulp->ul_lock);
				if (ulockfs_info_free == NULL)
					kmem_free(ulockfs_info_temp,
					    sizeof (ulockfs_info_t));
				return (error);
			}
			atomic_add_long(ctr, 1);
			if (mask & ULOCKFS_FWLOCK)
				ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		} else if (mask & ULOCKFS_FWLOCK) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}

/*
 * specialized version of ufs_lockfs_begin() called by ufs_getpage().
 */
int
ufs_lockfs_begin_getpage(
	struct ufsvfs	*ufsvfsp,
	struct ulockfs	**ulpp,
	struct seg	*seg,
	int		read_access,
	uint_t		*protp)
{
	ulong_t			mask;
	int			error;
	int			rec_vop;
	struct ulockfs		*ulp;
	ulockfs_info_t		*ulockfs_info;
	ulockfs_info_t		*ulockfs_info_free;
	ulockfs_info_t		*ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect recursive VOP call or handcrafted internal lockfs protocol
	 * path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 */
	atomic_add_long(&ulp->ul_vnops_cnt, 1);
	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
			cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		if (seg->s_ops == &segvn_ops &&
		    ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
		} else if (protp && read_access) {
			/*
			 * Restrict the mapping to readonly.
			 * Writes to this mapping will cause
			 * another fault which will then
			 * be suspended if the fs is write locked
			 */
			*protp &= ~PROT_WRITE;
			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
		} else
			mask = (ulong_t)ULOCKFS_GETWRITE_MASK;

		/*
		 * will sleep if this fs is locked against this VOP
		 */
		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
		mutex_exit(&ulp->ul_lock);
		if (error) {
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}

void
ufs_lockfs_tsd_destructor(void *head)
{
	ulockfs_info_t *curr = (ulockfs_info_t *)head;
	ulockfs_info_t *temp;

	for (; curr != NULL; ) {
		/*
		 * The TSD destructor is called when the thread exits
		 * (via thread_exit()). By that time the thread must have
		 * cleaned up all VOPs via ufs_lockfs_end(), and no valid
		 * ulockfs record may still exist while a thread is exiting.
		 */
		temp = curr;
		curr = curr->next;
		ASSERT(temp->ulp == NULL);
		kmem_free(temp, sizeof (ulockfs_info_t));
	}
}