/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/acct.h>
#include <sys/dnlc.h>
#include <sys/swap.h>

#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_mount.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_quota.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>
#include <sys/sysinfo.h>

#include <vm/hat.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/anon.h>
#include <sys/swap.h>
#include <sys/dnlc.h>

extern struct vnode *common_specvp(struct vnode *vp);

/* error lock status */
#define	UN_ERRLCK	(-1)
#define	SET_ERRLCK	1
#define	RE_ERRLCK	2
#define	NO_ERRLCK	0

/*
 * Index to be used in TSD for storing lockfs data
 */
uint_t ufs_lockfs_key;

typedef struct _ulockfs_info {
	struct _ulockfs_info *next;
	struct ulockfs *ulp;
	uint_t flags;
} ulockfs_info_t;

#define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */

/*
 * Check in TSD whether we are already doing any VOP on this filesystem
 */
#define	IS_REC_VOP(found, head, ulp, free)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (found = 0, free = NULL, _curr = head;	\
	    _curr != NULL; _curr = _curr->next) {	\
		if ((free == NULL) &&			\
		    (_curr->ulp == NULL))		\
			free = _curr;			\
		if (_curr->ulp == ulp) {		\
			found = 1;			\
			break;				\
		}					\
	}						\
}
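
/*
 * Illustrative sketch (not additional kernel code): the macro above is
 * driven from the per-thread TSD list roughly as ufs_lockfs_begin() does
 * below; variable names follow that caller:
 *
 *	int rec_vop;
 *	ulockfs_info_t *head, *free_slot;
 *
 *	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
 *	IS_REC_VOP(rec_vop, head, ulp, free_slot);
 *	if (rec_vop)
 *		... recursive VOP; skip the lockfs protocol ...
 */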

/*
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly
 */
#define	SEARCH_ULOCKFSP(head, ulp, info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = head; _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == ulp) {		\
			break;				\
		}					\
	}						\
							\
	info = _curr;					\
}

/*
 * Validate lockfs request
 */
static int
ufs_getlfd(
	struct lockfs *lockfsp,		/* new lock request */
	struct lockfs *ul_lockfsp)	/* old lock state */
{
	int	error = 0;

	/*
	 * no input flags defined
	 */
	if (lockfsp->lf_flags != 0) {
		error = EINVAL;
		goto errout;
	}

	/*
	 * check key
	 */
	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
			error = EINVAL;
			goto errout;
		}

	lockfsp->lf_key = ul_lockfsp->lf_key + 1;

errout:
	return (error);
}

/*
 * ufs_checkaccton
 *	check if accounting is turned on for this fs
 */

int
ufs_checkaccton(struct vnode *vp)
{
	if (acct_fs_in_use(vp))
		return (EDEADLK);
	return (0);
}

/*
 * ufs_checkswapon
 *	check if local swapping is to a file on this fs
 */
int
ufs_checkswapon(struct vnode *vp)
{
	struct swapinfo	*sip;

	mutex_enter(&swapinfo_lock);
	for (sip = swapinfo; sip; sip = sip->si_next)
		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
			mutex_exit(&swapinfo_lock);
			return (EDEADLK);
		}
	mutex_exit(&swapinfo_lock);
	return (0);
}

/*
 * ufs_freeze
 *	pend future accesses for current lock and desired lock
 */
void
ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
{
	/*
	 * set to new lock type
	 */
	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;

	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
}
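
/*
 * For example (a sketch, assuming the usual ULOCKFS_* bit definitions in
 * sys/fs/ufs_lockfs.h mirror the LOCKFS_* lock numbers): a request with
 * lf_lock == LOCKFS_WLOCK leaves ul_fs_lock == (1 << LOCKFS_WLOCK), which
 * is exactly the bit that predicates such as ULOCKFS_IS_WLOCK() test.
 */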

/*
 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
 * starting the ufs_quiesce() protocol and decrement it only when the file
 * system no longer has to be in the quiescent state. This allows
 * ufs_pageio() to detect that another thread wants to quiesce a file
 * system. See more comments in ufs_pageio().
 */
ulong_t ufs_quiesce_pend = 0;
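
/*
 * A minimal sketch of the caller protocol described above, in the shape
 * used by ufs__fiolfs() later in this file (error handling elided):
 *
 *	mutex_enter(&ulp->ul_lock);
 *	atomic_add_long(&ufs_quiesce_pend, 1);
 *	...freeze the fs, then...
 *	error = ufs_quiesce(ulp);
 *	...
 *	atomic_add_long(&ufs_quiesce_pend, -1);
 *	mutex_exit(&ulp->ul_lock);
 */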

/*
 * ufs_quiesce
 *	wait for outstanding accesses to finish
 */
int
ufs_quiesce(struct ulockfs *ulp)
{
	int error = 0;
	ulockfs_info_t *head;
	ulockfs_info_t *info;
	klwp_t *lwp = ttolwp(curthread);

	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * We have to keep /proc away from stopping us after we applied
	 * the softlock but before we got a chance to clear it again.
	 * prstop() may pagefault and become stuck on the softlock still
	 * pending.
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	/*
	 * Set a softlock to suspend future ufs_vnops so that
	 * this lockfs request will not be starved
	 */
	ULOCKFS_SET_SLOCK(ulp);
	ASSERT(ufs_quiesce_pend);

	/* check if there are any outstanding ufs vnodeop calls */
	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
		/*
		 * use the timed version of cv_wait_sig() to make sure we
		 * don't miss a wake-up call from ufs_pageio() when it doesn't
		 * use ul_lock.
		 *
		 * when a fallocate thread comes in, the only way it returns
		 * from this function is if there are no other vnode operations
		 * going on (remember fallocate threads are tracked using
		 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
		 * hasn't already grabbed the fs write lock.
		 */
		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
				goto out;
		}
		if (!cv_reltimedwait_sig(&ulp->ul_cv, &ulp->ul_lock, hz,
		    TR_CLOCK_TICK)) {
			error = EINTR;
			goto out;
		}
	}

out:
	/*
	 * unlock the soft lock
	 */
	ULOCKFS_CLR_SLOCK(ulp);

	if (lwp != NULL)
		lwp->lwp_nostop--;

	return (error);
}

/*
 * ufs_flush_inode
 */
int
ufs_flush_inode(struct inode *ip, void *arg)
{
	int	error;
	int	saverror	= 0;

	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * asynchronously push all the dirty pages
	 */
	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
	    (error != EAGAIN))
		saverror = error;
	/*
	 * wait for io and discard all mappings
	 */
	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
		saverror = error;

	if (ITOV(ip)->v_type == VDIR) {
		dnlc_dir_purge(&ip->i_danchor);
	}

	return (saverror);
}

/*
 * ufs_flush
 *	Flush everything that is currently dirty; this includes invalidating
 *	any mappings.
 */
int
ufs_flush(struct vfs *vfsp)
{
	int		error;
	int		saverror = 0;
	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
	struct fs	*fs		= ufsvfsp->vfs_fs;
	int		tdontblock = 0;

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * purge dnlc
	 */
	(void) dnlc_purge_vfsp(vfsp, 0);

	/*
	 * drain the delete and idle threads
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * flush and invalidate quota records
	 */
	(void) qsync(ufsvfsp);

	/*
	 * flush w/invalidate the inodes for vfsp
	 */
	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
		saverror = error;

	/*
	 * synchronously flush superblock and summary info
	 */
	if (fs->fs_ronly == 0 && fs->fs_fmod) {
		fs->fs_fmod = 0;
		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
	}
	/*
	 * flush w/invalidate block device pages and buf cache
	 */
	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
	    (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0)
		saverror = error;

	(void) bflush((dev_t)vfsp->vfs_dev);
	(void) bfinval((dev_t)vfsp->vfs_dev, 0);

	/*
	 * drain the delete and idle threads again
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * play with the clean flag
	 */
	if (saverror == 0)
		ufs_checkclean(vfsp);

	/*
	 * Flush any outstanding transactions and roll the log only if we are
	 * supposed to, i.e. LDL_NOROLL is not set. We cannot simply check
	 * fs_ronly here since fsck may also use this code to roll the log on
	 * a read-only filesystem, e.g. root during early stages of boot; if
	 * it does more than a sanity check, it will have cleared LDL_NOROLL
	 * beforehand. In addition we assert that the deltamap does not
	 * contain any deltas in case LDL_NOROLL is set since this is not
	 * supposed to happen.
	 */
	if (TRANS_ISTRANS(ufsvfsp)) {
		ml_unit_t	*ul	= ufsvfsp->vfs_log;
		mt_map_t	*mtm	= ul->un_deltamap;

		if (ul->un_flags & LDL_NOROLL) {
			ASSERT(mtm->mtm_nme == 0);
		} else {
			/*
			 * Do not set T_DONTBLOCK if there is a
			 * transaction opened by the caller.
			 */
			if (curthread->t_flag & T_DONTBLOCK)
				tdontblock = 1;
			else
				curthread->t_flag |= T_DONTBLOCK;

			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
			    TOP_COMMIT_SIZE, error);

			if (!error) {
				TRANS_END_SYNC(ufsvfsp, saverror,
				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
			}

			if (tdontblock == 0)
				curthread->t_flag &= ~T_DONTBLOCK;

			logmap_roll_dev(ufsvfsp->vfs_log);
		}
	}

	return (saverror);
}

/*
 * ufs_thaw_wlock
 *	special processing when thawing down to wlock
 */
static int
ufs_thaw_wlock(struct inode *ip, void *arg)
{
	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * iupdat refuses to clear flags if the fs is read only.  The fs
	 * may become read/write during the lock and we wouldn't want
	 * these inodes being written to disk.  So clear the flags.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
	rw_exit(&ip->i_contents);

	/*
	 * pages are mlocked -- fail wlock
	 */
	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
		return (EBUSY);

	return (0);
}

/*
 * ufs_thaw_hlock
 *	special processing when thawing down to hlock or elock
 */
static int
ufs_thaw_hlock(struct inode *ip, void *arg)
{
	struct vnode	*vp	= ITOV(ip);

	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * blow away all pages - even if they are mlocked
	 */
	do {
		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
	rw_enter(&ip->i_contents, RW_WRITER);
	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * ufs_thaw
 *	thaw file system lock down to current value
 */
int
ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
{
	int		error	= 0;
	int		noidel	= (int)(ulp->ul_flag & ULOCKFS_NOIDEL);

	/*
	 * if wlock or hlock or elock
	 */
	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
	    ULOCKFS_IS_ELOCK(ulp)) {

		/*
		 * don't keep access times
		 * don't free deleted files
		 * if superblock writes are allowed, limit them to me for now
		 */
		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		if (ulp->ul_sbowner != (kthread_id_t)-1)
			ulp->ul_sbowner = curthread;

		/*
		 * wait for writes for deleted files and superblock updates
		 */
		(void) ufs_flush(vfsp);

		/*
		 * now make sure the quota file is up-to-date
		 *	expensive; but effective
		 */
		error = ufs_flush(vfsp);
		/*
		 * no one can write the superblock
		 */
		ulp->ul_sbowner = (kthread_id_t)-1;

		/*
		 * special processing for wlock/hlock/elock
		 */
		if (ULOCKFS_IS_WLOCK(ulp)) {
			if (error)
				goto errout;
			error = bfinval(ufsvfsp->vfs_dev, 0);
			if (error)
				goto errout;
			error = ufs_scan_inodes(0, ufs_thaw_wlock,
			    (void *)ufsvfsp, ufsvfsp);
			if (error)
				goto errout;
		}
		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
			error = 0;
			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
			    (void *)ufsvfsp, ufsvfsp);
			(void) bfinval(ufsvfsp->vfs_dev, 1);
		}
	} else {

		/*
		 * okay to keep access times
		 * okay to free deleted files
		 * okay to write the superblock
		 */
		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		ulp->ul_sbowner = NULL;

		/*
		 * flush in case deleted files are in memory
		 */
		if (noidel) {
			if (error = ufs_flush(vfsp))
				goto errout;
		}
	}

errout:
	cv_broadcast(&ulp->ul_cv);
	return (error);
}

/*
 * ufs_reconcile_fs
 *	reconcile incore superblock with ondisk superblock
 */
int
ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
	struct fs	*mfs;	/* in-memory superblock */
	struct fs	*dfs;	/* on-disk   superblock */
	struct buf	*bp;	/* on-disk   superblock buf */
	int		 needs_unlock;
	char		 finished_fsclean;

	mfs = ufsvfsp->vfs_fs;

	/*
	 * get the on-disk copy of the superblock
	 */
	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
	bp->b_flags |= (B_STALE|B_AGE);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dfs = bp->b_un.b_fs;

	/* error locks may only unlock after the fs has been made consistent */
	if (errlck == UN_ERRLCK) {
		if (dfs->fs_clean == FSFIX) {	/* being repaired */
			brelse(bp);
			return (EAGAIN);
		}
		/* repair not yet started? */
		finished_fsclean = TRANS_ISTRANS(ufsvfsp) ? FSLOG : FSCLEAN;
		if (dfs->fs_clean != finished_fsclean) {
			brelse(bp);
			return (EBUSY);
		}
	}

	/*
	 * if the superblock has changed too much, abort
	 */
	if ((mfs->fs_sblkno		!= dfs->fs_sblkno) ||
	    (mfs->fs_cblkno		!= dfs->fs_cblkno) ||
	    (mfs->fs_iblkno		!= dfs->fs_iblkno) ||
	    (mfs->fs_dblkno		!= dfs->fs_dblkno) ||
	    (mfs->fs_cgoffset		!= dfs->fs_cgoffset) ||
	    (mfs->fs_cgmask		!= dfs->fs_cgmask) ||
	    (mfs->fs_bsize		!= dfs->fs_bsize) ||
	    (mfs->fs_fsize		!= dfs->fs_fsize) ||
	    (mfs->fs_frag		!= dfs->fs_frag) ||
	    (mfs->fs_bmask		!= dfs->fs_bmask) ||
	    (mfs->fs_fmask		!= dfs->fs_fmask) ||
	    (mfs->fs_bshift		!= dfs->fs_bshift) ||
	    (mfs->fs_fshift		!= dfs->fs_fshift) ||
	    (mfs->fs_fragshift		!= dfs->fs_fragshift) ||
	    (mfs->fs_fsbtodb		!= dfs->fs_fsbtodb) ||
	    (mfs->fs_sbsize		!= dfs->fs_sbsize) ||
	    (mfs->fs_nindir		!= dfs->fs_nindir) ||
	    (mfs->fs_nspf		!= dfs->fs_nspf) ||
	    (mfs->fs_trackskew		!= dfs->fs_trackskew) ||
	    (mfs->fs_cgsize		!= dfs->fs_cgsize) ||
	    (mfs->fs_ntrak		!= dfs->fs_ntrak) ||
	    (mfs->fs_nsect		!= dfs->fs_nsect) ||
	    (mfs->fs_spc		!= dfs->fs_spc) ||
	    (mfs->fs_cpg		!= dfs->fs_cpg) ||
	    (mfs->fs_ipg		!= dfs->fs_ipg) ||
	    (mfs->fs_fpg		!= dfs->fs_fpg) ||
	    (mfs->fs_postblformat	!= dfs->fs_postblformat) ||
	    (mfs->fs_magic		!= dfs->fs_magic)) {
		brelse(bp);
		return (EACCES);
	}
	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
		if (mfs->fs_clean == FSLOG) {
			brelse(bp);
			return (EACCES);
		}

	/*
	 * get new summary info
	 */
	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
		brelse(bp);
		return (EIO);
	}

	/*
	 * release old summary info and update in-memory superblock
	 */
	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */

	/*
	 * update fields allowed to change
	 */
	mfs->fs_size		= dfs->fs_size;
	mfs->fs_dsize		= dfs->fs_dsize;
	mfs->fs_ncg		= dfs->fs_ncg;
	mfs->fs_minfree		= dfs->fs_minfree;
	mfs->fs_rotdelay	= dfs->fs_rotdelay;
	mfs->fs_rps		= dfs->fs_rps;
	mfs->fs_maxcontig	= dfs->fs_maxcontig;
	mfs->fs_maxbpg		= dfs->fs_maxbpg;
	mfs->fs_csmask		= dfs->fs_csmask;
	mfs->fs_csshift		= dfs->fs_csshift;
	mfs->fs_optim		= dfs->fs_optim;
	mfs->fs_csaddr		= dfs->fs_csaddr;
	mfs->fs_cssize		= dfs->fs_cssize;
	mfs->fs_ncyl		= dfs->fs_ncyl;
	mfs->fs_cstotal		= dfs->fs_cstotal;
	mfs->fs_reclaim		= dfs->fs_reclaim;

	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
		mfs->fs_reclaim &= ~FS_RECLAIM;
		mfs->fs_reclaim |=  FS_RECLAIMING;
		ufs_thread_start(&ufsvfsp->vfs_reclaim,
		    ufs_thread_reclaim, vfsp);
	}

	/* XXX What to do about sparecon? */

	/* XXX need to copy volume label */

	/*
	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
	 * or if error-locked and ondisk is now clean
	 */
	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
	if (needs_unlock)
		mutex_enter(&ufsvfsp->vfs_lock);

	if (errlck == UN_ERRLCK) {
		if (finished_fsclean == dfs->fs_clean)
			mfs->fs_clean = finished_fsclean;
		else
			mfs->fs_clean = FSBAD;
		mfs->fs_state = FSOKAY - dfs->fs_time;
	}

	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
	    (dfs->fs_clean == FSBAD))
		mfs->fs_clean = FSBAD;

	if (needs_unlock)
		mutex_exit(&ufsvfsp->vfs_lock);

	brelse(bp);

	return (0);
}

/*
 * ufs_reconcile_inode
 *	reconcile ondisk inode with incore inode
 */
static int
ufs_reconcile_inode(struct inode *ip, void *arg)
{
	int		i;
	int		ndaddr;
	int		niaddr;
	struct dinode	*dp;		/* ondisk inode */
	struct buf	*bp	= NULL;
	uid_t		d_uid;
	gid_t		d_gid;
	int		error = 0;
	struct fs	*fs;

	/*
	 * not an inode we care about
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	fs = ip->i_fs;

	/*
	 * Inode reconciliation fails only on a logic error: we made the
	 * filesystem quiescent and we did a ufs_flush() before calling
	 * ufs_reconcile_inode(), so the inode should not have changed in
	 * between. Any discrepancy indicates a pretty significant run-state
	 * inconsistency we should complain about.
	 */
	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
		cmn_err(CE_WARN, "%s: Inode reconciliation failed for "
		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
		return (EINVAL);
	}

	/*
	 * get the dinode
	 */
	bp = UFS_BREAD(ip->i_ufsvfs,
	    ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
	    (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dp  = bp->b_un.b_dino;
	dp += itoo(fs, ip->i_number);

	/*
	 * handle Sun's implementation of EFT
	 */
	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (gid_t)dp->di_sgid;

	rw_enter(&ip->i_contents, RW_WRITER);

	/*
	 * some fields are not allowed to change
	 */
	if ((ip->i_mode  != dp->di_mode) ||
	    (ip->i_gen   != dp->di_gen) ||
	    (ip->i_uid   != d_uid) ||
	    (ip->i_gid   != d_gid)) {
		error = EACCES;
		goto out;
	}

	/*
	 * and some are allowed to change
	 */
	ip->i_size		= dp->di_size;
	ip->i_ic.ic_flags	= dp->di_ic.ic_flags;
	ip->i_blocks		= dp->di_blocks;
	ip->i_nlink		= dp->di_nlink;
	if (ip->i_flag & IFASTSYMLNK) {
		ndaddr = 1;
		niaddr = 0;
	} else {
		ndaddr = NDADDR;
		niaddr = NIADDR;
	}
	for (i = 0; i < ndaddr; ++i)
		ip->i_db[i] = dp->di_db[i];
	for (i = 0; i < niaddr; ++i)
		ip->i_ib[i] = dp->di_ib[i];

out:
	rw_exit(&ip->i_contents);
	brelse(bp);
	return (error);
}

/*
 * ufs_reconcile
 *	reconcile ondisk superblock/inodes with any incore
 */
static int
ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
	int	error = 0;

	/*
	 * get rid of as much inmemory data as possible
	 */
	(void) ufs_flush(vfsp);

	/*
	 * reconcile the superblock and inodes
	 */
	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
		return (error);
	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
		return (error);
	/*
	 * allocation blocks may be incorrect; get rid of them
	 */
	(void) ufs_flush(vfsp);

	return (error);
}

/*
 * File system locking
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	return (ufs__fiolfs(vp, lockfsp, /* from_user */ 1, from_log));
}

/* kernel-internal interface, also used by fix-on-panic */
int
ufs__fiolfs(
	struct vnode *vp,
	struct lockfs *lockfsp,
	int from_user,
	int from_log)
{
	struct ulockfs	*ulp;
	struct lockfs	lfs;
	int		error;
	struct vfs	*vfsp;
	struct ufsvfs	*ufsvfsp;
	int		 errlck		= NO_ERRLCK;
	int		 poll_events	= POLLPRI;
	extern struct pollhead ufs_pollhd;
	ulockfs_info_t *head;
	ulockfs_info_t *info;
	int signal = 0;

	/* check valid lock type */
	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
		return (EINVAL);

	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
		return (EIO);

	vfsp = vp->v_vfsp;

	if (vfsp->vfs_flag & VFS_UNMOUNTED) /* has been unmounted */
		return (EIO);

	/* take the lock and check again */
	vfs_lock_wait(vfsp);
	if (vfsp->vfs_flag & VFS_UNMOUNTED) {
		vfs_unlock(vfsp);
		return (EIO);
	}

	/*
	 * Can't wlock or ro/elock fs with accounting or local swap file.
	 * We need to check for this before we grab the ul_lock to avoid
	 * deadlocks with the accounting framework.
	 */
	if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) ||
	    LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) {
		if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) {
			vfs_unlock(vfsp);
			return (EDEADLK);
		}
	}

	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	ulp = &ufsvfsp->vfs_ulockfs;
	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * Suspend both the reclaim thread and the delete thread.
	 * This must be done outside the lockfs locking protocol.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	mutex_enter(&ulp->ul_lock);
	atomic_add_long(&ufs_quiesce_pend, 1);

	/*
	 * Quit if there is another lockfs request in progress
	 * that is waiting for existing ufs_vnops to complete.
	 */
	if (ULOCKFS_IS_BUSY(ulp)) {
		error = EBUSY;
		goto errexit;
	}

	/* cannot unlock or downgrade a hard-lock */
	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto errexit;
	}

	/* an error lock may only be unlocked or relocked */
	if (ULOCKFS_IS_ELOCK(ulp)) {
		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * a read-only error lock may only be upgraded to an
	 * error lock or hard lock
	 */
	if (ULOCKFS_IS_ROELOCK(ulp)) {
		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * until read-only error locks are fully implemented
	 * just return EINVAL
	 */
	if (LOCKFS_IS_ROELOCK(lockfsp)) {
		error = EINVAL;
		goto errexit;
	}

	/*
	 * an error lock may only be applied if the file system is
	 * unlocked or already error locked.
	 * (this is to prevent the case where a fs gets changed out from
	 * underneath a fs that is locked for backup,
	 * that is, name/delete/write-locked.)
	 */
	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
	    !ULOCKFS_IS_ROELOCK(ulp)) &&
	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
		error = EBUSY;
		goto errexit;
	}

	/* get and validate the input lockfs request */
	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
		goto errexit;

	/*
	 * save current ulockfs struct
	 */
	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));

	/*
	 * Freeze the file system (pend future accesses)
	 */
	ufs_freeze(ulp, lockfsp);

	/*
	 * Set locking in progress because ufs_quiesce may free the
	 * ul_lock mutex.
	 */
	ULOCKFS_SET_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_SET_BUSY(&ulp->ul_lockfs);

	/*
	 * We need to unset FWLOCK status before we call ufs_quiesce
	 * so that the thread doesn't get suspended. We do this only if
	 * this (fallocate) thread requested an unlock operation.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (!ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_CLR_FWLOCK(ulp);
	}

	/*
	 * Quiesce (wait for outstanding accesses to finish)
	 */
	if (error = ufs_quiesce(ulp)) {
		/*
		 * Interrupted due to signal. There could still be
		 * pending vnops.
		 */
		signal = 1;

		/*
		 * We do broadcast because lock-status
		 * could be reverted to old status.
		 */
		cv_broadcast(&ulp->ul_cv);
		goto errout;
	}

	/*
	 * If the fallocate thread requested a write fs lock operation
	 * then we set fwlock status in the ulp.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_SET_FWLOCK(ulp);
	}

	/*
	 * save error lock status to pass down to reconciliation
	 * routines and for later cleanup
	 */
	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
		errlck = UN_ERRLCK;

	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
		int needs_unlock;
		int needs_sbwrite;

		poll_events |= POLLERR;
		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
		    RE_ERRLCK : SET_ERRLCK;

		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
		if (needs_unlock)
			mutex_enter(&ufsvfsp->vfs_lock);

		/* disable delayed i/o */
		needs_sbwrite = 0;

		if (errlck == SET_ERRLCK) {
			ufsvfsp->vfs_fs->fs_clean = FSBAD;
			needs_sbwrite = 1;
		}

		needs_sbwrite |= ufsvfsp->vfs_dio;
		ufsvfsp->vfs_dio = 0;

		if (needs_unlock)
			mutex_exit(&ufsvfsp->vfs_lock);

		if (needs_sbwrite) {
			ulp->ul_sbowner = curthread;
			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);

			if (needs_unlock)
				mutex_enter(&ufsvfsp->vfs_lock);

			ufsvfsp->vfs_fs->fs_fmod = 0;

			if (needs_unlock)
				mutex_exit(&ufsvfsp->vfs_lock);
		}
	}

	/*
	 * reconcile superblock and inodes if it was wlocked
	 */
	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
			goto errout;
		/*
		 * in case the fs grew, reset the metadata map for logging tests
		 */
		TRANS_MATA_UMOUNT(ufsvfsp);
		TRANS_MATA_MOUNT(ufsvfsp);
		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
	}

	/*
	 * At least everything *currently* dirty goes out.
	 */

	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
	    !ULOCKFS_IS_ELOCK(ulp))
		goto errout;

	/*
	 * thaw file system and wakeup pended processes
	 */
	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
			goto errout;

	/*
	 * reset modified flag if not already write locked
	 */
	if (!LOCKFS_IS_WLOCK(&lfs))
		ULOCKFS_CLR_MOD(ulp);

	/*
	 * idle the lock struct
	 */
	ULOCKFS_CLR_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

	/*
	 * free current comment
	 */
	if (lfs.lf_comment && lfs.lf_comlen != 0) {
		kmem_free(lfs.lf_comment, lfs.lf_comlen);
		lfs.lf_comment = NULL;
		lfs.lf_comlen = 0;
	}

	/* do error lock cleanup */
	if (errlck == UN_ERRLCK)
		ufsfx_unlockfs(ufsvfsp);

	else if (errlck == RE_ERRLCK)
		ufsfx_lockfs(ufsvfsp);

	/* don't allow error lock from user to invoke panic */
	else if (from_user && errlck == SET_ERRLCK &&
	    !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
		(void) ufs_fault(ufsvfsp->vfs_root,
		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
		    ulp->ul_lockfs.lf_comment : "user-applied error lock");

	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
		poll_events |= POLLERR;

	pollwakeup(&ufs_pollhd, poll_events);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (0);

errout:
	/*
	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
	 */
	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
		ulp->ul_fs_lock = (1 << lfs.lf_lock);
	}

	/*
	 * Don't call ufs_thaw() when there's a signal during
	 * ufs quiesce operation as it can lead to deadlock
	 * with getpage.
	 */
	if (signal == 0)
		(void) ufs_thaw(vfsp, ufsvfsp, ulp);

	ULOCKFS_CLR_BUSY(ulp);
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

errexit:
	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (error);
}

/*
 * fiolfss
 *	return the current file system locking state info
 */
int
ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
{
	struct ulockfs	*ulp;

	if (!vp || !vp->v_vfsp || !VTOI(vp))
		return (EINVAL);

	/* file system has been forcibly unmounted */
	if (VTOI(vp)->i_ufsvfs == NULL)
		return (EIO);

	ulp = VTOUL(vp);

	if (ULOCKFS_IS_HLOCK(ulp)) {
		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
		return (0);
	}

	mutex_enter(&ulp->ul_lock);

	*lockfsp = ulp->ul_lockfs;	/* structure assignment */

	if (ULOCKFS_IS_MOD(ulp))
		lockfsp->lf_flags |= LOCKFS_MOD;

	mutex_exit(&ulp->ul_lock);

	return (0);
}

/*
 * ufs_check_lockfs
 *	check whether a ufs_vnops call conflicts with the file system lock
 */
int
ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
{
	k_sigset_t	smask;
	int		sig, slock;

	ASSERT(MUTEX_HELD(&ulp->ul_lock));

	while (ulp->ul_fs_lock & mask) {
		slock = (int)ULOCKFS_IS_SLOCK(ulp);
		if ((curthread->t_flag & T_DONTPEND) && !slock) {
			curthread->t_flag |= T_WOULDBLOCK;
			return (EAGAIN);
		}
		curthread->t_flag &= ~T_WOULDBLOCK;

		/*
		 * In the case of an onerr umount of the fs, threads could
		 * have blocked before coming into ufs_check_lockfs and
		 * need to check for the special case of ELOCK and
		 * vfs_dontblock being set, which would indicate that the fs
		 * is on its way out and will not return, therefore making
		 * EIO the appropriate response.
		 */
		if (ULOCKFS_IS_HLOCK(ulp) ||
		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
			return (EIO);

		/*
		 * wait for lock status to change
		 */
		if (slock || ufsvfsp->vfs_nointr) {
			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
		} else {
			sigintr(&smask, 1);
			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
			sigunintr(&smask);
			if ((!sig && (ulp->ul_fs_lock & mask)) ||
			    ufsvfsp->vfs_dontblock)
				return (EINTR);
		}
	}

	if (mask & ULOCKFS_FWLOCK) {
		atomic_add_long(&ulp->ul_falloc_cnt, 1);
		ULOCKFS_SET_FALLOC(ulp);
	} else {
		atomic_add_long(&ulp->ul_vnops_cnt, 1);
	}

	return (0);
}
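
/*
 * Sketch of a typical ufs_check_lockfs() call, as made from
 * ufs_lockfs_begin() below: the caller holds ul_lock, passes the per-VOP
 * mask, and on success owns one reference on either ul_vnops_cnt or
 * ul_falloc_cnt (which ufs_lockfs_end() later drops):
 *
 *	mutex_enter(&ulp->ul_lock);
 *	error = ufs_check_lockfs(ufsvfsp, ulp, mask);
 *	mutex_exit(&ulp->ul_lock);
 */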

/*
 * Check whether we came in via the handcrafted lockfs protocol path. We
 * can't simply check for T_DONTBLOCK here as one might assume, since that
 * can also falsely catch recursive VOPs going to a different filesystem;
 * instead we check whether we already hold the ulockfs->ul_lock mutex.
 */
static int
ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
{
	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
}

/*
 * ufs_lockfs_begin - start the lockfs locking protocol
 */
int
ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
{
	int		error;
	int		rec_vop;
	ushort_t	op_cnt_incremented = 0;
	ulong_t		*ctr;
	struct ulockfs *ulp;
	ulockfs_info_t	*ulockfs_info;
	ulockfs_info_t	*ulockfs_info_free;
	ulockfs_info_t	*ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect recursive VOP call or handcrafted internal lockfs protocol
	 * path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 *
	 * Increment the ctr irrespective of the lockfs state. If the lockfs
	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
	 * before incrementing we need to check if there is a pending quiesce
	 * request, because if we have a continuous stream of ufs_lockfs_begin
	 * requests pounding on a few CPUs then the ufs_quiesce thread might
	 * never see the value of zero for ctr - a livelock kind of scenario.
	 */
	ctr = (mask & ULOCKFS_FWLOCK) ?
	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
	if (!ULOCKFS_IS_SLOCK(ulp)) {
		atomic_add_long(ctr, 1);
		op_cnt_incremented++;
	}

	/*
	 * If the lockfs state (indicated by ul_fs_lock) is not just
	 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
	 * where there is a check with an appropriate mask to selectively allow
	 * operations permitted for that kind of lockfs state.
	 *
	 * Even these selective operations should not be allowed to go through
	 * if a lockfs request is in progress, because that could result in
	 * inode modifications during a quiesce and could hence result in inode
	 * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient,
	 * so make use of ufs_quiesce_pend to disallow vnode operations when a
	 * quiesce is in progress.
	 */
	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		if (op_cnt_incremented)
			if (!atomic_add_long_nv(ctr, -1))
				cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
		mutex_exit(&ulp->ul_lock);
		if (error) {
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
	} else {
		/*
		 * This is the common case of a file system in an unlocked
		 * state.
		 *
		 * If a file system is unlocked, we would expect the ctr to
		 * have been incremented by now. But this will not be true when
		 * a quiesce is winding up - SLOCK was set when we checked
		 * before incrementing the ctr, but by the time we checked for
		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is
		 * okay to take ul_lock and go through the slow path in this
		 * uncommon case.
		 */
		if (op_cnt_incremented == 0) {
			mutex_enter(&ulp->ul_lock);
			error = ufs_check_lockfs(ufsvfsp, ulp, mask);
			if (error) {
				mutex_exit(&ulp->ul_lock);
				if (ulockfs_info_free == NULL)
					kmem_free(ulockfs_info_temp,
					    sizeof (ulockfs_info_t));
				return (error);
			}
			if (mask & ULOCKFS_FWLOCK)
				ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		} else if (mask & ULOCKFS_FWLOCK) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}
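
/*
 * Hedged usage sketch: a UFS vnode operation brackets its work with the
 * begin/end pair roughly as below. The mask (e.g. ULOCKFS_WRITE_MASK from
 * sys/fs/ufs_lockfs.h) is per-VOP; treat the exact mask here as
 * illustrative only:
 *
 *	struct ulockfs *ulp;
 *	int error;
 *
 *	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 *	if (error)
 *		return (error);
 *	... perform the vnode operation ...
 *	if (ulp)
 *		ufs_lockfs_end(ulp);
 */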

/*
 * Check whether we are returning from the top level VOP.
 */
static int
ufs_lockfs_top_vop_return(ulockfs_info_t *head)
{
	ulockfs_info_t *info;
	int result = 1;

	for (info = head; info != NULL; info = info->next) {
		if (info->ulp != NULL) {
			result = 0;
			break;
		}
	}

	return (result);
}

/*
 * ufs_lockfs_end - terminate the lockfs locking protocol
 */
void
ufs_lockfs_end(struct ulockfs *ulp)
{
	ulockfs_info_t *info;
	ulockfs_info_t *head;

	/*
	 * end-of-VOP protocol
	 */
	if (ulp == NULL)
		return;

	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * If we're called from a first level VOP, we have to have a
	 * valid ulockfs record in the TSD.
	 */
	ASSERT(info != NULL);

	/*
	 * Invalidate the ulockfs record.
	 */
	info->ulp = NULL;

	if (ufs_lockfs_top_vop_return(head))
		curthread->t_flag &= ~T_DONTBLOCK;

	/* fallocate thread */
	if (ULOCKFS_IS_FALLOC(ulp) && (info->flags & ULOCK_INFO_FALLOCATE)) {
		/* Clear the thread's fallocate state */
		info->flags &= ~ULOCK_INFO_FALLOCATE;
		if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_CLR_FALLOC(ulp);
			cv_broadcast(&ulp->ul_cv);
			mutex_exit(&ulp->ul_lock);
		}
	} else { /* normal thread */
		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
			cv_broadcast(&ulp->ul_cv);
	}
}

/*
 * ufs_lockfs_trybegin - try to start the lockfs locking protocol without
 * blocking.
 */
int
ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
{
	int		error = 0;
	int		rec_vop;
	ushort_t	op_cnt_incremented = 0;
	ulong_t		*ctr;
	struct ulockfs *ulp;
	ulockfs_info_t	*ulockfs_info;
	ulockfs_info_t	*ulockfs_info_free;
	ulockfs_info_t	*ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect recursive VOP call or handcrafted internal lockfs protocol
	 * path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 *
	 * Increment the ctr irrespective of the lockfs state. If the lockfs
	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
	 * before incrementing we need to check if there is a pending quiesce
	 * request, because if we have a continuous stream of ufs_lockfs_begin
	 * requests pounding on a few CPUs then the ufs_quiesce thread might
	 * never see the value of zero for ctr - a livelock kind of scenario.
	 */
	ctr = (mask & ULOCKFS_FWLOCK) ?
	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
	if (!ULOCKFS_IS_SLOCK(ulp)) {
		atomic_add_long(ctr, 1);
		op_cnt_incremented++;
	}

	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		/*
		 * Non-blocking version of the ufs_check_lockfs() code.
		 *
		 * If the file system is not hard locked or error locked
		 * and if ulp->ul_fs_lock allows this operation, increment
		 * the appropriate counter and proceed (for example, if the
		 * file system is delete locked, an mmap can still go through).
		 */
		if (op_cnt_incremented)
			if (!atomic_add_long_nv(ctr, -1))
				cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		if (ULOCKFS_IS_HLOCK(ulp) ||
		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
			error = EIO;
		else if (ulp->ul_fs_lock & mask)
			error = EAGAIN;

		if (error) {
			mutex_exit(&ulp->ul_lock);
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
		atomic_add_long(ctr, 1);
		if (mask & ULOCKFS_FWLOCK)
			ULOCKFS_SET_FALLOC(ulp);
		mutex_exit(&ulp->ul_lock);
	} else {
		/*
		 * This is the common case of a file system in an unlocked
		 * state.
		 *
		 * If a file system is unlocked, we would expect the ctr to
		 * have been incremented by now. But this will not be true when
		 * a quiesce is winding up - SLOCK was set when we checked
		 * before incrementing the ctr, but by the time we checked for
		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take
		 * ul_lock and go through the non-blocking version of the
		 * ufs_check_lockfs() code.
		 */
		if (op_cnt_incremented == 0) {
			mutex_enter(&ulp->ul_lock);
			if (ULOCKFS_IS_HLOCK(ulp) ||
			    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
				error = EIO;
			else if (ulp->ul_fs_lock & mask)
				error = EAGAIN;

			if (error) {
				mutex_exit(&ulp->ul_lock);
				if (ulockfs_info_free == NULL)
					kmem_free(ulockfs_info_temp,
					    sizeof (ulockfs_info_t));
				return (error);
			}
			atomic_add_long(ctr, 1);
			if (mask & ULOCKFS_FWLOCK)
				ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		} else if (mask & ULOCKFS_FWLOCK) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}

/*
 * specialized version of ufs_lockfs_begin() called by ufs_getpage().
 */
int
ufs_lockfs_begin_getpage(
	struct ufsvfs	*ufsvfsp,
	struct ulockfs	**ulpp,
	struct seg	*seg,
	int		read_access,
	uint_t		*protp)
{
	ulong_t			mask;
	int			error;
	int			rec_vop;
	struct ulockfs		*ulp;
	ulockfs_info_t		*ulockfs_info;
	ulockfs_info_t		*ulockfs_info_free;
	ulockfs_info_t		*ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect recursive VOP call or handcrafted internal lockfs protocol
	 * path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 */
	atomic_add_long(&ulp->ul_vnops_cnt, 1);
	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
			cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		if (seg->s_ops == &segvn_ops &&
		    ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
		} else if (protp && read_access) {
			/*
			 * Restrict the mapping to readonly.
			 * Writes to this mapping will cause
			 * another fault which will then
			 * be suspended if the fs is write locked
			 */
			*protp &= ~PROT_WRITE;
			mask = (ulong_t)ULOCKFS_GETREAD_MASK;
		} else
			mask = (ulong_t)ULOCKFS_GETWRITE_MASK;

		/*
		 * will sleep if this fs is locked against this VOP
		 */
		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
		mutex_exit(&ulp->ul_lock);
		if (error) {
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}

void
ufs_lockfs_tsd_destructor(void *head)
{
	ulockfs_info_t *curr = (ulockfs_info_t *)head;
	ulockfs_info_t *temp;

	for (; curr != NULL; ) {
		/*
		 * The TSD destructor is called when the thread exits
		 * (via thread_exit()). By that time the thread must have
		 * cleaned up all VOPs via ufs_lockfs_end(), and no valid
		 * ulockfs record may still exist while a thread is exiting.
		 */
		temp = curr;
		curr = curr->next;
		ASSERT(temp->ulp == NULL);
		kmem_free(temp, sizeof (ulockfs_info_t));
	}
}