/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/fs/ufs_fs.h>
#include <sys/cmn_err.h>

#ifdef _KERNEL

#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/debug.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/kmem.h>
#include <sys/policy.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/pvn.h>
#include <vm/seg_map.h>
#include <sys/swap.h>
#include <vm/seg_kmem.h>

#else  /* _KERNEL */

#define	ASSERT(x)		/* don't use asserts for fsck et al */

#endif  /* _KERNEL */

#ifdef _KERNEL

/*
 * Used to verify that a given entry on the ufs_instances list (see below)
 * still refers to a mounted file system.
 *
 * XXX:	This is a crock that substitutes for proper locking to coordinate
 *	updates to and uses of the entries in ufs_instances.
 */
struct check_node {
	struct vfs *vfsp;
	struct ufsvfs *ufsvfs;
	dev_t vfs_dev;
};

static vfs_t *still_mounted(struct check_node *);

/*
 * All ufs file system instances are linked together into a list starting at
 * ufs_instances.  The list is updated as part of mount and unmount.  It's
 * consulted in ufs_update, to allow syncing out all ufs file system instances
 * in a batch.
 *
 * ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
 * manipulated in ufs_funmount_cleanup.  (A given ufs instance is always on
 * exactly one of these lists except while it's being allocated or
 * deallocated.)
 */
struct ufsvfs	*ufs_instances;
extern kmutex_t		ufsvfs_mutex;	/* XXX: move this to ufs_inode.h? */

/*
 * ufsvfs list manipulation routines
 */

/*
 * Link ufsp in at the head of the list of ufs_instances.
 */
void
ufs_vfs_add(struct ufsvfs *ufsp)
{
	mutex_enter(&ufsvfs_mutex);
	ufsp->vfs_next = ufs_instances;
	ufs_instances = ufsp;
	mutex_exit(&ufsvfs_mutex);
}

/*
 * Remove ufsp from the list of ufs_instances.
 *
 * Does no error checking; ufsp is assumed to actually be on the list.
 */
void
ufs_vfs_remove(struct ufsvfs *ufsp)
{
	struct ufsvfs	**delpt = &ufs_instances;

	mutex_enter(&ufsvfs_mutex);
	for (; *delpt != NULL; delpt = &((*delpt)->vfs_next)) {
		if (*delpt == ufsp) {
			*delpt = ufsp->vfs_next;
			ufsp->vfs_next = NULL;
			break;
		}
	}
	mutex_exit(&ufsvfs_mutex);
}

/*
 * Clean up state resulting from a forcible unmount that couldn't be handled
 * directly during the unmount.  (See commentary in the unmount code for more
 * info.)
 */
static void
ufs_funmount_cleanup()
{
	struct ufsvfs		*ufsvfsp;
	extern struct ufsvfs	*oldufsvfslist, *ufsvfslist;

	/*
	 * Assumption: it's now safe to blow away the entries on
	 * oldufsvfslist.
	 */
	mutex_enter(&ufsvfs_mutex);
	while ((ufsvfsp = oldufsvfslist) != NULL) {
		oldufsvfslist = ufsvfsp->vfs_next;

		mutex_destroy(&ufsvfsp->vfs_lock);
		kmem_free(ufsvfsp, sizeof (struct ufsvfs));
	}
	/*
	 * Rotate more recent unmount entries into place in preparation for
	 * the next time around.
	 */
	oldufsvfslist = ufsvfslist;
	ufsvfslist = NULL;
	mutex_exit(&ufsvfs_mutex);
}


/*
 * ufs_update performs the ufs part of `sync'.  It goes through the disk
 * queues to initiate sandbagged IO; goes through the inodes to write
 * modified nodes; and it goes through the mount table to initiate
 * the writing of the modified super blocks.
 */
extern time_t	time;
time_t		ufs_sync_time;
time_t		ufs_sync_time_secs = 1;

extern kmutex_t	ufs_scan_lock;

void
ufs_update(int flag)
{
	struct vfs *vfsp;
	struct fs *fs;
	struct ufsvfs *ufsp;
	struct ufsvfs *ufsnext;
	struct ufsvfs *update_list = NULL;
	int check_cnt = 0;
	size_t check_size;
	struct check_node *check_list, *ptr;
	int cheap = flag & SYNC_ATTR;

	/*
	 * This is a hack.  A design flaw in the forced unmount protocol
	 * could allow a thread to attempt to use a kmem_freed ufsvfs
	 * structure in ufs_lockfs_begin/ufs_check_lockfs.  This window
	 * is difficult to hit, even during the lockfs stress tests.
	 * So the hacky fix is to wait awhile before kmem_free'ing the
	 * ufsvfs structures for forcibly unmounted file systems.  `Awhile'
	 * is defined as every other call from fsflush (~60 seconds).
	 */
	if (cheap)
		ufs_funmount_cleanup();

	/*
	 * Examine all ufsvfs structures and add those that we can lock to the
	 * update list.  This is so that we don't hold the list lock for a
	 * long time.  If vfs_lock fails for a file system instance, then skip
	 * it because somebody is doing an unmount on it.
	 */
	mutex_enter(&ufsvfs_mutex);
	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
		vfsp = ufsp->vfs_vfs;
		if (vfs_lock(vfsp) != 0)
			continue;
		ufsp->vfs_wnext = update_list;
		update_list = ufsp;
		check_cnt++;
	}
	mutex_exit(&ufsvfs_mutex);

	if (update_list == NULL)
		return;

	check_size = sizeof (struct check_node) * check_cnt;
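	/*
	 * The KM_NOSLEEP allocation may fail; a NULL check_list simply
	 * disables the STABLE checking below rather than failing the sync.
	 */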
	check_list = ptr = kmem_alloc(check_size, KM_NOSLEEP);

	/*
	 * Write back modified superblocks.
	 * Consistency check that the superblock of
	 * each file system is still in the buffer cache.
	 *
	 * Note that the update_list traversal is done without the protection
	 * of an overall list lock, so it's necessary to rely on the fact that
	 * each entry of the list is vfs_locked when moving from one entry to
	 * the next.  This works because a concurrent attempt to add an entry
	 * to another thread's update_list won't find it, since it'll already
	 * be locked.
	 */
	check_cnt = 0;
	for (ufsp = update_list; ufsp != NULL; ufsp = ufsnext) {
		/*
		 * Need to grab the next ptr before we unlock this one so
		 * another thread doesn't grab it and change it before we move
		 * on to the next vfs.  (Once we unlock it, it's ok if another
		 * thread finds it to add it to its own update_list; we don't
		 * attempt to refer to it through our list any more.)
		 */
		ufsnext = ufsp->vfs_wnext;
		vfsp = ufsp->vfs_vfs;

		/*
		 * Seems like this can't happen, so perhaps it should become
		 * an ASSERT(vfsp->vfs_data != NULL).
		 */
		if (!vfsp->vfs_data) {
			vfs_unlock(vfsp);
			continue;
		}

		fs = ufsp->vfs_fs;

		/*
		 * don't update a locked superblock during a panic; it
		 * may be in an inconsistent state
		 */
		if (panicstr) {
			if (!mutex_tryenter(&ufsp->vfs_lock)) {
				vfs_unlock(vfsp);
				continue;
			}
		} else
			mutex_enter(&ufsp->vfs_lock);
		/*
		 * Build up the STABLE check list, so we can unlock the vfs
		 * until we do the actual checking.
		 */
		if (check_list != NULL) {
			if ((fs->fs_ronly == 0) &&
			    (fs->fs_clean != FSBAD) &&
			    (fs->fs_clean != FSSUSPEND)) {
				ptr->vfsp = vfsp;
				ptr->ufsvfs = ufsp;
				ptr->vfs_dev = vfsp->vfs_dev;
				ptr++;
				check_cnt++;
			}
		}

		/*
		 * superblock is not modified
		 */
		if (fs->fs_fmod == 0) {
			mutex_exit(&ufsp->vfs_lock);
			vfs_unlock(vfsp);
			continue;
		}
		if (fs->fs_ronly != 0) {
			mutex_exit(&ufsp->vfs_lock);
			vfs_unlock(vfsp);
			(void) ufs_fault(ufsp->vfs_root,
			    "fs = %s update: ro fs mod\n", fs->fs_fsmnt);
			/*
			 * XXX:	Why is this a return instead of a continue?
			 *	This may be an attempt to replace a panic with
			 *	something less drastic, but there's cleanup we
			 *	should be doing that's not being done (e.g.,
			 *	unlocking the remaining entries on the list).
			 */
			return;
		}
		fs->fs_fmod = 0;
		mutex_exit(&ufsp->vfs_lock);
		TRANS_SBUPDATE(ufsp, vfsp, TOP_SBUPDATE_UPDATE);
		vfs_unlock(vfsp);
	}

	ufs_sync_time = time;

	/*
	 * Avoid racing with ufs_unmount() and ufs_sync().
	 */
	mutex_enter(&ufs_scan_lock);

	(void) ufs_scan_inodes(1, ufs_sync_inode, (void *)(uintptr_t)cheap,
	    NULL);

	mutex_exit(&ufs_scan_lock);

	/*
	 * Force stale buffer cache information to be flushed,
	 * for all devices.  This should cause any remaining control
	 * information (e.g., cg and inode info) to be flushed back.
	 */
	bflush((dev_t)NODEV);

	if (check_list == NULL)
		return;

	/*
	 * For each UFS filesystem in the STABLE check_list, update
	 * the clean flag if warranted.
	 */
	for (ptr = check_list; check_cnt > 0; check_cnt--, ptr++) {
		int	error;

		/*
		 * still_mounted() returns with vfsp and the vfs_reflock
		 * held if ptr refers to a vfs that is still mounted.
		 */
		if ((vfsp = still_mounted(ptr)) == NULL)
			continue;
		ufs_checkclean(vfsp);
		/*
		 * commit any outstanding async transactions
		 */
		ufsp = (struct ufsvfs *)vfsp->vfs_data;
		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_SYNC(ufsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE,
		    error);
		if (!error) {
			TRANS_END_SYNC(ufsp, error, TOP_COMMIT_UPDATE,
			    TOP_COMMIT_SIZE);
		}
		curthread->t_flag &= ~T_DONTBLOCK;

		vfs_unlock(vfsp);
	}

	kmem_free(check_list, check_size);
}

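/*
 * Sync a single inode; called via ufs_scan_inodes() from ufs_update().
 * `arg' carries the `cheap' flag from ufs_update(): when set, only
 * inodes with pending timestamp/attribute changes are pushed, and page
 * flushing is skipped.  Always returns 0 so that the scan continues.
 */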
int
ufs_sync_inode(struct inode *ip, void *arg)
{
	int cheap = (int)(uintptr_t)arg;
	struct ufsvfs *ufsvfsp;
	uint_t flag = ip->i_flag;

	if (cheap && ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) == 0))
		return (0);

	/*
	 * if we are panic'ing; then don't update the inode if this
	 * file system is FSSTABLE.  Otherwise, we would have to
	 * force the superblock to FSACTIVE and the superblock
	 * may not be in a good state.  Also, if the inode is
	 * IREF'ed then it may be in an inconsistent state.  Don't
	 * push it.  Finally, don't push the inode if the fs is
	 * logging; the transaction will be discarded at boot.
	 */
	if (panicstr) {

		if (flag & IREF)
			return (0);

		if (ip->i_ufsvfs == NULL ||
		    (ip->i_fs->fs_clean == FSSTABLE ||
		    ip->i_fs->fs_clean == FSLOG))
				return (0);
	}

	ufsvfsp = ip->i_ufsvfs;

	/*
	 * Limit access time only updates
	 */
	if (((flag & (IMOD|IMODACC|IUPD|ICHG|IACC)) == IMODACC) && ufsvfsp) {
		/*
		 * if file system has deferred access time turned on and there
		 * was no IO recently, don't bother flushing it. It will be
		 * flushed when I/Os start again.
		 */
		if (cheap && (ufsvfsp->vfs_dfritime & UFS_DFRATIME) &&
		    (ufsvfsp->vfs_iotstamp + ufs_iowait < ddi_get_lbolt()))
			return (0);
		/*
		 * an app issuing a sync() can take forever on a trans device
		 * when NetWorker or find is running because all of the
		 * directories' access times have to be updated. So, we limit
		 * the time we spend updating access times per sync.
		 */
		if (TRANS_ISTRANS(ufsvfsp) && ((ufs_sync_time +
		    ufs_sync_time_secs) < time))
			return (0);
	}

	/*
	 * if we are running on behalf of the flush thread or this is
	 * a swap file, then simply do a delay update of the inode.
	 * Otherwise, push the pages and then do a delayed inode update.
	 */
	if (cheap || IS_SWAPVP(ITOV(ip))) {
		TRANS_IUPDAT(ip, 0);
	} else {
		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_SYNC);
	}
	return (0);
}

/*
 * Flush all the pages associated with an inode using the given 'flags',
 * then force inode information to be written back using the given 'waitfor'.
 */
int
ufs_syncip(struct inode *ip, int flags, int waitfor, top_t topid)
{
	int	error;
	struct vnode *vp = ITOV(ip);
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	int dotrans = 0;

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return (EIO);
	/*
	 * don't need to VOP_PUTPAGE if there are no pages
	 */
	if (!vn_has_cached_data(vp) || vp->v_type == VCHR) {
		error = 0;
	} else {
		/*
		 * if the inode we're working on is a shadow inode
		 * or quota inode we need to make sure that the
		 * ufs_putpage call is inside a transaction as this
		 * could include meta data changes.
		 */
		if ((ip->i_mode & IFMT) == IFSHAD ||
		    ufsvfsp->vfs_qinod == ip) {
			dotrans = 1;
			curthread->t_flag |= T_DONTBLOCK;
			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE,
			    TOP_PUTPAGE_SIZE(ip));
		}
		error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
		    flags, CRED(), NULL);
		if (dotrans) {
			TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE,
			    TOP_PUTPAGE_SIZE(ip));
			curthread->t_flag &= ~T_DONTBLOCK;
			dotrans = 0;
		}
	}
	if (panicstr && TRANS_ISTRANS(ufsvfsp))
		goto out;
	/*
	 * waitfor represents two things -
	 * 1. whether this is a data sync or a file sync.
	 * 2. if a file sync, whether ufs_iupdat should wait for the disk i/o.
	 */
	if (waitfor == I_DSYNC) {
		/*
		 * If data sync, only IATTCHG (size/block change) requires
		 * inode update, fdatasync()/FDSYNC implementation.
		 */
		if (ip->i_flag & (IBDWRITE|IATTCHG)) {
			/*
			 * Enter a transaction to provide mutual exclusion
			 * with deltamap_push and avoid a race where
			 * the inode flush could get dropped.
			 */
			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
				dotrans = 1;
				curthread->t_flag |= T_DONTBLOCK;
				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
			}
			rw_enter(&ip->i_contents, RW_READER);
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IMODTIME;
			mutex_exit(&ip->i_tlock);
			ufs_iupdat(ip, 1);
			rw_exit(&ip->i_contents);
			if (dotrans) {
				TRANS_END_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
				curthread->t_flag &= ~T_DONTBLOCK;
			}
		}
	} else {
		/* For file sync, any inode change requires inode update */
		if (ip->i_flag & (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC)) {
			/*
			 * Enter a transaction to provide mutual exclusion
			 * with deltamap_push and avoid a race where
			 * the inode flush could get dropped.
			 */
			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
				dotrans = 1;
				curthread->t_flag |= T_DONTBLOCK;
				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
			}
			rw_enter(&ip->i_contents, RW_READER);
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IMODTIME;
			mutex_exit(&ip->i_tlock);
			ufs_iupdat(ip, waitfor);
			rw_exit(&ip->i_contents);
			if (dotrans) {
				TRANS_END_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
				curthread->t_flag &= ~T_DONTBLOCK;
			}
		}
	}

out:
	return (error);
}

/*
 * Flush all indirect blocks related to an inode.
 * Supports triple indirect blocks also.
 */
int
ufs_sync_indir(struct inode *ip)
{
	int i;
	daddr_t blkno;
	daddr_t lbn;	/* logical blkno of last blk in file */
	daddr_t clbn;	/* current logical blk */
	daddr32_t *bap;
	struct fs *fs;
	struct buf *bp;
	int bsize;
	struct ufsvfs *ufsvfsp;
	int j;
	daddr_t indirect_blkno;
	daddr32_t *indirect_bap;
	struct buf *indirect_bp;

	ufsvfsp = ip->i_ufsvfs;
	/*
	 * unnecessary when logging; allocation blocks are kept up-to-date
	 */
	if (TRANS_ISTRANS(ufsvfsp))
		return (0);

	fs = ufsvfsp->vfs_fs;
	bsize = fs->fs_bsize;
	lbn = (daddr_t)lblkno(fs, ip->i_size - 1);
	if (lbn < NDADDR)
		return (0);	/* No indirect blocks used */
	if (lbn < NDADDR + NINDIR(fs)) {
		/* File has one indirect block. */
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, ip->i_ib[0]));
		return (0);
	}

	/* Write out all the first level indirect blocks */
	for (i = 0; i < NIADDR; i++) {
		if ((blkno = ip->i_ib[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
	}
	/* Write out second level of indirect blocks */
	if ((blkno = ip->i_ib[1]) == 0)
		return (0);
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	bap = bp->b_un.b_daddr;
	clbn = NDADDR + NINDIR(fs);
	for (i = 0; i < NINDIR(fs); i++) {
		if (clbn > lbn)
			break;
		clbn += NINDIR(fs);
		if ((blkno = bap[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
	}

	brelse(bp);
	/* write out third level indirect blocks */

	if ((blkno = ip->i_ib[2]) == 0)
		return (0);

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	bap = bp->b_un.b_daddr;
	clbn = NDADDR + NINDIR(fs) + (NINDIR(fs) * NINDIR(fs));

	for (i = 0; i < NINDIR(fs); i++) {
		if (clbn > lbn)
			break;
		if ((indirect_blkno = bap[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, indirect_blkno));
		indirect_bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fs, indirect_blkno), bsize);
		if (indirect_bp->b_flags & B_ERROR) {
			brelse(indirect_bp);
			brelse(bp);
			return (EIO);
		}
		indirect_bap = indirect_bp->b_un.b_daddr;
		for (j = 0; j < NINDIR(fs); j++) {
			if (clbn > lbn)
				break;
			clbn += NINDIR(fs);
			if ((blkno = indirect_bap[j]) == 0)
				continue;
			blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
		}
		brelse(indirect_bp);
	}
	brelse(bp);

	return (0);
}

/*
 * Flush all indirect blocks related to an offset of a file.
 * read/write in sync mode may have to flush indirect blocks.
 */
int
ufs_indirblk_sync(struct inode *ip, offset_t off)
{
	daddr_t	lbn;
	struct	fs *fs;
	struct	buf *bp;
	int	i, j, shft;
	daddr_t	ob, nb, tbn;
	daddr32_t *bap;
	int	nindirshift, nindiroffset;
	struct ufsvfs *ufsvfsp;

	ufsvfsp = ip->i_ufsvfs;
	/*
	 * unnecessary when logging; allocation blocks are kept up-to-date
	 */
	if (TRANS_ISTRANS(ufsvfsp))
		return (0);

	fs = ufsvfsp->vfs_fs;

	lbn = (daddr_t)lblkno(fs, off);
	if (lbn < 0)
		return (EFBIG);

	/* The first NDADDR are direct so nothing to do */
	if (lbn < NDADDR)
		return (0);

	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;

	/* Determine level of indirect blocks */
	shft = 0;
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t	sh;

		shft += nindirshift;
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= (daddr_t)sh;
	}

	if (j == 0)
		return (EFBIG);

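	/*
	 * At this point NIADDR - j indexes i_ib[] (0 being the single
	 * indirect block) and tbn is the block's offset within that
	 * indirect tree.
	 */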
	if ((nb = ip->i_ib[NIADDR - j]) == 0)
		return (0);		/* UFS Hole */

	/* Flush first level indirect block */
	blkflush(ip->i_dev, fsbtodb(fs, nb));

	/* Fetch through next levels */
	for (; j < NIADDR; j++) {
		ob = nb;
		bp = UFS_BREAD(ufsvfsp,
		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			brelse(bp);
			return (EIO);
		}
		bap = bp->b_un.b_daddr;
		shft -= nindirshift;		/* sh / nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn /sh) & nindir */
		nb = bap[i];
		brelse(bp);
		if (nb == 0) {
			return (0);		/* UFS hole */
		}
		blkflush(ip->i_dev, fsbtodb(fs, nb));
	}
	return (0);
}

#ifdef DEBUG

/*
 * The bad block checking routines, ufs_indir_badblock() and ufs_badblock(),
 * are very expensive.  Profiling has shown that we spend 6-7% of our time
 * in ufs_badblock, and another 1-2% in ufs_indir_badblock.  They are only
 * called via ASSERTs (from debug kernels).  In addition, no failures have
 * been found in recent years, so the checks are disabled by default; set
 * the following tunable to enable them.
 */
int ufs_badblock_checks = 0;

/*
 * Check that a given indirect block contains blocks in range
 */
int
ufs_indir_badblock(struct inode *ip, daddr32_t *bap)
{
	int i;
	int err = 0;

	if (ufs_badblock_checks) {
		for (i = 0; i < NINDIR(ip->i_fs) - 1; i++)
			if (bap[i] != 0 && (err = ufs_badblock(ip, bap[i])))
				break;
	}
	return (err);
}

/*
 * Check that a specified block number is in range.
 */
int
ufs_badblock(struct inode *ip, daddr_t bn)
{
	long	c;
	daddr_t	sum;

	if (!ufs_badblock_checks)
		return (0);
	ASSERT(bn);
	if (bn <= 0 || bn > ip->i_fs->fs_size)
		return (bn);

	sum = 0;
	c = dtog(ip->i_fs, bn);
	if (c == 0) {
		sum = howmany(ip->i_fs->fs_cssize, ip->i_fs->fs_fsize);
	}
	/*
	 * The block is invalid if it is below this cylinder group, within
	 * the space reserved for the superblock, inodes, and summary data,
	 * or above this cylinder group.
	 * It's hard to see how we'd be outside this cyl, but let's be careful.
	 */
	if ((bn < cgbase(ip->i_fs, c)) ||
	    (bn >= cgsblock(ip->i_fs, c) && bn < cgdmin(ip->i_fs, c)+sum) ||
	    (bn >= (unsigned)cgbase(ip->i_fs, c+1)))
		return (bn);

	return (0);	/* not a bad block */
}

#endif /* DEBUG */

/*
 * When i_rwlock is write-locked or has a writer pending, the inode
 * is going to change in a way that will mark the filesystem as
 * active, so there is no need to let the filesystem be marked stable
 * now.  Also, to ensure filesystem consistency during directory
 * operations, the filesystem cannot be marked stable while the
 * i_rwlock of a directory inode is write-locked.
 */

/*
 * Check for busy inodes for this filesystem.
 * NOTE: Needs better way to do this expensive operation in the future.
 */
static void
ufs_icheck(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
{
	union  ihead	*ih;
	struct inode	*ip;
	int		i;
	int		isnottrans	= !TRANS_ISTRANS(ufsvfsp);
	int		isbusy		= *isbusyp;
	int		isreclaim	= *isreclaimp;

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0];
		    ip != (struct inode *)ih;
		    ip = ip->i_forw) {
			/*
			 * if inode is busy/modified/deleted, filesystem is busy
			 */
			if (ip->i_ufsvfs != ufsvfsp)
				continue;
			if ((ip->i_flag & (IMOD | IUPD | ICHG)) ||
			    (RW_ISWRITER(&ip->i_rwlock)))
				isbusy = 1;
			if ((ip->i_nlink <= 0) && (ip->i_flag & IREF))
				isreclaim = 1;
			if (isbusy && (isreclaim || isnottrans))
				break;
		}
		mutex_exit(&ih_lock[i]);
		if (isbusy && (isreclaim || isnottrans))
			break;
	}
	*isbusyp = isbusy;
	*isreclaimp = isreclaim;
}

/*
 * As part of the ufs 'sync' operation, this routine is called to mark
 * the filesystem as STABLE if there is no modified metadata in memory.
 */
void
ufs_checkclean(struct vfs *vfsp)
{
	struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
	struct fs	*fs		= ufsvfsp->vfs_fs;
	int		isbusy;
	int		isreclaim;
	int		updatesb;

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * filesystem is stable or cleanflag processing is disabled; do nothing
	 *	no transitions when panic'ing
	 */
	if (fs->fs_ronly ||
	    fs->fs_clean == FSBAD ||
	    fs->fs_clean == FSSUSPEND ||
	    fs->fs_clean == FSSTABLE ||
	    panicstr)
		return;

	/*
	 * if logging and nothing to reclaim; do nothing
	 */
	if ((fs->fs_clean == FSLOG) &&
	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
	    (fs->fs_reclaim & FS_RECLAIMING)))
		return;

	/*
	 * FS_CHECKCLEAN is reset if the file system goes dirty
	 * FS_CHECKRECLAIM is reset if a file gets deleted
	 */
	mutex_enter(&ufsvfsp->vfs_lock);
	fs->fs_reclaim |= (FS_CHECKCLEAN | FS_CHECKRECLAIM);
	mutex_exit(&ufsvfsp->vfs_lock);

	updatesb = 0;

	/*
	 * if logging or buffers are busy; do nothing
	 */
	isbusy = isreclaim = 0;
	if ((fs->fs_clean == FSLOG) ||
	    (bcheck(vfsp->vfs_dev, ufsvfsp->vfs_bufp)))
		isbusy = 1;

	/*
	 * isreclaim == TRUE means can't change the state of fs_reclaim
	 */
	isreclaim =
	    ((fs->fs_clean == FSLOG) &&
	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
	    (fs->fs_reclaim & FS_RECLAIMING)));

	/*
	 * if fs is busy or can't change the state of fs_reclaim; do nothing
	 */
	if (isbusy && isreclaim)
		return;

	/*
	 * look for busy or deleted inodes; (deleted == needs reclaim)
	 */
	ufs_icheck(ufsvfsp, &isbusy, &isreclaim);

	mutex_enter(&ufsvfsp->vfs_lock);

	/*
	 * IF POSSIBLE, RESET RECLAIM
	 */
	/*
	 * the reclaim thread is not running
	 */
	if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
		/*
		 * no files were deleted during the scan
		 */
		if (fs->fs_reclaim & FS_CHECKRECLAIM)
			/*
			 * no deleted files were found in the inode cache
			 */
			if ((isreclaim == 0) && (fs->fs_reclaim & FS_RECLAIM)) {
				fs->fs_reclaim &= ~FS_RECLAIM;
				updatesb = 1;
			}
	/*
	 * IF POSSIBLE, SET STABLE
	 */
	/*
	 * not logging
	 */
	if (fs->fs_clean != FSLOG)
		/*
		 * file system has not gone dirty since the scan began
		 */
		if (fs->fs_reclaim & FS_CHECKCLEAN)
			/*
			 * nothing dirty was found in the buffer or inode cache
			 */
			if ((isbusy == 0) && (isreclaim == 0) &&
			    (fs->fs_clean != FSSTABLE)) {
				fs->fs_clean = FSSTABLE;
				updatesb = 1;
			}

	mutex_exit(&ufsvfsp->vfs_lock);
	if (updatesb) {
		TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
	}
}

/*
 * called whenever an unlink occurs
 */
void
ufs_setreclaim(struct inode *ip)
{
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	struct fs	*fs		= ufsvfsp->vfs_fs;

	if (ip->i_nlink || fs->fs_ronly || (fs->fs_clean != FSLOG))
		return;

	/*
	 * reclaim-needed bit is already set or we need to tell
	 * ufs_checkclean that a file has been deleted
	 */
	if ((fs->fs_reclaim & (FS_RECLAIM | FS_CHECKRECLAIM)) == FS_RECLAIM)
		return;

	mutex_enter(&ufsvfsp->vfs_lock);
	/*
	 * inform ufs_checkclean that the file system has gone dirty
	 */
	fs->fs_reclaim &= ~FS_CHECKRECLAIM;

	/*
	 * set the reclaim-needed bit
	 */
	if ((fs->fs_reclaim & FS_RECLAIM) == 0) {
		fs->fs_reclaim |= FS_RECLAIM;
		ufs_sbwrite(ufsvfsp);
	}
	mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * Before any modified metadata is written back to the disk, this routine
 * is called to mark the filesystem as ACTIVE.
 */
void
ufs_notclean(struct ufsvfs *ufsvfsp)
{
	struct fs *fs = ufsvfsp->vfs_fs;

	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));

	/*
	 * inform ufs_checkclean that the file system has gone dirty
	 */
	fs->fs_reclaim &= ~FS_CHECKCLEAN;

	/*
	 * ignore if active or bad or suspended or readonly or logging
	 */
	if ((fs->fs_clean == FSACTIVE) || (fs->fs_clean == FSLOG) ||
	    (fs->fs_clean == FSBAD) || (fs->fs_clean == FSSUSPEND) ||
	    (fs->fs_ronly)) {
		mutex_exit(&ufsvfsp->vfs_lock);
		return;
	}
	fs->fs_clean = FSACTIVE;
	/*
	 * write superblock synchronously
	 */
	ufs_sbwrite(ufsvfsp);
	mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * ufs specific fbwrite()
 */
int
ufs_fbwrite(struct fbuf *fbp, struct inode *ip)
{
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;

	if (TRANS_ISTRANS(ufsvfsp))
		return (fbwrite(fbp));
	mutex_enter(&ufsvfsp->vfs_lock);
	ufs_notclean(ufsvfsp);
	return ((ufsvfsp->vfs_dio) ? fbdwrite(fbp) : fbwrite(fbp));
}

/*
 * ufs specific fbiwrite()
 */
int
ufs_fbiwrite(struct fbuf *fbp, struct inode *ip, daddr_t bn, long bsize)
{
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	o_mode_t	ifmt		= ip->i_mode & IFMT;
	buf_t		*bp;
	int		error;

	mutex_enter(&ufsvfsp->vfs_lock);
	ufs_notclean(ufsvfsp);
	if (ifmt == IFDIR || ifmt == IFSHAD || ifmt == IFATTRDIR ||
	    (ip->i_ufsvfs->vfs_qinod == ip)) {
		TRANS_DELTA(ufsvfsp, ldbtob(bn * (offset_t)(btod(bsize))),
		    fbp->fb_count, DT_FBI, 0, 0);
	}
	/*
	 * Inlined version of fbiwrite()
	 */
	bp = pageio_setup((struct page *)NULL, fbp->fb_count,
	    ip->i_devvp, B_WRITE);
	bp->b_flags &= ~B_PAGEIO;
	bp->b_un.b_addr = fbp->fb_addr;

	bp->b_blkno = bn * btod(bsize);
	bp->b_dev = cmpdev(ip->i_dev);	/* store in old dev format */
	bp->b_edev = ip->i_dev;
	bp->b_proc = NULL;			/* i.e. the kernel */
	bp->b_file = ip->i_vnode;
	bp->b_offset = -1;

	if (ufsvfsp->vfs_log) {
		lufs_write_strategy(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_fbiwrites.value.ul++;
		(void) bdev_strategy(bp);
		lwp_stat_update(LWP_STAT_OUBLK, 1);
	}
	error = biowait(bp);
	pageio_done(bp);
	fbrelse(fbp, S_OTHER);
	return (error);
}

/*
 * Write the ufs superblock only.
 */
void
ufs_sbwrite(struct ufsvfs *ufsvfsp)
{
	char sav_fs_fmod;
	struct fs *fs = ufsvfsp->vfs_fs;
	struct buf *bp = ufsvfsp->vfs_bufp;

	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));

	/*
	 * for ulockfs processing, limit the superblock writes
	 */
	if ((ufsvfsp->vfs_ulockfs.ul_sbowner) &&
	    (curthread != ufsvfsp->vfs_ulockfs.ul_sbowner)) {
		/* try again later */
		fs->fs_fmod = 1;
		return;
	}

	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
	/*
	 * update superblock timestamp and fs_clean checksum
	 * if marked FSBAD, we always want an erroneous
	 * checksum to force repair
	 */
	fs->fs_time = gethrestime_sec();
	fs->fs_state = (fs->fs_clean != FSBAD) ?
	    FSOKAY - fs->fs_time : -(FSOKAY - fs->fs_time);
	switch (fs->fs_clean) {
	case FSCLEAN:
	case FSSTABLE:
		fs->fs_reclaim &= ~FS_RECLAIM;
		break;
	case FSACTIVE:
	case FSSUSPEND:
	case FSBAD:
	case FSLOG:
		break;
	default:
		fs->fs_clean = FSACTIVE;
		break;
	}
	/*
	 * reset incore only bits
	 */
	fs->fs_reclaim &= ~(FS_CHECKCLEAN | FS_CHECKRECLAIM);

	/*
	 * delta the whole superblock
	 */
	TRANS_DELTA(ufsvfsp, ldbtob(SBLOCK), sizeof (struct fs),
	    DT_SB, NULL, 0);
	/*
	 * retain the incore state of fs_fmod; set the ondisk state to 0
	 */
	sav_fs_fmod = fs->fs_fmod;
	fs->fs_fmod = 0;

	/*
	 * Don't release the buffer after it has been written to the disk
	 */
	UFS_BWRITE2(ufsvfsp, bp);
	fs->fs_fmod = sav_fs_fmod;	/* reset fs_fmod's incore state */
}

/*
 * Returns the vfs pointer if the vfs is still mounted, with the vfs lock
 * held.  Otherwise, returns NULL.
 *
 * For our purposes, "still mounted" means that the file system still appears
 * on the list of UFS file system instances.
 */
static vfs_t *
still_mounted(struct check_node *checkp)
{
	struct vfs	*vfsp;
	struct ufsvfs	*ufsp;

	mutex_enter(&ufsvfs_mutex);
	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
		if (ufsp != checkp->ufsvfs)
			continue;
		/*
		 * Tentative match:  verify it and try to lock.  (It's not at
		 * all clear how the verification could fail, given that we've
		 * gotten this far.  We would have had to reallocate the
		 * ufsvfs struct at hand for a new incarnation; is that really
		 * possible in the interval from constructing the check_node
		 * to here?)
		 */
		vfsp = ufsp->vfs_vfs;
		if (vfsp != checkp->vfsp)
			continue;
		if (vfsp->vfs_dev != checkp->vfs_dev)
			continue;
		if (vfs_lock(vfsp) != 0)
			continue;

		mutex_exit(&ufsvfs_mutex);
		return (vfsp);
	}
	mutex_exit(&ufsvfs_mutex);
	return (NULL);
}

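/*
 * I/O completion callback for the private summary-info buffers used
 * below: it just wakes the thread waiting in sema_p(&bp->b_io).
 */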
int
ufs_si_io_done(struct buf *bp)
{
	sema_v(&bp->b_io);
	return (0);
}

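/*
 * Geometry of the buffers used by ufs_construct_si(): each private
 * buffer holds one cylinder group (rounded up to a disk block), and
 * up to NSIBUF reads are kept in flight at a time.
 */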
#define	SI_BUFSZ roundup(sizeof (struct cg), DEV_BSIZE)
#define	NSIBUF 32

/*
 * ufs_construct_si()
 * Read each cylinder group in turn and construct the summary information
 */
static int
ufs_construct_si(dev_t dev, struct fs *fs, struct ufsvfs *ufsvfsp)
{
	buf_t *bps, *bp;
	char *bufs;
	struct csum *sip = fs->fs_u.fs_csp;
	struct cg *cgp;
	int i, ncg;
	int error = 0, cg = 0;

	bps = kmem_alloc(NSIBUF * sizeof (buf_t), KM_SLEEP);
	bufs = kmem_alloc(NSIBUF * SI_BUFSZ, KM_SLEEP);

	/*
	 * Initialise the buffer headers
	 */
	for (bp = bps, i = 0; i < NSIBUF; i++, bp++) {
		bioinit(bp);
		bp->b_iodone = ufs_si_io_done;
		bp->b_bufsize = bp->b_bcount = SI_BUFSZ;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = bufs + (i * SI_BUFSZ);
		bp->b_edev = dev;
	}

	/*
	 * Repeat while there are cylinder groups left to read.
	 */
	do {
		/*
		 * Issue up to NSIBUF asynchronous reads
		 */
		ncg = MIN(NSIBUF, (fs->fs_ncg - cg));
		for (bp = bps, i = 0; i < ncg; i++, bp++) {
			bp->b_blkno = (daddr_t)fsbtodb(fs, cgtod(fs, cg + i));
			if (ufsvfsp->vfs_log) {
				lufs_read_strategy(ufsvfsp->vfs_log, bp);
			} else {
				(void) bdev_strategy(bp);
			}
		}

		/*
		 * wait for each read to finish;
		 * check for errors and copy the csum info
		 */
		for (bp = bps, i = 0; i < ncg; i++, bp++) {
			sema_p(&bp->b_io);
			if (!error) {
				cgp = bp->b_un.b_cg;
				sip[cg + i] = cgp->cg_cs;
				error = geterror(bp);
			}
		}
		if (error) {
			goto err;
		}
		cg += ncg;
	} while (cg < fs->fs_ncg);

err:
	kmem_free(bps, NSIBUF * sizeof (buf_t));
	kmem_free(bufs, NSIBUF * SI_BUFSZ);
	return (error);
}

/*
 * ufs_getsummaryinfo
 * Read the cylinder group summary information for a file system: from
 * the summary area on disk, or, if that is marked bad, by reconstructing
 * it from the cylinder groups themselves.
 */
int
ufs_getsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
	int		i;		/* `for' loop counter */
	ssize_t		size;		/* bytes of summary info to read */
	daddr_t		frags;		/* frags of summary info to read */
	caddr_t		sip;		/* summary info */
	struct buf	*tp;		/* tmp buf */

	/*
	 * maintain metadata map for trans device (debug only)
	 */
	TRANS_MATA_SI(ufsvfsp, fs);

	/*
	 * Compute #frags and allocate space for summary info
	 */
	frags = howmany(fs->fs_cssize, fs->fs_fsize);
	sip = kmem_alloc((size_t)fs->fs_cssize, KM_SLEEP);
	fs->fs_u.fs_csp = (struct csum *)sip;

	if (fs->fs_si == FS_SI_BAD) {
		/*
		 * The summary information is unknown, read it in from
		 * the cylinder groups.
		 */
		if (TRANS_ISTRANS(ufsvfsp) && !TRANS_ISERROR(ufsvfsp) &&
		    ufsvfsp->vfs_log->un_logmap) {
			logmap_roll_dev(ufsvfsp->vfs_log); /* flush the log */
		}
		bzero(sip, (size_t)fs->fs_cssize);
		if (ufs_construct_si(dev, fs, ufsvfsp)) {
			kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
			fs->fs_u.fs_csp = NULL;
			return (EIO);
		}
	} else {
		/* Read summary info a fs block at a time */
		size = fs->fs_bsize;
		for (i = 0; i < frags; i += fs->fs_frag) {
			if (i + fs->fs_frag > frags)
				/*
				 * This happens only on the last iteration, so
				 * don't worry about size being reset
				 */
				size = (frags - i) * fs->fs_fsize;
			tp = UFS_BREAD(ufsvfsp, dev,
			    (daddr_t)fsbtodb(fs, fs->fs_csaddr+i), size);
			tp->b_flags |= B_STALE | B_AGE;
			if (tp->b_flags & B_ERROR) {
				kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
				fs->fs_u.fs_csp = NULL;
				brelse(tp);
				return (EIO);
			}
			bcopy(tp->b_un.b_addr, sip, size);
			sip += size;
			brelse(tp);
		}
	}
	bzero((caddr_t)&fs->fs_cstotal, sizeof (fs->fs_cstotal));
	for (i = 0; i < fs->fs_ncg; ++i) {
		fs->fs_cstotal.cs_ndir += fs->fs_cs(fs, i).cs_ndir;
		fs->fs_cstotal.cs_nbfree += fs->fs_cs(fs, i).cs_nbfree;
		fs->fs_cstotal.cs_nifree += fs->fs_cs(fs, i).cs_nifree;
		fs->fs_cstotal.cs_nffree += fs->fs_cs(fs, i).cs_nffree;
	}
	return (0);
}

/*
 * ufs_putsummaryinfo() stores all the cylinder group summary information.
 * This is only used when logging, but the file system may not
 * be logging at the time, e.g. a read-only mount to flush the log
 * may push the summary info out.
 */
int
ufs_putsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
	struct buf	b, *bp;		/* tmp buf */
	caddr_t		sip;		/* summary info */
	ssize_t		size;		/* bytes of summary info to write */
	daddr_t		frags;		/* frags of summary info to write */
	int		i;		/* `for' loop counter */
	int		error;		/* error */

	if (TRANS_ISERROR(ufsvfsp)) {
		return (EIO);
	}

	if ((fs->fs_si != FS_SI_BAD) || !ufsvfsp->vfs_nolog_si) {
		return (0);
	}

	bp = &b;
	bioinit(bp);
	bp->b_iodone = ufs_si_io_done;
	bp->b_bufsize = size = fs->fs_bsize;
	bp->b_flags = B_WRITE;
	bp->b_un.b_addr = kmem_alloc(size, KM_SLEEP);
	bp->b_edev = dev;
	frags = howmany(fs->fs_cssize, fs->fs_fsize);
	sip = (caddr_t)fs->fs_u.fs_csp;

	/* Write summary info one fs block at a time */
	for (error = 0, i = 0; (i < frags) && (error == 0); i += fs->fs_frag) {
		if (i + fs->fs_frag > frags) {
			/*
			 * This happens only on the last iteration, so
			 * don't worry about size being reset
			 */
			size = (frags - i) * fs->fs_fsize;
		}
		bcopy(sip, bp->b_un.b_addr, size);
		bp->b_blkno = (daddr_t)fsbtodb(fs, fs->fs_csaddr+i);
		bp->b_bcount = size;
		(void) bdev_strategy(bp);
		sema_p(&bp->b_io); /* wait for write to complete */
		error = geterror(bp);
		sip += size;
	}
	kmem_free(bp->b_un.b_addr, fs->fs_bsize);
	if (!error) {
		fs->fs_si = FS_SI_OK;
	}
	return (error);
}

/*
 * Decide whether it is okay to remove within a sticky directory.
 * Write access to the directory is needed, but in sticky directories
 * write access alone is not sufficient: you can remove entries from
 * a directory only if you own the directory, if you are privileged,
 * if you own the entry, or if the entry is a plain file and you have
 * write access to that file.
 * Function returns 0 if remove access is granted.
 * Note, the caller is responsible for holding the i_contents lock
 * at least as reader on the inquired inode 'ip'.
 */
int
ufs_sticky_remove_access(struct inode *dp, struct inode *ip, struct cred *cr)
{
	uid_t uid;

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	if ((dp->i_mode & ISVTX) &&
	    (uid = crgetuid(cr)) != dp->i_uid &&
	    uid != ip->i_uid &&
	    ((ip->i_mode & IFMT) != IFREG ||
	    ufs_iaccess(ip, IWRITE, cr, 0) != 0))
		return (secpolicy_vnode_remove(cr));

	return (0);
}
#endif	/* _KERNEL */

extern	int around[9];
extern	int inside[9];
extern	uchar_t *fragtbl[];

/*
 * Update the frsum fields to reflect addition or deletion
 * of some frags.
 */
void
fragacct(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
{
	int inblk;
	int field, subfield;
	int siz, pos;

	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
	fragmap <<= 1;
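	/*
	 * For each run size `siz', around[siz] is a mask and inside[siz]
	 * is the matching pattern for a run of exactly `siz' free frags
	 * bounded by allocated frags; the shifts above supply guard bits
	 * so that runs at the edges of the map are bounded as well.
	 */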
	for (siz = 1; siz < fs->fs_frag; siz++) {
		if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
			continue;
		field = around[siz];
		subfield = inside[siz];
		for (pos = siz; pos <= fs->fs_frag; pos++) {
			if ((fragmap & field) == subfield) {
				fraglist[siz] += cnt;
				ASSERT(fraglist[siz] >= 0);
				pos += siz;
				field <<= siz;
				subfield <<= siz;
			}
			field <<= 1;
			subfield <<= 1;
		}
	}
}

/*
 * Block operations
 */
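
/*
 * In the fragment maps operated on below, a set bit denotes a free
 * fragment; a "block" is fs_frag consecutive fragment bits, so a whole
 * block is free only when all of its bits are set.
 */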

/*
 * Check if a block is available
 */
int
isblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	uchar_t mask;

	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		return (cp[h] == 0xff);
	case 4:
		mask = 0x0f << ((h & 0x1) << 2);
		return ((cp[h >> 1] & mask) == mask);
	case 2:
		mask = 0x03 << ((h & 0x3) << 1);
		return ((cp[h >> 2] & mask) == mask);
	case 1:
		mask = 0x01 << (h & 0x7);
		return ((cp[h >> 3] & mask) == mask);
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "isblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		return (0);
	}
}

/*
 * Take a block out of the map
 */
void
clrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		cp[h] = 0;
		return;
	case 4:
		cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
		return;
	case 2:
		cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
		return;
	case 1:
		cp[h >> 3] &= ~(0x01 << (h & 0x7));
		return;
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "clrblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		return;
	}
}

/*
 * Is block allocated?
 */
int
isclrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	uchar_t	mask;
	int	frag;
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	frag = fs->fs_frag;
	ASSERT(frag == 8 || frag == 4 || frag == 2 || frag == 1);
	switch (frag) {
	case 8:
		return (cp[h] == 0);
	case 4:
		mask = ~(0x0f << ((h & 0x1) << 2));
		return (cp[h >> 1] == (cp[h >> 1] & mask));
	case 2:
		mask = ~(0x03 << ((h & 0x3) << 1));
		return (cp[h >> 2] == (cp[h >> 2] & mask));
	case 1:
		mask = ~(0x01 << (h & 0x7));
		return (cp[h >> 3] == (cp[h >> 3] & mask));
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "isclrblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		break;
	}
	return (0);
}

/*
 * Put a block into the map
 */
void
setblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		cp[h] = 0xff;
		return;
	case 4:
		cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
		return;
	case 2:
		cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
		return;
	case 1:
		cp[h >> 3] |= (0x01 << (h & 0x7));
		return;
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "setblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		return;
	}
}

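/*
 * Skip over characters equal to `c'.  Returns the number of characters
 * remaining, starting at the first non-matching character, or 0 if all
 * `len' characters matched.
 */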
int
skpc(char c, uint_t len, char *cp)
{
	if (len == 0)
		return (0);
	while (*cp++ == c && --len)
		;
	return (len);
}