/* ufs_inode.c, revision 63:25e4c130753b */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <fs/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;		/* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct	instats ins = {
	{ "size",		KSTAT_DATA_ULONG },
	{ "maxsize",		KSTAT_DATA_ULONG },
	{ "hits",		KSTAT_DATA_ULONG },
	{ "misses",		KSTAT_DATA_ULONG },
	{ "kmem allocs",	KSTAT_DATA_ULONG },
	{ "kmem frees",		KSTAT_DATA_ULONG },
	{ "maxsize reached",	KSTAT_DATA_ULONG },
	{ "puts at frontlist",	KSTAT_DATA_ULONG },
	{ "puts at backlist",	KSTAT_DATA_ULONG },
	{ "queues to free",	KSTAT_DATA_ULONG },
	{ "scans",		KSTAT_DATA_ULONG },
	{ "thread idles",	KSTAT_DATA_ULONG },
	{ "lookup idles",	KSTAT_DATA_ULONG },
	{ "vget idles",		KSTAT_DATA_ULONG },
	{ "cache allocs",	KSTAT_DATA_ULONG },
	{ "cache frees",	KSTAT_DATA_ULONG },
	{ "pushes at close",	KSTAT_DATA_ULONG }
};
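
/*
 * These counters back the "inode_cache" named kstat installed in
 * ufs_iinit() below, so they can be inspected from userland with,
 * for example, kstat -m ufs -n inode_cache.
 */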

/* kstat data */
static kstat_t		*ufs_inode_kstat = NULL;

union ihead *ihead;	/* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;	/* protect inode cache hash table */
static int ino_hashlen = 4;	/* desired average hash chain length */
int inohsz;		/* number of buckets in the hash table */

kmutex_t	ufs_scan_lock;	/* stop racing multiple ufs_scan_inodes() */
kmutex_t	ufs_iuniqtime_lock; /* protect iuniqtime */
kmutex_t	ufsvfs_mutex;
struct ufsvfs	*oldufsvfslist, *ufsvfslist;

/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t	ufs_iowait;

/*
 * The threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufs_iinit().
 * These values can be no less than the minimum shown below.
 */
int	ufs_idle_max;	/* # of allowable idle inodes */
ulong_t	ufs_inode_max;	/* hard limit of allowable idle inodes */
#define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
#define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
int	ufs_HW = UFS_HW_DEFAULT;
int	ufs_LW = UFS_LW_DEFAULT;
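
/*
 * Tuning sketch (illustrative values, not a recommendation): both water
 * marks can be overridden from /etc/system, e.g.
 *
 *	set ufs:ufs_HW = 0x2000000
 *	set ufs:ufs_LW = 0x1000000
 *
 * ufs_iinit() below rejects any setting with ufs_HW <= ufs_LW and falls
 * back to the defaults.
 */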

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
    struct cred *, int);

/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_alloc");
	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_free");
	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "alloc");
	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "free");
	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_inuse");
	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_max");
	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

	return (0);
}

void
ufs_iinit(void)
{
	/*
	 * Validate that ufs_HW > ufs_LW.
	 * The default values for these two tunables have been increased.
	 * There is now a range of values for ufs_HW that used to be
	 * legal on previous Solaris versions but no longer is now.
	 * Upgrading a machine which has an /etc/system setting for ufs_HW
	 * from that range can lead to filesystem hangs unless the values
	 * are checked here.
	 */
	if (ufs_HW <= ufs_LW) {
		cmn_err(CE_WARN,
			    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
			    ufs_HW, ufs_LW);
		ufs_LW = UFS_LW_DEFAULT;
		ufs_HW = UFS_HW_DEFAULT;
		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
			    ufs_HW, ufs_LW);
	}

	/*
	 * Adjust the tunable `ufs_ninode' to a reasonable value
	 */
	if (ufs_ninode <= 0)
		ufs_ninode = ncsize;
	if (ufs_inode_max == 0)
		ufs_inode_max = (ulong_t)((kmem_maxavail() >> 2) /
					sizeof (struct inode));
	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
				ufs_inode_max);
		ufs_ninode = ufs_inode_max;
	}
	/*
	 * Wait till third call of ufs_update to declare that no I/Os are
	 * going on. This allows deferred access times to be flushed to disk.
	 */
	ufs_iowait = v.v_autoup * hz * 2;
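
	/*
	 * With the stock autoup of 30 seconds (an assumption; autoup is
	 * itself an /etc/system tunable), this evaluates to 60 seconds'
	 * worth of clock ticks beyond vfs_iotstamp.
	 */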

	/*
	 * idle thread runs when 25% of ufs_ninode entries are on the queue
	 */
	if (ufs_idle_max == 0)
		ufs_idle_max = ufs_ninode >> 2;
	if (ufs_idle_max < UFS_IDLE_MAX)
		ufs_idle_max = UFS_IDLE_MAX;
	if (ufs_idle_max > ufs_ninode)
		ufs_idle_max = ufs_ninode;
	/*
	 * This is really a misnomer, it is ufs_queue_init
	 */
	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

	/*
	 * global hlock thread
	 */
	ufs_thread_init(&ufs_hlock, 1);
	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

	ihinit();
	qtinit();
	ins.in_maxsize.value.ul = ufs_ninode;
	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ufs_inode_kstat->ks_data = (void *)&ins;
		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
		kstat_install(ufs_inode_kstat);
	}
	ufsfx_init();		/* fix-on-panic initialization */
	si_cache_init();
	ufs_directio_init();
	lufs_init();
	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct inode *ip = buf;
	struct vnode *vp;

	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
	dnlc_dir_init(&ip->i_danchor);

	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

	vp = vn_alloc(KM_SLEEP);
	ip->i_vnode = vp;

	vn_setops(vp, ufs_vnodeops);
	vp->v_data = (caddr_t)ip;

	return (0);
}
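
/*
 * Because these are kmem cache constructor/destructor routines, the
 * locks, condition variable, dnlc anchor and attached vnode set up
 * above persist across kmem_cache_free()/kmem_cache_alloc() cycles;
 * ufs_alloc_inode() below can therefore hand out a cached inode
 * without reinitializing any of them.
 */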

/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ITOV(ip);

	rw_destroy(&ip->i_rwlock);
	rw_destroy(&ip->i_contents);

	mutex_destroy(&ip->i_tlock);
	if (vp->v_type == VDIR) {
		dnlc_dir_fini(&ip->i_danchor);
	}

	cv_destroy(&ip->i_wrcv);

	vn_free(vp);
}

/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
	int i;
	union	ihead *ih = ihead;

	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

	for (i = 0, ih = ihead; i < inohsz; i++,  ih++) {
		ih->ih_head[0] = ih;
		ih->ih_head[1] = ih;
		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}
	inode_cache = kmem_cache_create("ufs_inode_cache",
		sizeof (struct inode), 0, ufs_inode_cache_constructor,
		ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
		NULL, NULL, 0);
}
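
/*
 * Sizing sketch (figures illustrative): 1 << highbit(n) is the power of
 * two just above n, so with ufs_ninode = 131072 and the default
 * ino_hashlen of 4, highbit(32768) is 16 and inohsz becomes 65536 --
 * an average chain length nearer 2 than the desired 4, thanks to the
 * power-of-two round-up.
 */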

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
	vn_invalid(ITOV(ip));
	kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
	struct inode *ip;
	vnode_t *vp;

	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
	/*
	 * at this point we have a newly allocated inode
	 */
	ip->i_freef = ip;
	ip->i_freeb = ip;
	ip->i_flag = IREF;
	ip->i_seq = 0xFF;	/* Unique initial value */
	ip->i_dev = ufsvfsp->vfs_dev;
	ip->i_ufsvfs = ufsvfsp;
	ip->i_devvp = ufsvfsp->vfs_devvp;
	ip->i_number = ino;
	ip->i_diroff = 0;
	ip->i_nextr = 0;
	ip->i_map = NULL;
	ip->i_rdev = 0;
	ip->i_writes = 0;
	ip->i_mode = 0;
	ip->i_delaylen = 0;
	ip->i_delayoff = 0;
	ip->i_nextrio = 0;
	ip->i_ufs_acl = NULL;
	ip->i_cflags = 0;
	ip->i_mapcnt = 0;
	ip->i_dquot = NULL;
	ip->i_cachedir = 1;
	ip->i_writer = NULL;

	/*
	 * the vnode for this inode was allocated by the constructor
	 */
	vp = ITOV(ip);
	vn_reinit(vp);
	if (ino == (ino_t)UFSROOTINO)
		vp->v_flag = VROOT;
	vp->v_vfsp = ufsvfsp->vfs_vfs;
	vn_exists(vp);
	return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}

/*
 * Set vnode attributes based on v_type, this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
	/*
	 * an old DBE hack
	 */
	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
		vp->v_flag |= VSWAPLIKE;
	else
		vp->v_flag &= ~VSWAPLIKE;

	/*
	 * if not swap like and it's just a regular file, we want
	 * to maintain the vnode's pages sorted by clean/modified
	 * for faster sync'ing to disk
	 */
	if (vp->v_type == VREG)
		vp->v_flag |= VMODSORT;
	else
		vp->v_flag &= ~VMODSORT;

	/*
	 * Is this an attribute hidden dir?
	 */
	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
		vp->v_flag |= V_XATTRDIR;
	else
		vp->v_flag &= ~V_XATTRDIR;
}

/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
	struct inode *ip, *sp;
	union ihead *ih;
	kmutex_t *ihm;
	struct buf *bp;
	struct dinode *dp;
	struct vnode *vp;
	extern vfs_t EIO_vfs;
	int error;
	int ftype;	/* XXX - Remove later on */
	dev_t vfs_dev;
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	int hno;
	daddr_t bno;
	ulong_t ioff;

	CPU_STATS_ADD_K(sys, ufsiget, 1);

	/*
	 * Lookup inode in cache.
	 */
	vfs_dev = vfsp->vfs_dev;
	hno = INOHASH(ino);
	ih = &ihead[hno];
	ihm = &ih_lock[hno];

again:
	mutex_enter(ihm);
	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
		    (ip->i_flag & ISTALE))
			continue;

		/*
		 * Found the interesting inode; hold it and drop the cache lock
		 */
		vp = ITOV(ip);	/* for locknest */
		VN_HOLD(vp);
		mutex_exit(ihm);
		rw_enter(&ip->i_contents, RW_READER);

		/*
		 * if necessary, remove from idle list
		 */
		if ((ip->i_flag & IREF) == 0) {
			if (ufs_rmidle(ip))
				VN_RELE(vp);
		}

		/*
		 * Could the inode be read from disk?
		 */
		if (ip->i_flag & ISTALE) {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
			goto again;
		}

		ins.in_hits.value.ul++;
		*ipp = ip;

		/*
		 * Reset the vnode's attribute flags
		 */
		mutex_enter(&vp->v_lock);
		ufs_reset_vnode(vp);
		mutex_exit(&vp->v_lock);

		rw_exit(&ip->i_contents);

		return (0);
	}
	mutex_exit(ihm);

	/*
	 * Inode was not in cache.
	 *
	 * Allocate a new entry
	 */
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	fs = ufsvfsp->vfs_fs;

	ip = ufs_alloc_inode(ufsvfsp, ino);
	vp = ITOV(ip);

	bno = fsbtodb(fs, itod(fs, ino));
	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
	ip->i_doff = (offset_t)ioff + ldbtob(bno);
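
	/*
	 * Illustration: itod() names the filesystem block holding this
	 * dinode and itoo() its index within that block, so i_doff is the
	 * absolute byte offset of the on-disk inode.  With 8 KB blocks and
	 * 128-byte dinodes, for instance, each block holds 64 dinodes and
	 * inode 70 would sit at index 6 of its group's second inode block.
	 * (Figures illustrative only.)
	 */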

	/*
	 * put a place holder in the cache (if not already there)
	 */
	mutex_enter(ihm);
	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
		    ((sp->i_flag & ISTALE) == 0)) {
			mutex_exit(ihm);
			ufs_free_inode(ip);
			goto again;
		}
	/*
	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
	 * here, but if we do, then shadow inode allocations panic the
	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
	 * and the ufs_iget() parameters don't tell us what we are getting
	 * so we have no way of knowing this is a ufs_iget() call from
	 * a ufs_ialloc() call for a shadow inode.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	insque(ip, ih);
	mutex_exit(ihm);
	/*
	 * read the dinode
	 */
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

	/*
	 * Check I/O errors
	 */
	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
	if (error) {
		brelse(bp);
		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
		rw_exit(&ip->i_contents);
		vp->v_vfsp = &EIO_vfs;
		VN_RELE(vp);
		return (error);
	}
	/*
	 * initialize the inode's dinode
	 */
	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
	ip->i_ic = dp->di_ic;			/* structure assignment */
	brelse(bp);

	/*
	 * Maintain compatibility with Solaris 1.x UFS
	 */
	if (ip->i_suid != UID_LONG)
		ip->i_uid = ip->i_suid;
	if (ip->i_sgid != GID_LONG)
		ip->i_gid = ip->i_sgid;

	ftype = ip->i_mode & IFMT;
	if (ftype == IFBLK || ftype == IFCHR) {
		dev_t dv;
		uint_t top16 = ip->i_ordev & 0xffff0000u;

		/*
		 * An all-zeros or all-ones top half suggests the dev was
		 * stored in the old 16-bit format; expand accordingly.
		 */
		if (top16 == 0 || top16 == 0xffff0000u)
			dv = expdev(ip->i_ordev);
		else
			dv = expldev(ip->i_ordev);
		vp->v_rdev = ip->i_rdev = dv;
	}

	/*
	 * if our caller only expects allocated inodes, verify that
	 * this inode looks good; throw it out if it's bad.
	 */
	if (validate) {
		if ((ftype == 0) || (ip->i_nlink <= 0)) {
			ip->i_flag |= ISTALE;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			cmn_err(CE_NOTE,
			    "%s: unexpected free inode %d, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
			return (EIO);
		}
	}

	/*
	 * finish initializing the vnode
	 */
	vp->v_type = IFTOVT((mode_t)ip->i_mode);

	ufs_reset_vnode(vp);

	/*
	 * read the shadow
	 */
	if (ftype != 0 && ip->i_shadow != 0) {
		if ((error = ufs_si_load(ip, cr)) != 0) {
			ip->i_flag |= ISTALE;
			ip->i_ufs_acl = NULL;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			return (error);
		}
	}

	/*
	 * Only attach quota information if the inode has a type and if
	 * that type is not a shadow inode.
	 */
	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
		ip->i_dquot = getinoquota(ip);
	}
	TRANS_MATA_IGET(ufsvfsp, ip);
	*ipp = ip;
	rw_exit(&ip->i_contents);

	return (0);
}
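
/*
 * Usage sketch (hypothetical caller): the returned inode is VN_HELD,
 * so every successful lookup needs a matching release:
 *
 *	struct inode *ip;
 *
 *	if (ufs_iget(vfsp, ino, &ip, cr) == 0) {
 *		... operate on ip ...
 *		VN_RELE(ITOV(ip));
 *	}
 */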

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
	int		front;
	struct inode	*iq;
	struct inode	*hip;
	struct ufs_q	*uq;
	struct vnode	*vp = ITOV(ip);


	/*
	 * Because the vnode type might have been changed,
	 * the dnlc_dir_purge must be called unconditionally.
	 */
	dnlc_dir_purge(&ip->i_danchor);

	/*
	 * Get exclusive access to inode data.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ASSERT(ip->i_flag & IREF);

	/*
	 * Make sure no one reclaimed the inode before we put it on
	 * the freelist or destroy it. We keep our 'hold' on the vnode
	 * from vn_rele until we are ready to do something with the inode.
	 *
	 * Pageout may put a VN_HOLD/VN_RELE at any time during this
	 * operation via an async putpage, so we must make sure
	 * we don't free/destroy the inode more than once. ufs_iget
	 * may also put a VN_HOLD on the inode before it grabs
	 * the i_contents lock. This is done so we don't free
	 * an inode that a thread is waiting on.
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count > 1) {
		vp->v_count--;	/* release our hold from vn_rele */
		mutex_exit(&vp->v_lock);
		rw_exit(&ip->i_contents);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
	 * and clean.  It can be safely destroyed (cyf).
	 */
	if (ip->i_ufsvfs == NULL) {
		rw_exit(&ip->i_contents);
		ufs_si_del(ip);
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
		return;
	}

	/*
	 * queue idle inode to appropriate thread. Will check v_count == 1
	 * prior to putting this on the appropriate queue.
	 * Stale inodes will be unhashed and freed by the ufs idle thread
	 * in ufs_idle_free()
	 */
	front = 1;
	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
	    ip->i_mode && ip->i_nlink <= 0) {
		/*
		 * Mark the i_flag to indicate that inode is being deleted.
		 * This flag will be cleared when the deletion is complete.
		 * This prevents nfs from sneaking in via ufs_vget() while
		 * the delete is in progress (bugid 1242481).
		 */
		ip->i_flag |= IDEL;

		/*
		 * NOIDEL means that deletes are not allowed at this time;
		 * whoever resets NOIDEL will also send this inode back
		 * through ufs_iinactive.  IREF remains set.
		 */
		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
			mutex_enter(&vp->v_lock);
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
			rw_exit(&ip->i_contents);
			ufs_delete(ip->i_ufsvfs, ip, 0);
			return;
		}

		/* queue to delete thread; IREF remains set */
		ins.in_qfree.value.ul++;
		uq = &ip->i_ufsvfs->vfs_delete;

		mutex_enter(&uq->uq_mutex);

		/* add to q */
		if ((iq = uq->uq_ihead) != 0) {
			ip->i_freef = iq;
			ip->i_freeb = iq->i_freeb;
			iq->i_freeb->i_freef = ip;
			iq->i_freeb = ip;
			if (front)
				uq->uq_ihead = ip;
		} else {
			uq->uq_ihead = ip;
			ip->i_freef = ip;
			ip->i_freeb = ip;
		}
	} else {
		/*
		 * queue to idle thread
		 * Check v_count == 1 again.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;	/* release our hold from vn_rele */
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		mutex_exit(&vp->v_lock);
		uq = &ufs_idle_q;

		/*
		 * useful iff it has pages or is a fastsymlink; otherwise junk
		 */
		mutex_enter(&uq->uq_mutex);

		/* clear IREF means `on idle list' */
		ip->i_flag &= ~(IREF | IDIRECTIO);

		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
			ins.in_frback.value.ul++;
			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
			ufs_nuseful_iq++;
		} else {
			ins.in_frfront.value.ul++;
			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
			ip->i_flag |= IJUNKIQ;
			ufs_njunk_iq++;
		}
		ip->i_freef = hip;
		ip->i_freeb = hip->i_freeb;
		hip->i_freeb->i_freef = ip;
		hip->i_freeb = ip;
	}

	/* wakeup thread(s) if q is overfull */
	if (++uq->uq_ne == uq->uq_lowat)
		cv_broadcast(&uq->uq_cv);

	/* all done, release the q and inode */
	mutex_exit(&uq->uq_mutex);
	rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is given, ensure I/O ordering by waiting for the write
 * to complete.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
	struct buf	*bp;
	struct fs	*fp;
	struct dinode	*dp;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	int		i;
	int		do_trans_times;
	ushort_t	flag;
	o_uid_t		suid;
	o_gid_t		sgid;

	/*
	 * This function is now safe to be called with either the reader
	 * or writer i_contents lock.
	 */
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return;

	flag = ip->i_flag;	/* Atomic read */
	/*
	 * We better not update the disk inode from a stale inode.
	 */
	if (flag & ISTALE)
		return;

	fp = ip->i_fs;

	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		if (fp->fs_ronly) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			return;
		}
		/*
		 * fs is active while metadata is being written
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_notclean(ufsvfsp);
		/*
		 * get the dinode
		 */
		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
		    (int)fp->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &=
			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			brelse(bp);
			return;
		}
		/*
		 * munge inode fields
		 */
		mutex_enter(&ip->i_tlock);
		ITIMES_NOLOCK(ip);
		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
		mutex_exit(&ip->i_tlock);

		/*
		 * For reads and concurrent re-writes, no deltas were
		 * entered for the access time changes - do it now.
		 */
		if (do_trans_times) {
			TRANS_INODE_TIMES(ufsvfsp, ip);
		}

		/*
		 * For SunOS 5.0->5.4, these lines below read:
		 *
		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
		 *
		 * where MAXUID was set to 60002.  This was incorrect -
		 * the uids should have been constrained to what fitted into
		 * a 16-bit word.
		 *
		 * This means that files from 4.x filesystems that have an
		 * i_suid field larger than 60002 will have that field
		 * changed to 65535.
		 *
		 * Security note: 4.x UFS could never create a i_suid of
		 * UID_LONG since that would've corresponded to -1.
		 */
		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
			UID_LONG : ip->i_uid;
		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
			GID_LONG : ip->i_gid;

		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
			ip->i_suid = suid;
			ip->i_sgid = sgid;
			TRANS_INODE(ufsvfsp, ip);
		}

		if ((ip->i_mode & IFMT) == IFBLK ||
		    (ip->i_mode & IFMT) == IFCHR) {
			dev_t d = ip->i_rdev;
			dev32_t dev32;

			/*
			 * load first direct block only if special device
			 */
			if (!cmpldev(&dev32, d)) {
				/*
				 * We panic here because there's "no way"
				 * we should have been able to create a large
				 * inode with a large dev_t.  Earlier layers
				 * should've caught this.
				 */
				panic("ip %p: i_rdev too big", (void *)ip);
			}

			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
				ip->i_ordev = dev32;	/* can't use old fmt. */
			} else {
				ip->i_ordev = cmpdev(d);
			}
		}

		/*
		 * copy inode to dinode (zero fastsymlnk in dinode)
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
		dp->di_ic = ip->i_ic;	/* structure assignment */
		if (flag & IFASTSYMLNK) {
			for (i = 1; i < NDADDR; i++)
				dp->di_db[i] = 0;
			for (i = 0; i < NIADDR; i++)
				dp->di_ib[i] = 0;
		}
		if (TRANS_ISTRANS(ufsvfsp)) {
			/*
			 * Pass only a sector size buffer containing
			 * the inode, otherwise when the buffer is copied
			 * into a cached roll buffer then too much memory
			 * gets consumed if 8KB inode buffers are passed.
			 */
			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
			    sizeof (struct dinode),
			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
			    DEV_BSIZE);

			brelse(bp);
		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
			UFS_BRWRITE(ufsvfsp, bp);

			/*
			 * Synchronous write has guaranteed that inode
			 * has been written on disk so clear the flag
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		} else {
			bdrwrite(bp);

			/*
			 * This write hasn't guaranteed that inode has been
			 * written on the disk.
			 * Since all update flags on the inode are cleared,
			 * we must remember the condition in case the inode
			 * is to be updated synchronously later (e.g.
			 * fsync()/fdatasync()) and the inode has not been
			 * modified yet.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag |= IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * In case previous inode update was done asynchronously
		 * (IBDWRITE) and this inode update request wants guaranteed
		 * (synchronous) disk update, flush the inode.
		 */
		if (waitfor && (flag & IBDWRITE)) {
			blkflush(ip->i_dev,
				(daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	}
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */
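
/*
 * Capacity sketch (illustrative, assuming 8 KB blocks): a single
 * indirect block holds NINDIR = 2048 daddr32_t entries, so SINGLE maps
 * 2048 blocks (16 MB), DOUBLE maps 2048^2 blocks (32 GB) and TRIPLE
 * maps 2048^3 blocks.
 */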

/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
	int i;
	struct buf *bp, *copy;
	daddr32_t *bap;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	daddr_t nb, last;
	long factor;
	int blocksreleased = 0, nblocks;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
	/*
	 * Get buffer of block pointers, zero those
	 * entries corresponding to blocks to be free'd,
	 * and update on disk copy first.
	 * *Unless* the root pointer has been synchronously
	 * written to disk.  If nothing points to this
	 * indirect block then don't bother zero'ing and
	 * writing it.
	 */
	bp = UFS_BREAD(ufsvfsp,
			ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (0);
	}
	bap = bp->b_un.b_daddr;
	if ((flags & I_CHEAP) == 0) {
		uint_t	zb;

		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

		if (zb) {
			/*
			 * push any data into the log before we zero it
			 */
			if (bp->b_flags & B_DELWRI)
				TRANS_LOG(ufsvfsp, (caddr_t)bap,
					ldbtob(bp->b_blkno), bp->b_bcount,
					bp->b_un.b_addr, bp->b_bcount);
			copy = ngeteblk(fs->fs_bsize);
			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
				(uint_t)fs->fs_bsize);
			bzero((caddr_t)&bap[last + 1], zb);

			TRANS_BUF(ufsvfsp,
				(caddr_t)&bap[last + 1] - (caddr_t)bap,
				zb, bp, DT_ABZERO);

			UFS_BRWRITE(ufsvfsp, bp);
			bp = copy, bap = bp->b_un.b_daddr;
		}
	} else {
		/* make sure write retries are also cleared */
		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
		bp->b_flags |= B_STALE | B_AGE;
	}

	/*
	 * Recursively free totally unused blocks.
	 */
	flags |= I_CHEAP;
	for (i = NINDIR(fs) - 1; i > last; i--) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			blocksreleased +=
			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
		} else
			free(ip, nb, (off_t)fs->fs_bsize, flags);
		blocksreleased += nblocks;
	}
	flags &= ~I_CHEAP;

	/*
	 * Recursively free last partial block.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0)
			blocksreleased += indirtrunc(ip, nb, last, level - 1,
				flags);
	}
	brelse(bp);
	return (blocksreleased);
}
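
/*
 * Worked example (illustrative, NINDIR = 2048): a DOUBLE-level call
 * with lastbn = 5000 computes factor = 2048 and last = 5000 / 2048 = 2,
 * so entries 3 .. NINDIR-1 of the double indirect block are freed
 * wholesale and entry 2 is recursed into with lastbn % factor = 904.
 */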

/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
	struct fs *fs = oip->i_fs;
	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
	struct inode *ip;
	daddr_t lastblock;
	off_t bsize;
	int boff;
	daddr_t bn, lastiblock[NIADDR];
	int level;
	long nblocks, blocksreleased = 0;
	int i;
	ushort_t mode;
	struct inode tip;
	int err;
	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);

	/*
	 * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
	 * other uses need the reader lock. opendq() holds the writer lock.
	 */
	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
		RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
	ASSERT(RW_WRITE_HELD(&oip->i_contents));
	/*
	 * We only allow truncation of regular files and directories
	 * to arbitrary lengths here.  In addition, we allow symbolic
	 * links to be truncated only to zero length.  Other inode
	 * types cannot have their length set here.  Disk blocks are
	 * being dealt with - especially device inodes where
	 * ip->i_ordev is actually being stored in ip->i_db[0]!
	 */
	TRANS_INODE(ufsvfsp, oip);
	mode = oip->i_mode & IFMT;
	if (flags & I_FREE) {
		i_genrand *= 16843009;  /* turns into shift and adds */
		i_genrand++;
		oip->i_gen += ((i_genrand + lbolt) & 0xffff) + 1;
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == oip->i_size)
			return (0);
		flags |= I_CHEAP;
	}
	if (mode == IFIFO)
		return (0);
	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
		return (EINVAL);
	if (length > maxoffset)
		return (EFBIG);
	if ((mode == IFDIR) || (mode == IFATTRDIR))
		flags |= I_DIR;
	if (mode == IFSHAD)
		flags |= I_SHAD;
	if (oip == ufsvfsp->vfs_qinod)
		flags |= I_QUOTA;
	if (length == oip->i_size) {
		/* update ctime and mtime to please POSIX tests */
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == 0) {
			/* nothing to cache so clear the flag */
			oip->i_flag &= ~IFASTSYMLNK;
		}
		return (0);
	}
	/* wipe out fast symlink till next access */
	if (oip->i_flag & IFASTSYMLNK) {
		int j;

		ASSERT(ITOV(oip)->v_type == VLNK);

		oip->i_flag &= ~IFASTSYMLNK;

		for (j = 1; j < NDADDR; j++)
			oip->i_db[j] = 0;
		for (j = 0; j < NIADDR; j++)
			oip->i_ib[j] = 0;
	}

	boff = (int)blkoff(fs, length);

	if (length > oip->i_size) {
		/*
		 * Trunc up case.  BMAPALLOC will insure that the right blocks
		 * are allocated.  This includes extending the old frag to a
		 * full block (if needed) in addition to doing any work
		 * needed for allocating the last block.
		 */
		if (boff == 0)
			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
		else
			err = BMAPALLOC(oip, length - 1, boff, cr);

		if (err == 0) {
			/*
			 * Save old size and set inode's size now
			 * so that we don't cause too much of the
			 * file to be zero'd and pushed.
			 */
			u_offset_t osize = oip->i_size;
			oip->i_size  = length;
			/*
			 * Make sure we zero out the remaining bytes of
			 * the page in case a mmap scribbled on it. We
			 * can't prevent a mmap from writing beyond EOF
			 * on the last page of a file.
			 *
			 */
			if ((boff = (int)blkoff(fs, osize)) != 0) {
				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
				    fs->fs_bsize : fragroundup(fs, boff);
				pvn_vpzero(ITOV(oip), osize,
				    (size_t)(bsize - boff));
			}
			oip->i_flag |= ICHG|IATTCHG;
			oip->i_seq++;
			ITIMES_NOLOCK(oip);
			/*
			 * MAXOFF32_T is old 2GB size limit. If
			 * this operation caused a large file to be
			 * created, turn on the superblock flag
			 * and update the superblock, if the flag
			 * is not already on.
			 */
			if ((length > (u_offset_t)MAXOFF32_T) &&
			    !(fs->fs_flags & FSLARGEFILES)) {
				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
				mutex_enter(&ufsvfsp->vfs_lock);
				fs->fs_flags |= FSLARGEFILES;
				ufs_sbwrite(ufsvfsp);
				mutex_exit(&ufsvfsp->vfs_lock);
			}
		}

		return (err);
	}

	/*
	 * Update the pages of the file.  If the file is not being
	 * truncated to a block boundary, the contents of the
	 * pages following the end of the file must be zero'ed
	 * in case it ever becomes accessible again because
	 * of subsequent file growth.
	 */
	if (boff == 0) {
		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
		    B_INVAL | B_TRUNC, CRED());
	} else {
		/*
		 * Make sure that the last block is properly allocated.
		 * We only really have to do this if the last block is
		 * actually allocated since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated.  Just to
		 * be sure, we do it now independent of current allocation.
		 */
		err = BMAPALLOC(oip, length - 1, boff, cr);
		if (err)
			return (err);

		/*
		 * BMAPALLOC will call bmap_write which defers i_seq
		 * processing.  If the timestamps were changed, update
		 * i_seq before rdip drops i_contents or syncs the inode.
		 */
		if (oip->i_flag & (ICHG|IUPD))
			oip->i_seq++;

		/*
		 * BugId 4069932
		 * Make sure that the relevant partial page appears in
		 * the v_pages list, so that pvn_vpzero() will do its
		 * job.  Since doing this correctly requires everything
		 * in rdip() except for the uiomove(), it's easier and
		 * safer to do the uiomove() rather than duplicate the
		 * rest of rdip() here.
		 *
		 * To get here, we know that length indicates a byte
		 * that is not the first byte of a block.  (length - 1)
		 * is the last actual byte known to exist.  Deduction
		 * shows it is in the same block as byte (length).
		 * Thus, this rdip() invocation should always succeed
		 * except in the face of i/o errors, and give us the
		 * block we care about.
		 *
		 * rdip() makes the same locking assertions and
		 * assumptions as we do.  We do not acquire any locks
		 * before calling it, so we have not changed the locking
		 * situation.  Finally, there do not appear to be any
		 * paths whereby rdip() ends up invoking us again.
		 * Thus, infinite recursion is avoided.
		 */
		{
			uio_t uio;
			iovec_t iov[1];
			char buffer;

			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_loffset = length - 1;
			uio.uio_resid = 1;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_extflg = UIO_COPY_CACHED;

			iov[0].iov_base = &buffer;
			iov[0].iov_len = 1;

			err = rdip(oip, &uio, UIO_READ, NULL);
			if (err)
				return (err);
		}

		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
		    fs->fs_bsize : fragroundup(fs, boff);
		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
		/*
		 * Ensure full fs block is marked as dirty.
		 */
		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
	}

	/*
	 * Calculate index into inode's block list of
	 * last direct and indirect blocks (if any)
	 * which we want to keep.  Lastblock is -1 when
	 * the file is truncated to 0.
	 */
	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastiblock[SINGLE] = lastblock - NDADDR;
	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
	nblocks = btodb(fs->fs_bsize);

	/*
	 * Update file and block pointers
	 * on disk before we start freeing blocks.
	 * If we crash before free'ing blocks below,
	 * the blocks will be returned to the free list.
	 * lastiblock values are also normalized to -1
	 * for calls to indirtrunc below.
	 */
	tip = *oip;			/* structure copy */
	ip = &tip;

	for (level = TRIPLE; level >= SINGLE; level--)
		if (lastiblock[level] < 0) {
			oip->i_ib[level] = 0;
			lastiblock[level] = -1;
		}
	for (i = NDADDR - 1; i > lastblock; i--) {
		oip->i_db[i] = 0;
		flags |= I_CHEAP;
	}
	oip->i_size = length;
	oip->i_flag |= ICHG|IUPD|IATTCHG;
	oip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

	/*
	 * Indirect blocks first.
	 */
	for (level = TRIPLE; level >= SINGLE; level--) {
		bn = ip->i_ib[level];
		if (bn != 0) {
			blocksreleased +=
			    indirtrunc(ip, bn, lastiblock[level], level, flags);
			if (lastiblock[level] < 0) {
				ip->i_ib[level] = 0;
				free(ip, bn, (off_t)fs->fs_bsize,
					flags | I_IBLK);
				blocksreleased += nblocks;
			}
		}
		if (lastiblock[level] >= 0)
			goto done;
	}

	/*
	 * All whole direct blocks or frags.
	 */
	for (i = NDADDR - 1; i > lastblock; i--) {
		bn = ip->i_db[i];
		if (bn == 0)
			continue;
		ip->i_db[i] = 0;
		bsize = (off_t)blksize(fs, ip, i);
		free(ip, bn, bsize, flags);
		blocksreleased += btodb(bsize);
	}
	if (lastblock < 0)
		goto done;

	/*
	 * Finally, look for a change in size of the
	 * last direct block; release any frags.
	 */
	bn = ip->i_db[lastblock];
	if (bn != 0) {
		off_t oldspace, newspace;

		/*
		 * Calculate amount of space we're giving
		 * back as old block size minus new block size.
		 */
		oldspace = blksize(fs, ip, lastblock);
		UFS_SET_ISIZE(length, ip);
		newspace = blksize(fs, ip, lastblock);
		if (newspace == 0) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
			return (err);
		}
		if (oldspace - newspace > 0) {
			/*
			 * Block number of space to be free'd is
			 * the old block # plus the number of frags
			 * required for the storage we're keeping.
			 */
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
/* BEGIN PARANOIA */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {		/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this can not fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
		(size_t *)NULL);
	return (0);
}

/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 */
int
ufs_iaccess(void *vip, int mode, struct cred *cr)
{
	struct inode *ip = vip;
	int shift = 0;

	if (mode & IWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is a block
		 * or character device or a FIFO.
		 */
		if (ip->i_fs->fs_ronly != 0) {
			if ((ip->i_mode & IFMT) != IFCHR &&
			    (ip->i_mode & IFMT) != IFBLK &&
			    (ip->i_mode & IFMT) != IFIFO) {
				return (EROFS);
			}
		}
	}
	/*
	 * If there is a shadow inode, check for the presence of an acl;
	 * if the acl is there, use the ufs_acl_access routine to check it.
	 */
	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner)
		return (ufs_acl_access(ip, mode, cr));

	/*
	 * Access check is based on only
	 * one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group, then
	 * check public access.
	 */
	if (crgetuid(cr) != ip->i_uid) {
		shift += 3;
		if (!groupmember((uid_t)ip->i_gid, cr))
			shift += 3;
	}
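
	/*
	 * Worked example: a request for IREAD (0400) by a non-owner,
	 * non-group caller has shift = 6, so the "other" read bit (0004)
	 * of i_mode lands at 0400 and clears the request if it is set.
	 */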

	mode &= ~(ip->i_mode << shift);

	if (mode == 0)
		return (0);

	/* test missing privilege bits */
	return (secpolicy_vnode_access(cr, ITOV(ip), ip->i_uid, mode));
}

/*
 * if necessary, remove an inode from the free list
 *	i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
	int rval = 0;

	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & IREF) == 0) {
		mutex_enter(&ufs_idle_q.uq_mutex);
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		ip->i_flag |= IREF;
		ufs_idle_q.uq_ne--;
		if (ip->i_flag & IJUNKIQ) {
			ufs_njunk_iq--;
			ip->i_flag &= ~IJUNKIQ;
		} else {
			ufs_nuseful_iq--;
		}
		mutex_exit(&ufs_idle_q.uq_mutex);
		rval = 1;
	}
	mutex_exit(&ip->i_tlock);
	return (rval);
}

/*
 * scan the hash of inodes and call func with the inode locked
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
		struct ufsvfs *ufsvfsp)
{
	struct inode		*ip;		/* current inode */
	struct inode		*lip = NULL;	/* last/previous inode */
	union ihead		*ih;		/* current hash chain */
	int			error, i;
	int			saverror = 0;
	int			lip_held;	/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 *    iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 *    were processing it, so it could not be removed
	 *    from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock
			 * it, then he's done with it.
			 */

			if (rwtry) {
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons. First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			mutex_enter(&ih_lock[i]);
		}
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}
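
/*
 * Callback sketch (hypothetical): a func suitable for ufs_scan_inodes()
 * takes the inode and the opaque arg and returns 0 or an errno, e.g.
 *
 *	static int
 *	ufs_count_one(struct inode *ip, void *arg)
 *	{
 *		(*(int *)arg)++;
 *		return (0);
 *	}
 *
 * invoked as (void) ufs_scan_inodes(0, ufs_count_one, &count, ufsvfsp);
 */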

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time. Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t usec, nsec;

	/*
	 * The update of i_seq may have been deferred, increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
	 * in common/os/timers.c for a full description.
	 */
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;

	mutex_enter(&ufs_iuniqtime_lock);
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}

/*
 * Update timestamps in inode.
 */
void
ufs_itimes_nolock(struct inode *ip)
{

	/*
	 * if noatime is set and the inode access time is the only field that
	 * must be changed, exit immediately.
	 */
	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
	    (ip->i_ufsvfs->vfs_noatime)) {
		return;
	}

	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		if (ip->i_flag & ICHG)
			ip->i_flag |= IMOD;
		else
			ip->i_flag |= IMODACC;
		ufs_imark(ip);
		ip->i_flag &= ~(IACC|IUPD|ICHG);
	}
}