ufs_inode.h revision 9915:bc9126487a5f
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27/*	  All Rights Reserved  	*/
28
29/*
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
33 *
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
37 */
38
39#ifndef	_SYS_FS_UFS_INODE_H
40#define	_SYS_FS_UFS_INODE_H
41
42#include <sys/isa_defs.h>
43#include <sys/fbuf.h>
44#include <sys/fdbuffer.h>
45#include <sys/fcntl.h>
46#include <sys/uio.h>
47#include <sys/t_lock.h>
48#include <sys/thread.h>
49#include <sys/cred.h>
50#include <sys/time.h>
51#include <sys/types32.h>
52#include <sys/fs/ufs_fs.h>
53#include <sys/fs/ufs_lockfs.h>
54#include <sys/fs/ufs_trans.h>
55#include <sys/kstat.h>
56#include <sys/fs/ufs_acl.h>
57#include <sys/fs/ufs_panic.h>
58#include <sys/dnlc.h>
59
60#ifdef _KERNEL
61#include <sys/vfs_opreg.h>
62#endif
63
64#ifdef	__cplusplus
65extern "C" {
66#endif
67
68/*
69 * The I node is the focus of all local file activity in UNIX.
70 * There is a unique inode allocated for each active file,
71 * each current directory, each mounted-on file, each mapping,
72 * and the root.  An inode is `named' by its dev/inumber pair.
73 * Data in icommon is read in from permanent inode on volume.
74 *
 * Each inode has 6 locks associated with it:
76 *	i_rwlock:	Serializes ufs_write and ufs_setattr request
77 *			and allows ufs_read requests to proceed in parallel.
78 *			Serializes reads/updates to directories.
79 *	vfs_dqrwlock:	Manages quota sub-system quiescence.  See below.
80 *	i_contents:	Protects almost all of the fields in the inode
81 *			except for those listed below. When held
82 *			in writer mode also protects those fields
83 *			listed under i_tlock.
84 *	i_tlock:	When i_tlock is held with the i_contents reader
85 *			lock the i_atime, i_mtime, i_ctime,
86 *			i_delayoff, i_delaylen, i_nextrio, i_writes, i_flag
87 *			i_seq, i_writer & i_mapcnt fields are protected.
88 *			For more i_flag locking info see below.
89 *	ih_lock:	Protects inode hash chain buckets
90 *	ifree_lock:	Protects inode freelist
91 *
92 * Lock ordering:
93 *	i_rwlock > i_contents > i_tlock
94 *	i_rwlock > vfs_dqrwlock > i_contents(writer) > i_tlock
95 *	i_contents > i_tlock
96 *	vfs_dqrwlock > i_contents(writer) > i_tlock
97 *	ih_lock > i_contents > i_tlock
98 *
99 * Making major changes to quota sub-system state, while the file
100 * system is mounted required the addition of another lock.  The
101 * primary lock in the quota sub-system is vfs_dqrwlock in the ufsvfs
102 * structure.  This lock is used to manage quota sub-system quiescence
103 * for a particular file system. Major changes to quota sub-system
104 * state (disabling quotas, enabling quotas, and setting new quota
105 * limits) all require the file system to be quiescent and grabbing
106 * vfs_dqrwlock as writer accomplishes this.  On the other hand,
107 * grabbing vfs_dqrwlock as reader makes the quota sub-system
108 * non-quiescent and lets the quota sub-system know that now is not a
109 * good time to change major quota sub-system state.  Typically
110 * vfs_dqrwlock is grabbed for reading before i_contents is grabbed for
111 * writing.  However, there are cases where vfs_dqrwlock is grabbed for
112 * reading without a corresponding i_contents write grab because there
113 * is no relevant inode.  There are also cases where i_contents is
114 * grabbed for writing when a vfs_dqrwlock read grab is not needed
115 * because the inode changes do not affect quotas.
116 *
117 * Unfortunately, performance considerations have required that we be more
118 * intelligent about using i_tlock when updating i_flag.  Ideally, we would
119 * have simply separated out several of the bits in i_flag into their own
120 * ints to avoid problems.  But, instead, we have implemented the following
121 * rules:
122 *
123 *	o You can update any i_flag field while holding the writer-contents,
124 *	  or by holding the reader-contents AND holding i_tlock.
125 *	  You can only call ITIMES_NOLOCK while holding the writer-contents,
126 *	  or by holding the reader-contents AND holding i_tlock.
127 *
128 *	o For a directory, holding the reader-rw_lock is sufficient for setting
129 *	  IACC.
130 *
131 *	o Races with IREF are avoided by holding the reader contents lock
132 *	  and by holding i_tlock in ufs_rmidle, ufs_putapage, and ufs_getpage.
133 *	  And by holding the writer-contents in ufs_iinactive.
134 *
135 *	o The callers are no longer required to handle the calls to ITIMES
136 *	  and ITIMES_NOLOCK.  The functions that set the i_flag bits are
137 *	  responsible for managing those calls.  The exceptions are the
138 *	  bmap routines.
139 *
140 * SVR4 Extended Fundamental Type (EFT) support:
141 * 	The inode structure has been enhanced to support
142 *	32-bit user-id, 32-bit group-id, and 32-bit device number.
143 *	Standard SVR4 ufs also supports 32-bit mode field.  For the reason
144 *	of backward compatibility with the previous ufs disk format,
145 *	32-bit mode field is not supported.
146 *
147 *	The current inode structure is 100% backward compatible with
148 *	the previous inode structure if no user-id or group-id exceeds
149 *	USHRT_MAX, and no major or minor number of a device number
150 *	stored in an inode exceeds 255.
151 *
152 * Rules for managing i_seq:
153 *	o i_seq is locked under the same rules as i_flag
154 *	o The i_ctime or i_mtime MUST never change without increasing
155 *	  the value of i_seq.
156 *	o You may increase the value of i_seq without the timestamps
157 *	  changing, this may decrease the callers performance but will
158 *	  be functionally correct.
159 *	o The common case is when IUPD or ICHG is set, increase i_seq
160 *	  and immediately call ITIMES* or ufs_iupdat to create a new timestamp.
161 *	o A less common case is the setting of IUPD or ICHG and while still
162 *	  holding the correct lock defer the timestamp and i_seq update
163 *	  until later, but it must still be done before the lock is released.
164 *	  bmap_write is an example of this, where the caller does the update.
165 *	o If multiple changes are being made with the timestamps being
166 *	  updated only at the end, a single increase of i_seq is allowed.
167 *	o If changes are made with IUPD or ICHG being set, but
168 *	  the controlling lock is being dropped before the timestamp is
169 *	  updated, there is a risk that another thread will also change
170 *	  the file, update i_flag, and push just one timestamp update.
171 *	  There is also the risk that another thread calls ITIMES or
172 *	  ufs_iupdat without setting IUPD|ICHG and thus not changing i_seq,
173 *	  this will cause ufs_imark to change the timestamps without changing
174 *	  i_seq. If the controlling lock is dropped, ISEQ must be set to
175 *	  force i_seq to be increased on next ufs_imark, but i_seq MUST still
176 *	  be increased by the original setting thread before its deferred
 *	  call to ITIMES to ensure it is increased the correct number of times.
178 */
179
/*
 * Flag values indicating that the uid/gid is 32 bits long.
 * NOTE(review): presumably these are stored in the short ic_suid/ic_sgid
 * fields while the real id lives in ic_uid/ic_gid - confirm against the
 * code that reads inodes from disk.
 *
 * The expansions are fully parenthesized so that the macros behave as a
 * single operand (e.g. "sizeof UID_LONG"), and GID_LONG is cast to the
 * gid flavour of the old id type to match the ic_sgid field.
 */
#define	UID_LONG	((o_uid_t)65535)
				/* flag value to indicate uid is 32-bit long */
#define	GID_LONG	((o_gid_t)65535)
				/* flag value to indicate gid is 32-bit long */
184
#define	NDADDR	12		/* direct addresses in inode */
#define	NIADDR	3		/* indirect addresses in inode */

/*
 * Maximum fast symbolic name length: 56 bytes when daddr32_t is 4 bytes
 * wide.  The whole expansion is parenthesized so that FSL_SIZE can be
 * used as an operand of a larger expression (x / FSL_SIZE, x % FSL_SIZE).
 */
#define	FSL_SIZE	((NDADDR + NIADDR - 1) * sizeof (daddr32_t))

/*
 * Member-name shorthands, meant to be used after "->" on an inode pointer:
 *	ip->i_fs	the b_un.b_fs member of the buffer holding the
 *			superblock (via ip->i_ufsvfs->vfs_bufp)
 *	ip->i_vfs	the v_vfsp member of the inode's vnode
 * They must stay unparenthesized because they expand to a member path.
 */
#define	i_fs	i_ufsvfs->vfs_bufp->b_un.b_fs
#define	i_vfs	i_vnode->v_vfsp
192
/*
 * Inode fields common to the in-core inode (embedded as i_ic in struct
 * inode) and the on-disk inode (di_icom in struct dinode, which reserves
 * 128 bytes for it).  The number at the start of each field comment is
 * the byte offset of that field, so members must not be reordered or
 * resized.
 */
struct 	icommon {
	o_mode_t ic_smode;	/*  0: mode and type of file */
	short	ic_nlink;	/*  2: number of links to file */
	o_uid_t	ic_suid;	/*  4: owner's user id */
	o_gid_t	ic_sgid;	/*  6: owner's group id */
	u_offset_t ic_lsize;	/*  8: number of bytes in file */
#ifdef _KERNEL
	struct timeval32 ic_atime;	/* 16: time last accessed */
	struct timeval32 ic_mtime;	/* 24: time last modified */
	struct timeval32 ic_ctime;	/* 32: last time inode changed */
#else
	/*
	 * Outside the kernel each timestamp is a 32-bit time followed by a
	 * spare word, keeping the same 8 bytes per timestamp.
	 * NOTE(review): the spare words presumably overlay the microsecond
	 * half of the kernel's timeval32 - confirm.
	 */
	time32_t ic_atime;	/* 16: time last accessed */
	int32_t	ic_atspare;
	time32_t ic_mtime;	/* 24: time last modified */
	int32_t	ic_mtspare;
	time32_t ic_ctime;	/* 32: last time inode changed */
	int32_t	ic_ctspare;
#endif
	daddr32_t	ic_db[NDADDR];	/* 40: disk block addresses */
	daddr32_t	ic_ib[NIADDR];	/* 88: indirect blocks */
	int32_t	ic_flags;	/* 100: cflags */
	int32_t	ic_blocks;	/* 104: 512 byte blocks actually held */
	int32_t	ic_gen;		/* 108: generation number */
	int32_t	ic_shadow;	/* 112: shadow inode */
	uid_t	ic_uid;		/* 116: long EFT version of uid */
	gid_t	ic_gid;		/* 120: long EFT version of gid */
	uint32_t ic_oeftflag;	/* 124: extended attr directory ino, 0 = none */
};
221
/*
 * Large directories can be cached.  The caching state of a directory
 * (kept in i_cachedir) is one of the following; the three "disabled"
 * states are all <= 0 and only CD_ENABLED is positive.
 */
typedef enum {
	CD_DISABLED_NOMEM	= -2,
	CD_DISABLED_TOOBIG	= -1,
	CD_DISABLED		= 0,
	CD_ENABLED		= 1
} cachedir_t;
232
233/*
234 * Large Files: Note we use the inline functions load_double, store_double
235 * to load and store the long long values of i_size. Therefore the
236 * address of i_size must be eight byte aligned. Kmem_alloc of incore
237 * inode structure makes sure that the structure is 8-byte aligned.
238 * XX64 - reorder this structure?
239 */
/*
 * In-core inode.
 *
 * The first three members (i_chain, i_freef, i_freeb) must keep this
 * order and position: iqhead_t, the idle queue head, is declared with the
 * same leading members so that a queue head can be linked like an inode.
 *
 * Locking is described in the block comment at the top of this file; in
 * short, i_contents covers most fields, while i_tlock (held together with
 * i_contents as reader) covers the time fields, i_delayoff, i_delaylen,
 * i_nextrio, i_writes, i_flag, i_seq, i_writer and i_mapcnt.
 */
typedef struct inode {
	struct	inode *i_chain[2];	/* must be first */
	struct inode *i_freef;	/* free list forward - must be before i_ic */
	struct inode *i_freeb;	/* free list back - must be before i_ic */
	struct 	icommon	i_ic;	/* Must be here */
	struct	vnode *i_vnode;	/* vnode associated with this inode */
	struct	vnode *i_devvp;	/* vnode for block I/O */
	dev_t	i_dev;		/* device where inode resides */
	ino_t	i_number;	/* i number, 1-to-1 with device address */
	off_t	i_diroff;	/* offset in dir, where we found last entry */
				/* just a hint - no locking needed */
	struct ufsvfs *i_ufsvfs; /* incore fs associated with inode */
	struct	dquot *i_dquot;	/* quota structure controlling this file */
	krwlock_t i_rwlock;	/* serializes write/setattr requests */
	krwlock_t i_contents;	/* protects (most of) inode contents */
	kmutex_t i_tlock;	/* protects time fields, i_flag */
	offset_t i_nextr;	/*					*/
				/* next byte read offset (read-ahead)	*/
				/*   No lock required			*/
				/*					*/
	uint_t	i_flag;		/* inode flags */
	uint_t	i_seq;		/* modification sequence number */
	cachedir_t i_cachedir;	/* Cache this directory on next lookup */
				/* - no locking needed  */
	long	i_mapcnt;	/* mappings to file pages */
	int	*i_map;		/* block list for the corresponding file */
	dev_t	i_rdev;		/* INCORE rdev from i_oldrdev by ufs_iget */
	size_t	i_delaylen;	/* delayed writes, units=bytes */
	offset_t i_delayoff;	/* where we started delaying */
	offset_t i_nextrio;	/* where to start the next clust */
	long	i_writes;	/* number of outstanding bytes in write q */
	kcondvar_t i_wrcv;	/* sleep/wakeup for write throttle */
	offset_t i_doff;	/* dinode byte offset in file system */
	si_t *i_ufs_acl;	/* pointer to acl entry */
	dcanchor_t i_danchor;	/* directory cache anchor */
	kthread_t *i_writer;	/* thread which is in window in wrip() */
} inode_t;
277
/*
 * On-disk inode: the common inode fields overlaid with a 128 byte array,
 * so that the structure is at least 128 bytes regardless of the size of
 * struct icommon.
 */
struct dinode {
	union {
		struct	icommon di_icom;
		char	di_size[128];
	} di_un;
};
284
/*
 * Shorthand member names for an in-core inode.  Each name below expands
 * to a field of the embedded icommon (i_ic) or to one of the hash chain
 * links, so that code can simply write ip->i_size, ip->i_forw and so on.
 */

/* mode, link count and ownership */
#define	i_smode		i_ic.ic_smode
#define	i_mode		i_ic.ic_smode	/* same field as i_smode */
#define	i_nlink		i_ic.ic_nlink
#define	i_suid		i_ic.ic_suid
#define	i_sgid		i_ic.ic_sgid
#define	i_uid		i_ic.ic_uid	/* long EFT version of uid */
#define	i_gid		i_ic.ic_gid	/* long EFT version of gid */

/* size and block addresses */
#define	i_size		i_ic.ic_lsize
#define	i_blocks	i_ic.ic_blocks
#define	i_db		i_ic.ic_db
#define	i_ib		i_ic.ic_ib

/* timestamps */
#define	i_atime		i_ic.ic_atime
#define	i_mtime		i_ic.ic_mtime
#define	i_ctime		i_ic.ic_ctime

/* cflags, generation number, shadow inode and attribute directory */
#define	i_cflags	i_ic.ic_flags
#define	i_gen		i_ic.ic_gen
#define	i_shadow	i_ic.ic_shadow
#define	i_oeftflag	i_ic.ic_oeftflag

/* old-style rdev, kept in one of the first two direct block slots */
#ifdef _LITTLE_ENDIAN
/*
 * Originally done on x86, but carried on to all other little-endian
 * architectures, which provides for file system compatibility.
 */
#define	i_ordev		i_ic.ic_db[1]	/* USL SVR4 compatibility */
#else
#define	i_ordev		i_ic.ic_db[0]	/* was i_oldrdev */
#endif

/* hash chain links */
#define	i_forw		i_chain[0]
#define	i_back		i_chain[1]
317
/*
 * Shorthand member names for an on-disk inode (struct dinode).  di_ic
 * names the icommon inside the dinode's union and every other di_xxx
 * name expands to a field of it, mirroring the i_xxx names used for the
 * in-core inode.
 */
#define	di_ic		di_un.di_icom

/* EFT transition aids - obsolete */
#define	oEFT_MAGIC	0x90909090
#define	di_oeftflag	di_ic.ic_oeftflag

/* mode, link count and ownership */
#define	di_smode	di_ic.ic_smode
#define	di_mode		di_ic.ic_smode	/* same field as di_smode */
#define	di_nlink	di_ic.ic_nlink
#define	di_suid		di_ic.ic_suid
#define	di_sgid		di_ic.ic_sgid
#define	di_uid		di_ic.ic_uid
#define	di_gid		di_ic.ic_gid

/* size and block addresses */
#define	di_size		di_ic.ic_lsize
#define	di_blocks	di_ic.ic_blocks
#define	di_db		di_ic.ic_db
#define	di_ib		di_ic.ic_ib

/* timestamps */
#define	di_atime	di_ic.ic_atime
#define	di_mtime	di_ic.ic_mtime
#define	di_ctime	di_ic.ic_ctime

/* cflags, generation number and shadow inode */
#define	di_cflags	di_ic.ic_flags
#define	di_gen		di_ic.ic_gen
#define	di_shadow	di_ic.ic_shadow

/* old-style rdev; the slot used depends on byte order, as for i_ordev */
#ifdef _LITTLE_ENDIAN
#define	di_ordev	di_ic.ic_db[1]
#else
#define	di_ordev	di_ic.ic_db[0]
#endif
348
/* i_flag bits (one bit each, written at a uniform width) */
#define	IUPD		0x00001		/* file has been modified */
#define	IACC		0x00002		/* inode access time to be updated */
#define	IMOD		0x00004		/* inode has been modified */
#define	ICHG		0x00008		/* inode has been changed */
#define	INOACC		0x00010		/* no access time update in getpage */
#define	IMODTIME	0x00020		/* mod time already set */
#define	IREF		0x00040		/* inode is being referenced */
#define	ISYNC		0x00080		/* do all allocation synchronously */
#define	IFASTSYMLNK	0x00100		/* fast symbolic link */
#define	IMODACC		0x00200		/* only access time changed; */
					/*   filesystem won't become active */
#define	IATTCHG		0x00400		/* only size/blocks have changed */
#define	IBDWRITE	0x00800		/* the inode has been scheduled for */
					/*   asynchronous write operation */
#define	ISTALE		0x01000		/* inode couldn't be read from disk */
#define	IDEL		0x02000		/* inode is being deleted */
#define	IDIRECTIO	0x04000		/* attempt directio */
#define	ISEQ		0x08000		/* deferred i_seq increase */
#define	IJUNKIQ		0x10000		/* on junk idle queue */
#define	IQUIET		0x20000		/* no file system full messages */
370
/* i_cflags (ic_flags) bits */
#define	IXATTR		0x0001		/* extended attribute */
#define	IFALLOCATE	0x0002		/* fallocate'd file */
#define	ICOMPRESS	0x0004		/* compressed for dcfs - see the */
					/*   _FIO_COMPRESSED ufs_ioctl() */
376
/* file type part of the mode: (mode & IFMT) is one of the IF* values */
#define	IFMT		0170000		/* mask for the type of file */
#define	IFIFO		0010000		/* named pipe (fifo) */
#define	IFCHR		0020000		/* character special */
#define	IFDIR		0040000		/* directory */
#define	IFBLK		0060000		/* block special */
#define	IFREG		0100000		/* regular */
#define	IFLNK		0120000		/* symbolic link */
#define	IFSHAD		0130000		/* shadow inode */
#define	IFSOCK		0140000		/* socket */
#define	IFATTRDIR	0160000		/* attribute directory */

/* permission and special bits of the mode */
#define	ISUID		04000		/* set user id on execution */
#define	ISGID		02000		/* set group id on execution */
#define	ISVTX		01000		/* save swapped text even after use */
#define	IREAD		0400		/* read permission */
#define	IWRITE		0200		/* write permission */
#define	IEXEC		0100		/* execute permission */
395
/* how ufs_syncip() writes the inode info, listed by value */
#define	I_ASYNC		0		/* don't wait for the inode written */
#define	I_SYNC		1		/* wait for the inode written to disk */
#define	I_DSYNC		2		/* wait for the inode written to disk */
					/*   only if IATTCHG is set */
401
/* flag bits passed to ufs_itrunc(), indirtrunc(), and free() */
#define	I_FREE		0x00000001	/* inode is being freed */
#define	I_DIR		0x00000002	/* inode is a directory */
#define	I_IBLK		0x00000004	/* indirect block */
#define	I_CHEAP		0x00000008	/* cheap free */
#define	I_SHAD		0x00000010	/* inode is a shadow inode */
#define	I_QUOTA		0x00000020	/* quota file */
#define	I_NOCANCEL	0x00000040	/* don't cancel these fragments */
#define	I_ACCT		0x00000080	/* update ufsvfs' unreclaimed_blocks */
411
412/*
413 * If ufs_dircheckforname() fails to find an entry with the given name,
414 * this "slot" structure holds state for ufs_direnter_*() as to where
415 * there is space to put an entry with that name.
416 * If ufs_dircheckforname() finds an entry with the given name, this structure
417 * holds state for ufs_dirrename() and ufs_dirremove() as to where the
418 * entry is. "status" indicates what ufs_dircheckforname() found:
419 *      NONE            name not found, large enough free slot not found,
420 *      FOUND           name not found, large enough free slot found
421 *      EXIST           name found
422 * If ufs_dircheckforname() fails due to an error, this structure is not
423 * filled in.
424 *
425 * After ufs_dircheckforname() succeeds the values are:
426 *      status  offset          size            fbp, ep
427 *      ------  ------          ----            -------
428 *      NONE    end of dir      needed          not valid
429 *      FOUND   start of entry  of ent          both valid if fbp != NULL
430 *      EXIST   start of entry  of prev ent     valid
431 *
 * "endoff" is set to 0 if an entry with the given name is found, or if no
433 * free slot could be found or made; this means that the directory should not
434 * be truncated.  If the entry was found, the search terminates so
435 * ufs_dircheckforname() didn't find out where the last valid entry in the
436 * directory was, so it doesn't know where to cut the directory off; if no free
437 * slot could be found or made, the directory has to be extended to make room
438 * for the new entry, so there's nothing to cut off.
 * Otherwise, "endoff" is set to the larger of the offset of the last
 * non-empty entry in the directory and the offset at which the new entry
 * will be placed.  This is used by ufs_diraddentry(); if a new
442 * entry is to be added to the directory, any complete directory blocks at the
443 * end of the directory that contain no non-empty entries are lopped off the
444 * end, thus shrinking the directory dynamically.
445 */
/*
 * Result of a directory name search:
 *	NONE	name not found and no large enough free slot found
 *	FOUND	name not found, but a large enough free slot was found
 *	EXIST	name found
 */
typedef enum {NONE, FOUND, EXIST} slotstat_t;

/*
 * Search state handed from ufs_dircheckforname() to the routines that
 * enter, rename or remove a directory entry.  Which members are valid
 * depends on "status".
 */
struct ufs_slot {
	struct	direct *ep;	/* pointer to slot */
	struct	fbuf *fbp;	/* dir buf where slot is */
	off_t	offset;		/* offset of area with free space */
	off_t	endoff;		/* last useful location found in search */
	slotstat_t status;	/* status of slot */
	int	size;		/* size of area at "offset" */
	int	cached;		/* cached directory */
};
456
457/*
458 * Statistics on inodes
459 * Not protected by locks
460 */
/*
 * Inode cache statistics, one named kstat per counter.  Updates are not
 * protected by any lock, so the values are approximate.
 */
struct instats {
	kstat_named_t in_size;		/* current cache size */
	kstat_named_t in_maxsize;	/* maximum cache size */
	kstat_named_t in_hits;		/* cache hits */
	kstat_named_t in_misses;	/* cache misses */
	kstat_named_t in_malloc;	/* kmem_alloc'ed */
	kstat_named_t in_mfree;		/* kmem_free'd */
	kstat_named_t in_maxreached;	/* Largest size reached by cache */
	kstat_named_t in_frfront;	/* # put at front of freelist */
	kstat_named_t in_frback;	/* # put at back of freelist */
	kstat_named_t in_qfree;		/* q's to delete thread */
	kstat_named_t in_scan;		/* # inodes scanned */
	kstat_named_t in_tidles;	/* # inodes idled by idle thread */
	kstat_named_t in_lidles;	/* # inodes idled by ufs_lookup */
	kstat_named_t in_vidles;	/* # inodes idled by ufs_vget */
	kstat_named_t in_kcalloc;	/* # inodes kmem_cache_alloced */
	kstat_named_t in_kcfree;	/* # inodes kmem_cache_freed */
	kstat_named_t in_poc;		/* # push-on-close's */
};
480
481#ifdef _KERNEL
482
483/*
484 * Extended attributes
485 */
486
/*
 * NOTE(review): presumably the name under which a file's extended
 * attribute directory is reached - confirm against the lookup code.
 */
#define	XATTR_DIR_NAME	"/@/"
extern int	ufs_ninode;		/* high-water mark for inode cache */

extern struct vnodeops *ufs_vnodeops;	/* vnode operations for ufs */
/* template for the ufs vnode operations; defined outside this header */
extern const struct fs_operation_def ufs_vnodeops_template[];
492
/*
 * Pointer conversions:
 *	VTOI(vp)	vnode pointer -> its inode (the vnode's v_data)
 *	ITOV(ip)	inode pointer -> its vnode (the inode's i_vnode)
 *	ITOF(ip)	inode pointer -> struct fs pointer (the inode's i_fs)
 */
#define	VTOI(vp)	((struct inode *)(vp)->v_data)
#define	ITOV(ip)	((struct vnode *)(ip)->i_vnode)
#define	ITOF(ip)	((struct fs *)(ip)->i_fs)
503
504/*
505 * Convert between vnode types and inode formats
506 */
/* table of vnode types; NOTE(review): presumably indexed by inode format */
extern enum vtype	iftovt_tab[];

/* the reverse table is compiled out: "notneeded" is never defined here */
#ifdef notneeded

/* Look at sys/mode.h and os/vnode.c */

extern int		vttoif_tab[];

#endif
516
517/*
518 * Mark an inode with the current (unique) timestamp.
519 * (Note that UFS's concept of time only keeps 32 bits of seconds
520 * in the on-disk format).
521 */
/*
 * NOTE(review): unlike ufs_iuniqtime_lock below, iuniqtime is not declared
 * "extern", so this line is a tentative definition in every file that
 * includes this header (and a multiple-definition link error with
 * compilers that default to -fno-common).  It probably wants to become
 * "extern" with one definition in a .c file - confirm where the variable
 * is defined before changing it.
 */
struct timeval32 iuniqtime;
extern kmutex_t ufs_iuniqtime_lock;

/*
 * Per the i_flag locking rules, ITIMES_NOLOCK may only be used while
 * holding i_contents as writer, or i_contents as reader plus i_tlock.
 */
#define	ITIMES_NOLOCK(ip) ufs_itimes_nolock(ip)

/*
 * ITIMES runs ITIMES_NOLOCK with the inode's i_tlock held.  It expands to
 * a brace block, not an expression: do not use it as the unbraced body of
 * an "if" that has an "else".
 */
#define	ITIMES(ip) { \
	mutex_enter(&(ip)->i_tlock); \
	ITIMES_NOLOCK(ip); \
	mutex_exit(&(ip)->i_tlock); \
}
532
/*
 * The following interfaces are used to do atomic loads and stores
 * of an inode's i_size, which is a long long data type.
 *
 * For LP64, we just do a load or a store - atomicity and alignment
 * are 8-byte guaranteed.  For x86 there are no such instructions,
 * so we grab i_contents as reader to get the size; we already hold
 * it as writer when we're setting the size.
 *
 *	UFS_GET_ISIZE(resultp, ip)	store ip's i_size through resultp
 *	UFS_SET_ISIZE(value, ip)	set ip's i_size to value
 *
 * Use both only as complete statements followed by a semicolon: the
 * non-LP64 forms are brace blocks.  The LP64 forms are parenthesized so
 * that they always expand to one self-contained expression.
 */

#ifdef _LP64

#define	UFS_GET_ISIZE(resultp, ip)	(*(resultp) = (ip)->i_size)
#define	UFS_SET_ISIZE(value, ip)	((ip)->i_size = (value))

#else	/* _LP64 */

#define	UFS_GET_ISIZE(resultp, ip)				\
	{							\
		rw_enter(&(ip)->i_contents, RW_READER);		\
		*(resultp) = (ip)->i_size;			\
		rw_exit(&(ip)->i_contents);			\
	}
#define	UFS_SET_ISIZE(value, ip)				\
	{							\
		ASSERT(RW_WRITE_HELD(&(ip)->i_contents));	\
		(ip)->i_size = (value);				\
	}

#endif	/* _LP64 */
563
/*
 * Allocate the specified block in the inode
 * and make sure any in-core pages are initialized.
 * Expands to a bmap_write() call with block initialization type BI_NORMAL
 * (allocate and zero fill pages in memory).
 */
#define	BMAPALLOC(ip, off, size, cr) \
	bmap_write((ip), (u_offset_t)(off), (size), BI_NORMAL, NULL, (cr))

#define	ESAME	(-1)		/* trying to rename linked files (special) */

/*
 * Value used when no block is allocated.  Parenthesized so that the cast
 * and the -1 always stay together (e.g. under sizeof).
 */
#define	UFS_HOLE	((daddr32_t)-1)
574
/*
 * enums
 */

/* direnter ops */
enum de_op {
	DE_CREATE,
	DE_MKDIR,
	DE_LINK,
	DE_RENAME,
	DE_SYMLINK,
	DE_ATTRDIR
};

/* dirremove ops */
enum dr_op {
	DR_REMOVE,
	DR_RMDIR,
	DR_RENAME
};

/*
 * How bmap_write initializes the blocks it allocates:
 *
 * BI_NORMAL - allocate and zero fill pages in memory
 * BI_ALLOC_ONLY - only allocate the block, do not zero out pages in mem
 * BI_FALLOCATE - allocate only, do not zero out pages, and store as negative
 *                block number in inode block list
 */
enum bi_type {
	BI_NORMAL,
	BI_ALLOC_ONLY,
	BI_FALLOCATE
};
594
595/*
596 * This overlays the fid structure (see vfs.h)
597 *
598 * LP64 note: we use int32_t instead of ino_t since UFS does not use
599 * inode numbers larger than 32-bits and ufid's are passed to NFS
600 * which expects them to not grow in size beyond 10 bytes (12 including
601 * the length).
602 */
/*
 * UFS file identifier.  The two 16-bit and two 32-bit members add up to
 * 12 bytes; do not widen them.
 */
struct ufid {
	ushort_t ufid_len;	/* length field */
	ushort_t ufid_flags;
	int32_t	ufid_ino;	/* inode number, deliberately 32 bits */
	int32_t	ufid_gen;	/* NOTE(review): presumably i_gen - confirm */
};
609
610/*
611 * each ufs thread (see ufs_thread.c) is managed by this struct
612 */
/*
 * Work queue serviced by one ufs thread.  The queue head is a union so
 * that the same structure can head a list of inodes or of ufs failures;
 * use the uq_head/uq_ihead/uq_ufhead names below rather than the union
 * members directly.
 */
struct ufs_q {
	union uq_head {
		void		*_uq_generic;	/* first entry on q */
		struct inode	*_uq_i;
		ufs_failure_t	*_uq_uf;
	} _uq_head;
	int		uq_ne;		/* # of entries/failures found */
	int		uq_lowat;	/* thread runs when ne == lowat */
	int		uq_hiwat;	/* synchronous idle if ne >= hiwat */
	ushort_t	uq_flags;	/* flags (see below) */
	kcondvar_t	uq_cv;		/* for sleep/wakeup */
	kthread_id_t	uq_threadp;	/* thread managing this q */
	kmutex_t	uq_mutex;	/* protects this struct */
};

/* typed views of the queue head */
#define	uq_head		_uq_head._uq_generic
#define	uq_ihead	_uq_head._uq_i
#define	uq_ufhead	_uq_head._uq_uf
631
/* bits for the uq_flags member of struct ufs_q */
#define	UQ_EXIT		0x0001	/* q server exits at its convenience */
#define	UQ_WAIT		0x0002	/* thread is waiting on q server */
#define	UQ_SUSPEND	0x0004	/* request for suspension */
#define	UQ_SUSPENDED	0x0008	/* thread has suspended itself */
639
640/*
641 * When logging is enabled, statvfs must account for blocks and files that
642 * may be on the delete queue.  Protected by ufsvfsp->vfs_delete.uq_mutex
643 */
/*
 * Counts of what is still sitting on a file system's delete queue
 * (embedded in struct ufsvfs as vfs_delete_info).
 */
struct ufs_delq_info {
	u_offset_t	delq_unreclaimed_blocks;	/* blocks not yet freed */
	ulong_t		delq_unreclaimed_files;		/* files not yet freed */
};
648
649
650/*
651 * global idle queues
652 * The queues are sized dynamically in proportion to ufs_ninode
653 * which, unless overridden, scales with the amount of memory.
654 * The idle queue is halved whenever it hits the low water mark
655 * (1/4 of ufs_ninode), but can burst to sizes much larger. The number
656 * of hash queues is currently maintained to give on average IQHASHQLEN
657 * entries when the idle queue is at the low water mark.
658 * Note, we do not need to search along the hash queues, but use them
659 * in order to batch together geographically local inodes to allow
660 * their updates (via the log or buffer cache) to require less disk seeks.
661 * This gives an incredible performance boost for logging and a boost for
662 * non logging file systems.
663 */
/*
 * Head of one idle queue.  The members deliberately repeat the first
 * members of inode_t, in the same order, so that a queue head can be
 * linked into the idle list like an inode.
 */
typedef struct {
	inode_t *i_chain[2];	/* must match inode_t, but unused */
	inode_t *i_freef;	/* must match inode_t, idle list forward */
	inode_t *i_freeb;	/* must match inode_t, idle list back  */
} iqhead_t;

extern struct ufs_q ufs_idle_q;		/* used by global ufs idle thread */
extern iqhead_t *ufs_junk_iq;		/* junk idle queues */
extern iqhead_t *ufs_useful_iq;		/* useful idle queues */
extern int ufs_njunk_iq;		/* number of entries in junk iq */
extern int ufs_nuseful_iq;		/* number of entries in useful iq */
extern int ufs_niqhash;			/* number of iq hash qs - power of 2 */
extern int ufs_iqhashmask;		/* iq hash mask = ufs_niqhash - 1 */
677
#define	IQHASHQLEN 32			/* see comments above */
#define	INOCGSHIFT 7			/* 128 inodes per cylinder group */

/*
 * IQHASH(ip)	idle queue index for an inode: its inode number with the low
 *		INOCGSHIFT bits dropped, masked to the number of queues.
 * IQNEXT(i)	index of the idle queue after queue i, wrapping around at
 *		the end.  Fully parenthesized so that it can be compared or
 *		combined with other operands.
 */
#define	IQHASH(ip) (((ip)->i_number >> INOCGSHIFT) & ufs_iqhashmask)
#define	IQNEXT(i) (((i) + 1) & ufs_iqhashmask) /* next idle queue */
682
extern struct ufs_q	ufs_hlock;	/* used by global ufs hlock thread */

/*
 * vfs_lfflags flags
 */
#define	UFS_LARGEFILES	((ushort_t)0x1)	/* set if mount allows largefiles */

/*
 * vfs_dfritime flags
 */
#define	UFS_DFRATIME	0x1		/* deferred access time */
694
695/*
696 * UFS VFS private data.
697 *
698 * UFS file system instances may be linked on several lists.
699 *
700 * -	The vfs_next field chains together every extant ufs instance; this
701 *	list is rooted at ufs_instances and should be used in preference to
702 *	the overall vfs list (which is properly the province of the generic
703 *	file system code, not of file system implementations).  This same list
704 *	link is used during forcible unmounts to chain together instances that
 *	can't yet be completely dismantled.
706 *
707 * -	The vfs_wnext field is used within ufs_update to form a work list of
708 *	UFS instances to be synced out.
709 */
/*
 * Per-mount UFS private data.  Member-level comments below describe each
 * field; vfs_fs (defined after the structure) is a shorthand for the
 * superblock pointer held in the vfs_bufp buffer.
 */
typedef struct ufsvfs {
	struct vfs	*vfs_vfs;	/* back link			*/
	struct ufsvfs	*vfs_next;	/* instance list link		*/
	struct ufsvfs	*vfs_wnext;	/* work list link		*/
	struct vnode	*vfs_root;	/* root vnode			*/
	struct buf	*vfs_bufp;	/* buffer containing superblock */
	struct vnode	*vfs_devvp;	/* block device vnode		*/
	ushort_t	vfs_lfflags;	/* Large files (set by mount)   */
	ushort_t	vfs_qflags;	/* QUOTA: filesystem flags	*/
	struct inode	*vfs_qinod;	/* QUOTA: pointer to quota file */
	uint_t		vfs_btimelimit;	/* QUOTA: block time limit	*/
	uint_t		vfs_ftimelimit;	/* QUOTA: file time limit	*/
	krwlock_t	vfs_dqrwlock;	/* QUOTA: protects quota fields */
	/*
	 * some fs local threads
	 */
	struct ufs_q	vfs_delete;	/* delayed inode delete */
	struct ufs_q	vfs_reclaim;	/* reclaim open, deleted files */

	/*
	 * This is copied from the super block at mount time.
	 */
	int		vfs_nrpos;	/* # rotational positions */
	/*
	 * This lock protects cg's and super block pointed at by
	 * vfs_bufp->b_fs.  Locks contents of fs and cg's and contents
	 * of vfs_dio.
	 */
	kmutex_t	vfs_lock;
	struct ulockfs	vfs_ulockfs;	/* ufs lockfs support */
	uint_t		vfs_dio;	/* delayed io (_FIODIO) */
	uint_t		vfs_nointr;	/* disallow lockfs interrupts */
	uint_t		vfs_nosetsec;	/* disallow ufs_setsecattr */
	uint_t		vfs_syncdir;	/* synchronous local directory ops */
	uint_t		vfs_dontblock;	/* don't block on forced umount */

	/*
	 * trans (logging ufs) stuff
	 */
	uint_t		vfs_domatamap;	/* set if matamap enabled */
	ulong_t		vfs_maxacl;	/* transaction stuff - max acl size */
	ulong_t		vfs_dirsize;	/* logspace for directory creation */
	ulong_t		vfs_avgbfree;	/* average free blks in cg (blkpref) */
	/*
	 * Some useful constants
	 */
	int	vfs_nindirshift;	/* calc. from fs_nindir */
	int	vfs_nindiroffset;	/* calc. from fs_nindir */
	int	vfs_ioclustsz;		/* bytes in read/write cluster */
	int	vfs_iotransz;		/* max device i/o transfer size  */

	vfs_ufsfx_t	vfs_fsfx;	/* lock/fix-on-panic support */
	/*
	 * More useful constants
	 */
	int	vfs_minfrags;		/* calc. from fs_minfree */
	/*
	 * Force DirectIO on all files
	 */
	uint_t	vfs_forcedirectio;
	/*
	 * Deferred inode time related fields
	 */
	clock_t		vfs_iotstamp;	/* last I/O timestamp */
	uint_t		vfs_dfritime;	/* deferred inode time flags */
	/*
	 * Some more useful info
	 */
	dev_t		vfs_dev;	/* device mounted from */
	struct ml_unit	*vfs_log;	/* pointer to embedded log struct */
	uint_t		vfs_noatime;    /* disable inode atime updates */
	/*
	 * snapshot stuff
	 */
	void		*vfs_snapshot;	/* snapshot handle */
	/*
	 *  Controls logging "file system full" messages to messages file
	 */
	clock_t		vfs_lastwhinetime;

	int 		vfs_nolog_si;	/* not logging summary info */
	int		vfs_validfs;	/* indicates mounted fs */

	/*
	 * Additional information about vfs_delete above
	 */
	struct ufs_delq_info vfs_delete_info; /* what's on the delete queue */
} ufsvfs_t;

/* ufsvfsp->vfs_fs: the b_un.b_fs member of the superblock buffer */
#define	vfs_fs	vfs_bufp->b_un.b_fs
800
/* states kept in the vfs_validfs member of struct ufsvfs */
#define	UT_UNMOUNTED		0
#define	UT_MOUNTED		1
#define	UT_HLOCKING		2
807
/*
 * Hash an inode number into an inode hash bucket index.  inohsz is
 * guaranteed to be a power of 2, so masking with (inohsz - 1) yields a
 * value in [0, inohsz).  The argument is parenthesized so that the cast
 * applies to the whole argument expression, not just its first operand.
 */
#define	INOHASH(ino)	(((int)(ino)) & (inohsz - 1))
810
/*
 * True when block number bn of inode ip denotes an fallocate'd block:
 * bn is negative, a multiple of the file system's fs_frag, not UFS_HOLE,
 * and the inode has the IFALLOCATE cflag set.  Every use of ip is
 * parenthesized so that any pointer expression can be passed.
 */
#define	ISFALLOCBLK(ip, bn)	\
	(((bn) < 0) && ((bn) % (ip)->i_fs->fs_frag == 0) && \
	((ip)->i_cflags & IFALLOCATE && (bn) != UFS_HOLE))
814
/*
 * Inode hash bucket head: one pair of pointers that can be viewed either as
 * links to other bucket heads (ih_head) or as links to inodes (ih_chain).
 * NOTE(review): presumably the two slots are the forward/back links of a
 * doubly linked chain -- confirm against the hash code in ufs_inode.c.
 */
union ihead {
	union	ihead	*ih_head[2];
	struct	inode	*ih_chain[2];
};
819
820extern	union	ihead	*ihead;
821extern  kmutex_t	*ih_lock;
822extern  int	*ih_ne;
823extern	int	inohsz;
824
825extern	clock_t	ufs_iowait;
826
827#endif /* _KERNEL */
828
829/*
830 * ufs function prototypes
831 */
832#if defined(_KERNEL) && !defined(_BOOT)
833
834extern	void	ufs_iinit(void);
835extern	int	ufs_iget(struct vfs *, ino_t, struct inode **, cred_t *);
836extern	int	ufs_iget_alloced(struct vfs *, ino_t, struct inode **,
837    cred_t *);
838extern	void	ufs_reset_vnode(vnode_t *);
839extern	void	ufs_iinactive(struct inode *);
840extern	void	ufs_iupdat(struct inode *, int);
841extern	int	ufs_rmidle(struct inode *);
842extern	int	ufs_itrunc(struct inode *, u_offset_t, int, cred_t *);
843extern	int	ufs_iaccess(struct inode *, int, cred_t *, int);
844extern  int	rdip(struct inode *, struct uio *, int, struct cred *);
845extern  int	wrip(struct inode *, struct uio *, int, struct cred *);
846
847extern void	ufs_imark(struct inode *);
848extern void	ufs_itimes_nolock(struct inode *);
849
850extern	int	ufs_diraccess(struct inode *, int, struct cred *);
851extern	int	ufs_dirlook(struct inode *, char *, struct inode **,
852    cred_t *, int);
853extern	int	ufs_direnter_cm(struct inode *, char *, enum de_op,
854    struct vattr *, struct inode **, cred_t *, int);
855extern	int	ufs_direnter_lr(struct inode *, char *, enum de_op,
856    struct inode *, struct inode *, cred_t *, vnode_t **);
857extern	int	ufs_dircheckpath(ino_t, struct inode *, struct inode *,
858    struct cred *);
859extern	int	ufs_dirmakeinode(struct inode *, struct inode **,
860    struct vattr *, enum de_op, cred_t *);
861extern	int	ufs_dirremove(struct inode *, char *, struct inode *,
862    vnode_t *, enum dr_op, cred_t *, vnode_t **);
863extern  int	ufs_dircheckforname(struct inode *, char *, int,
864    struct ufs_slot *, struct inode **, struct cred *, int);
865extern	int	ufs_xattrdirempty(struct inode *, ino_t, cred_t *);
866extern	int	blkatoff(struct inode *, off_t, char **, struct fbuf **);
867
868extern	void	sbupdate(struct vfs *);
869
870extern	int	ufs_ialloc(struct inode *, ino_t, mode_t, struct inode **,
871    cred_t *);
872extern	void	ufs_ifree(struct inode *, ino_t, mode_t);
873extern	void	free(struct inode *, daddr_t, off_t, int);
874extern	int	alloc(struct inode *, daddr_t, int, daddr_t *, cred_t *);
875extern	int	realloccg(struct inode *, daddr_t, daddr_t, int, int,
876    daddr_t *, cred_t *);
877extern	int	ufs_allocsp(struct vnode *, struct flock64 *, cred_t *);
878extern	int	ufs_freesp(struct vnode *, struct flock64 *, int, cred_t *);
879extern	ino_t	dirpref(inode_t *);
880extern	daddr_t	blkpref(struct inode *, daddr_t, int, daddr32_t *);
881extern	daddr_t	contigpref(ufsvfs_t *, size_t, size_t);
882
883extern	int	ufs_rdwri(enum uio_rw, int, struct inode *, caddr_t, ssize_t,
884	offset_t, enum uio_seg, int *, cred_t *);
885
886extern	int	bmap_read(struct inode *, u_offset_t, daddr_t *, int *);
887extern	int	bmap_write(struct inode *, u_offset_t, int, enum bi_type,
888    daddr_t *, struct cred *);
889extern	int	bmap_has_holes(struct inode *);
890extern	int	bmap_find(struct inode *, boolean_t, u_offset_t *);
891extern	int	bmap_set_bn(struct vnode *, u_offset_t, daddr32_t);
892
893extern	void	ufs_vfs_add(struct ufsvfs *);
894extern	void	ufs_vfs_remove(struct ufsvfs *);
895
896extern	void	ufs_sbwrite(struct ufsvfs *);
897extern	void	ufs_update(int);
898extern	int	ufs_getsummaryinfo(dev_t, struct ufsvfs *, struct fs *);
899extern	int	ufs_putsummaryinfo(dev_t, struct ufsvfs *, struct fs *);
900extern	int	ufs_syncip(struct inode *, int, int, top_t);
901extern	int	ufs_sync_indir(struct inode *);
902extern	int	ufs_indirblk_sync(struct inode *, offset_t);
903extern	int	ufs_badblock(struct inode *, daddr_t);
904extern	int	ufs_indir_badblock(struct inode *, daddr32_t *);
905extern	void	ufs_notclean(struct ufsvfs *);
906extern	void	ufs_checkclean(struct vfs *);
907extern	int	isblock(struct fs *, uchar_t *, daddr_t);
908extern	void	setblock(struct fs *, uchar_t *, daddr_t);
909extern	void	clrblock(struct fs *, uchar_t *, daddr_t);
910extern	int	isclrblock(struct fs *, uchar_t *, daddr_t);
911extern	void	fragacct(struct fs *, int, int32_t *, int);
912extern	int	skpc(char, uint_t, char *);
913extern	int	ufs_fbwrite(struct fbuf *, struct inode *);
914extern	int	ufs_fbiwrite(struct fbuf *, struct inode *, daddr_t, long);
915extern	int	ufs_putapage(struct vnode *, struct page *, u_offset_t *,
916				size_t *, int, struct cred *);
917extern inode_t	*ufs_alloc_inode(ufsvfs_t *, ino_t);
918extern void	ufs_free_inode(inode_t *);
919
/*
 * special stuff
 *
 * ufs_scan_inodes() takes a callback of the form
 * int (*)(struct inode *, void *) together with a void * argument;
 * ufs_sync_inode() has that callback shape.
 */
extern	void	ufs_setreclaim(struct inode *);
extern	int	ufs_scan_inodes(int, int (*)(struct inode *, void *), void *,
    struct ufsvfs *);
extern	int	ufs_sync_inode(struct inode *, void *);
extern	int	ufs_sticky_remove_access(struct inode *, struct inode *,
    struct cred *);
929/*
930 * quota
931 */
932extern	int	chkiq(struct ufsvfs *, int, struct inode *, uid_t, int,
933			struct cred *, char **errp, size_t *lenp);
934
/*
 * ufs thread stuff
 *
 * Routines operating on the delete queue, the idle queue and generic
 * struct ufs_q worker threads.  Only the signatures are visible here.
 * NOTE(review): ufs_thread_start() takes an unprototyped function pointer
 * (void (*)()); tighten it once the expected signature is confirmed.
 */
extern	void	ufs_thread_delete(struct vfs *);
extern	void	ufs_delete_drain(struct vfs *, int, int);
extern	void	ufs_delete(struct ufsvfs *, struct inode *, int);
extern	void	ufs_inode_cache_reclaim(void *);
extern	void	ufs_idle_drain(struct vfs *);
extern	void	ufs_idle_some(int);
extern	void	ufs_thread_idle(void);
extern	void	ufs_thread_reclaim(struct vfs *);
extern	void	ufs_thread_init(struct ufs_q *, int);
extern	void	ufs_thread_start(struct ufs_q *, void (*)(), struct vfs *);
extern	void	ufs_thread_exit(struct ufs_q *);
extern	void	ufs_thread_suspend(struct ufs_q *);
extern	void	ufs_thread_continue(struct ufs_q *);
extern	void	ufs_thread_hlock(void *);
extern	void	ufs_delete_init(struct ufsvfs *, int);
extern	void	ufs_delete_adjust_stats(struct ufsvfs *, struct statvfs64 *);
extern	void	ufs_delete_drain_wait(struct ufsvfs *, int);
955
956/*
957 * ufs lockfs stuff
958 */
959struct seg;
960extern int ufs_reconcile_fs(struct vfs *, struct ufsvfs *, int);
961extern int ufs_quiesce(struct ulockfs *);
962extern int ufs_flush(struct vfs *);
963extern int ufs_fiolfs(struct vnode *, struct lockfs *, int);
964extern int ufs__fiolfs(struct vnode *, struct lockfs *, int, int);
965extern int ufs_fiolfss(struct vnode *, struct lockfs *);
966extern int ufs_fioffs(struct vnode *, char *, struct cred *);
967extern int ufs_check_lockfs(struct ufsvfs *, struct ulockfs *, ulong_t);
968extern int ufs_lockfs_begin(struct ufsvfs *, struct ulockfs **, ulong_t);
969extern int ufs_lockfs_trybegin(struct ufsvfs *, struct ulockfs **, ulong_t);
970extern int ufs_lockfs_begin_getpage(struct ufsvfs *, struct ulockfs **,
971		struct seg *, int, uint_t *);
972extern void ufs_lockfs_end(struct ulockfs *);
973/*
974 * ufs acl stuff
975 */
976extern int ufs_si_inherit(struct inode *, struct inode *, o_mode_t, cred_t *);
977extern void si_cache_init(void);
978extern int ufs_si_load(struct inode *, cred_t *);
979extern void ufs_si_del(struct inode *);
980extern int ufs_acl_access(struct inode *, int, cred_t *);
981extern void ufs_si_cache_flush(dev_t);
982extern int ufs_si_free(si_t *, struct vfs *, cred_t *);
983extern int ufs_acl_setattr(struct inode *, struct vattr *, cred_t *);
984extern int ufs_acl_get(struct inode *, vsecattr_t *, int, cred_t *);
985extern int ufs_acl_set(struct inode *, vsecattr_t *, int, cred_t *);
986/*
987 * ufs directio stuff
988 */
989extern void ufs_directio_init();
990extern int ufs_directio_write(struct inode *, uio_t *, int, int, cred_t *,
991    int *);
992extern int ufs_directio_read(struct inode *, uio_t *, cred_t *, int *);
/*
 * Direct I/O status values.
 * NOTE(review): presumably stored through the trailing int * argument of
 * ufs_directio_read()/ufs_directio_write() -- confirm in the directio code.
 */
#define	DIRECTIO_FAILURE	(0)
#define	DIRECTIO_SUCCESS	(1)
995
996/*
997 * ufs extensions for PXFS
998 */
999
1000int ufs_rdwr_data(vnode_t *vp, u_offset_t offset, size_t len, fdbuffer_t *fdb,
1001    int flags, cred_t *cr);
1002int ufs_alloc_data(vnode_t *vp, u_offset_t offset, size_t *len, fdbuffer_t *fdb,
1003    int flags, cred_t *cr);
1004
/*
 * prototypes to support the forced unmount
 */

void ufs_freeze(struct ulockfs *, struct lockfs *);
int ufs_thaw(struct vfs *, struct ufsvfs *, struct ulockfs *);
1011
1012/*
1013 * extended attributes
1014 */
1015
1016int ufs_xattrmkdir(inode_t *, inode_t **, int, struct cred *);
1017int ufs_xattr_getattrdir(vnode_t *, inode_t **, int, struct cred *);
1018void ufs_unhook_shadow(inode_t *, inode_t *);
1019
1020#endif	/* defined(_KERNEL) && !defined(_BOOT) */
1021
1022#ifdef	__cplusplus
1023}
1024#endif
1025
1026#endif	/* _SYS_FS_UFS_INODE_H */
1027