xfs_vnodeops.c revision 159451
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir_sf.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_dir_leaf.h"
#include "xfs_itable.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_rw.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_utils.h"
#include "xfs_rtalloc.h"
#include "xfs_refcache.h"
#include "xfs_trans_space.h"
#include "xfs_log_priv.h"
#include "xfs_mac.h"
/*
 * The maximum pathlen is 1024 bytes. Since the minimum file system
 * blocksize is 512 bytes, we can get a max of 2 extents back from
 * bmapi.
 */
#define SYMLINK_MAPS 2

/*
 * For xfs, we check that the file isn't too big to be opened by this kernel.
 * No other open action is required for regular files.  Devices are handled
 * through the specfs file system, pipes through fifofs.  Device and
 * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
 * when a new vnode is first looked up or created.
 */
STATIC int
xfs_open(
	bhv_desc_t	*bdp,
	cred_t		*credp)
{
	int		mode;
	xfs_vnode_t	*vp;
	xfs_inode_t	*ip;

	vp = BHV_TO_VNODE(bdp);
	ip = XFS_BHVTOI(bdp);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return XFS_ERROR(EIO);

	/*
	 * If it's a directory with any blocks, read-ahead block 0
	 * as we're almost certain to have the next operation be a read there.
	 */
	if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
		mode = xfs_ilock_map_shared(ip);
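		/*
		 * Re-check under the lock; the unlocked test above was
		 * only an optimization to avoid locking empty directories.
		 */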
		if (ip->i_d.di_nextents > 0)
			(void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
		xfs_iunlock(ip, mode);
	}
	return 0;
}


/*
 * xfs_getattr
 */
STATIC int
xfs_getattr(
	bhv_desc_t	*bdp,
	xfs_vattr_t	*vap,
	int		flags,
	cred_t		*credp)
{
	xfs_inode_t	*ip;
	xfs_mount_t	*mp;
	xfs_vnode_t	*vp;

	vp  = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);
	mp = ip->i_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	if (!(flags & ATTR_LAZY))
		xfs_ilock(ip, XFS_ILOCK_SHARED);

	vap->va_size = ip->i_d.di_size;
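	/* Fast path for callers that only want the file size. */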
	if (vap->va_mask == XFS_AT_SIZE)
		goto all_done;

	vap->va_nblocks =
		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
	vap->va_nodeid = ip->i_ino;
#if XFS_BIG_INUMS
	vap->va_nodeid += mp->m_inoadd;
#endif
	vap->va_nlink = ip->i_d.di_nlink;

	/*
	 * Quick exit for non-stat callers
	 */
	if ((vap->va_mask &
	    ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
	      XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
		goto all_done;

	/*
	 * Copy from in-core inode.
	 */
	vap->va_mode = ip->i_d.di_mode;
	vap->va_uid = ip->i_d.di_uid;
	vap->va_gid = ip->i_d.di_gid;
	vap->va_projid = ip->i_d.di_projid;

	/*
	 * Check vnode type block/char vs. everything else.
	 */
	switch (ip->i_d.di_mode & S_IFMT) {
	case S_IFBLK:
	case S_IFCHR:
		vap->va_rdev = ip->i_df.if_u2.if_rdev;
		vap->va_blocksize = BLKDEV_IOSIZE;
		break;
	default:
		vap->va_rdev = 0;

		if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
			vap->va_blocksize = xfs_preferred_iosize(mp);
		} else {

			/*
			 * If the file blocks are being allocated from a
			 * realtime partition, then return the inode's
			 * realtime extent size or the realtime volume's
			 * extent size.
			 */
			vap->va_blocksize = ip->i_d.di_extsize ?
				(ip->i_d.di_extsize << mp->m_sb.sb_blocklog) :
				(mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
		}
		break;
	}

	vn_atime_to_timespec(vp, &vap->va_atime);
	vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
	vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
	vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
	vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;

	/*
	 * Exit for stat callers.  See if any of the rest of the fields
	 * to be filled in are needed.
	 */
	if ((vap->va_mask &
	     (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
	      XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
		goto all_done;

	/*
	 * Convert di_flags to xflags.
	 */
	vap->va_xflags = xfs_ip2xflags(ip);

	/*
	 * Exit for inode revalidate.  See if any of the rest of
	 * the fields to be filled in are needed.
	 */
	if ((vap->va_mask &
	     (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
	      XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
		goto all_done;

	vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
	vap->va_nextents =
		(ip->i_df.if_flags & XFS_IFEXTENTS) ?
			ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
			ip->i_d.di_nextents;
	if (ip->i_afp)
		vap->va_anextents =
			(ip->i_afp->if_flags & XFS_IFEXTENTS) ?
				ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
				ip->i_d.di_anextents;
	else
		vap->va_anextents = 0;
	vap->va_gen = ip->i_d.di_gen;

 all_done:
	if (!(flags & ATTR_LAZY))
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return 0;
}


/*
 * xfs_setattr
 */
int
xfs_setattr(
	bhv_desc_t		*bdp,
	xfs_vattr_t		*vap,
	int			flags,
	cred_t			*credp)
{
	xfs_inode_t		*ip;
	xfs_trans_t		*tp;
	xfs_mount_t		*mp;
	int			mask;
	int			code;
	uint			lock_flags;
	uint			commit_flags=0;
	uid_t			uid=0, iuid=0;
	gid_t			gid=0, igid=0;
	int			timeflags = 0;
	xfs_vnode_t		*vp;
	xfs_prid_t		projid=0, iprojid=0;
	int			mandlock_before, mandlock_after;
	struct xfs_dquot	*udqp, *gdqp, *olddquot1, *olddquot2;
	int			file_owner;
	int			need_iolock = 1;

	vp = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
		return XFS_ERROR(EROFS);

	/*
	 * Cannot set certain attributes.
	 */
	mask = vap->va_mask;
	if (mask & XFS_AT_NOSET) {
		return XFS_ERROR(EINVAL);
	}

	ip = XFS_BHVTOI(bdp);
	mp = ip->i_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/*
	 * Timestamps do not need to be logged and hence do not
	 * need to be done within a transaction.
	 */
	if (mask & XFS_AT_UPDTIMES) {
		ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
		timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
			    ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
			    ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
		xfs_ichgtime(ip, timeflags);
		return 0;
	}

	olddquot1 = olddquot2 = NULL;
	udqp = gdqp = NULL;

	/*
	 * If disk quotas are on, we make sure that the dquots do exist on
	 * disk, before we start any other transactions. Trying to do this
	 * later is messy. We don't care to take a readlock to look at the
	 * ids in the inode here, because we can't hold it across the
	 * trans_reserve. If the IDs do change before we take the ilock,
	 * we're covered because the i_*dquot fields will get updated anyway.
	 */
	if (XFS_IS_QUOTA_ON(mp) &&
	    (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
		uint	qflags = 0;

		if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
			uid = vap->va_uid;
			qflags |= XFS_QMOPT_UQUOTA;
		} else {
			uid = ip->i_d.di_uid;
		}
		if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
			gid = vap->va_gid;
			qflags |= XFS_QMOPT_GQUOTA;
		} else {
			gid = ip->i_d.di_gid;
		}
		if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
			projid = vap->va_projid;
			qflags |= XFS_QMOPT_PQUOTA;
		} else {
			projid = ip->i_d.di_projid;
		}
		/*
		 * We take a reference when we initialize udqp and gdqp,
		 * so it is important that we never blindly double trip on
		 * the same variable. See xfs_create() for an example.
		 */
		ASSERT(udqp == NULL);
		ASSERT(gdqp == NULL);
		code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
					 &udqp, &gdqp);
		if (code)
			return code;
	}

	/*
	 * For the other attributes, we acquire the inode lock and
	 * first do an error checking pass.
	 */
	tp = NULL;
	lock_flags = XFS_ILOCK_EXCL;
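	/* ATTR_NOLOCK is only valid for DMI callers, which hold the iolock. */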
	ASSERT(flags & ATTR_NOLOCK ? flags & ATTR_DMI : 1);
	if (flags & ATTR_NOLOCK)
		need_iolock = 0;
	if (!(mask & XFS_AT_SIZE)) {
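		/*
		 * Timestamp-only updates on a non-wsync filesystem need no
		 * transaction; they are applied via xfs_ichgtime() later on.
		 */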
		if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
		    (mp->m_flags & XFS_MOUNT_WSYNC)) {
			tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
			commit_flags = 0;
			if ((code = xfs_trans_reserve(tp, 0,
						     XFS_ICHANGE_LOG_RES(mp), 0,
						     0, 0))) {
				lock_flags = 0;
				goto error_return;
			}
		}
	} else {
		if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
		    !(flags & ATTR_DMI)) {
			int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
			code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
				vap->va_size, 0, dmflags, NULL);
			if (code) {
				lock_flags = 0;
				goto error_return;
			}
		}
		if (need_iolock)
			lock_flags |= XFS_IOLOCK_EXCL;
	}

	xfs_ilock(ip, lock_flags);

	/* boolean: are we the file owner? */
	file_owner = (current_fsuid(credp) == ip->i_d.di_uid);

	/*
	 * Change various properties of a file.
	 * Only the owner or users with CAP_FOWNER
	 * capability may do these things.
	 */
	if (mask &
	    (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
	     XFS_AT_GID|XFS_AT_PROJID)) {
		/*
		 * CAP_FOWNER overrides the following restrictions:
		 *
		 * The user ID of the calling process must be equal
		 * to the file owner ID, except in cases where the
		 * CAP_FSETID capability is applicable.
		 */
		if (!file_owner && !capable(CAP_FOWNER)) {
			code = XFS_ERROR(EPERM);
			goto error_return;
		}

		/*
		 * CAP_FSETID overrides the following restrictions:
		 *
		 * The effective user ID of the calling process shall match
		 * the file owner when setting the set-user-ID and
		 * set-group-ID bits on that file.
		 *
		 * The effective group ID or one of the supplementary group
		 * IDs of the calling process shall match the group owner of
		 * the file when setting the set-group-ID bit on that file
		 */
		if (mask & XFS_AT_MODE) {
			mode_t m = 0;

			if ((vap->va_mode & S_ISUID) && !file_owner)
				m |= S_ISUID;
			if ((vap->va_mode & S_ISGID) &&
			    !groupmember((gid_t)ip->i_d.di_gid, credp))
				m |= S_ISGID;
			/* Linux allows this, Irix doesn't. */
			if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
				m |= S_ISVTX;
			if (m && !capable(CAP_FSETID))
				vap->va_mode &= ~m;
		}
	}

	/*
	 * Change file ownership.  Must be the owner or privileged.
	 * If the system was configured with the "restricted_chown"
	 * option, the owner is not permitted to give away the file,
	 * and can change the group id only to a group of which he
	 * or she is a member.
	 */
	if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
		/*
		 * These IDs could have changed since we last looked at them.
		 * But, we're assured that if the ownership did change
		 * while we didn't have the inode locked, inode's dquot(s)
		 * would have changed also.
		 */
		iuid = ip->i_d.di_uid;
		iprojid = ip->i_d.di_projid;
		igid = ip->i_d.di_gid;
		gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
		uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;

		projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
			 iprojid;

		/*
		 * CAP_CHOWN overrides the following restrictions:
		 *
		 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
		 * shall override the restriction that a process cannot
		 * change the user ID of a file it owns and the restriction
		 * that the group ID supplied to the chown() function
		 * shall be equal to either the group ID or one of the
		 * supplementary group IDs of the calling process.
		 */
		if (restricted_chown &&
		    (iuid != uid || (igid != gid &&
				     !groupmember((gid_t)gid, credp))) &&
		    !capable(CAP_CHOWN)) {
			code = XFS_ERROR(EPERM);
			goto error_return;
		}
		/*
		 * Do a quota reservation only if uid/projid/gid is actually
		 * going to change.
		 */
		if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
		    (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
		    (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
			ASSERT(tp);
			code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
						capable(CAP_FOWNER) ?
						XFS_QMOPT_FORCE_RES : 0);
			if (code)	/* out of quota */
				goto error_return;
		}
	}

	/*
	 * Truncate file.  Must have write permission and not be a directory.
	 */
	if (mask & XFS_AT_SIZE) {
		/* Short circuit the truncate case for zero length files */
		if ((vap->va_size == 0) &&
		    (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			lock_flags &= ~XFS_ILOCK_EXCL;
			if (mask & XFS_AT_CTIME)
				xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
			code = 0;
			goto error_return;
		}

		if (VN_ISDIR(vp)) {
			code = XFS_ERROR(EISDIR);
			goto error_return;
		} else if (!VN_ISREG(vp)) {
			code = XFS_ERROR(EINVAL);
			goto error_return;
		}
		/*
		 * Make sure that the dquots are attached to the inode.
		 */
		if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
			goto error_return;
	}

	/*
	 * Change file access or modified times.
	 */
	if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
		if (!file_owner) {
			if ((flags & ATTR_UTIME) &&
			    !capable(CAP_FOWNER)) {
				code = XFS_ERROR(EPERM);
				goto error_return;
			}
		}
	}

	/*
	 * Change extent size or realtime flag.
	 */
	if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
		/*
		 * Can't change extent size if any extents are allocated.
		 */
		if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
		     vap->va_extsize)) {
			code = XFS_ERROR(EINVAL);	/* EFBIG? */
			goto error_return;
		}
		/*
		 * Can't change realtime flag if any extents are allocated.
		 */
		if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
		    (mask & XFS_AT_XFLAGS) &&
		    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
			code = XFS_ERROR(EINVAL);	/* EFBIG? */
			goto error_return;
		}

		/*
		 * Extent size must be a multiple of the appropriate block
		 * size, if set at all.
		 */
		if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
			xfs_extlen_t	size;

			if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
			    ((mask & XFS_AT_XFLAGS) &&
			    (vap->va_xflags & XFS_XFLAG_REALTIME))) {
				size = mp->m_sb.sb_rextsize <<
				       mp->m_sb.sb_blocklog;
			} else {
				size = mp->m_sb.sb_blocksize;
			}
			if (vap->va_extsize % size) {
				code = XFS_ERROR(EINVAL);
				goto error_return;
			}
		}
		/*
		 * If realtime flag is set then must have realtime data.
		 */
		if ((mask & XFS_AT_XFLAGS) &&
		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
			if ((mp->m_sb.sb_rblocks == 0) ||
			    (mp->m_sb.sb_rextsize == 0) ||
			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
				code = XFS_ERROR(EINVAL);
				goto error_return;
			}
		}

		/*
		 * Can't modify an immutable/append-only file unless
		 * we have appropriate permission.
		 */
		if ((mask & XFS_AT_XFLAGS) &&
		    (ip->i_d.di_flags &
				(XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
		     (vap->va_xflags &
				(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
		    !capable(CAP_LINUX_IMMUTABLE)) {
			code = XFS_ERROR(EPERM);
			goto error_return;
		}
	}

	/*
	 * Now we can make the changes.  Before we join the inode
	 * to the transaction, if XFS_AT_SIZE is set then take care of
	 * the part of the truncation that must be done without the
	 * inode lock.  This needs to be done before joining the inode
	 * to the transaction, because the inode cannot be unlocked
	 * once it is a part of the transaction.
	 */
	if (mask & XFS_AT_SIZE) {
		code = 0;
		if ((vap->va_size > ip->i_d.di_size) &&
		    (flags & ATTR_NOSIZETOK) == 0) {
			code = xfs_igrow_start(ip, vap->va_size, credp);
		}
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		vn_iowait(vp); /* wait for the completion of any pending DIOs */
		if (!code)
			code = xfs_itruncate_data(ip, vap->va_size);
		if (code) {
			ASSERT(tp == NULL);
			lock_flags &= ~XFS_ILOCK_EXCL;
			ASSERT(lock_flags == XFS_IOLOCK_EXCL);
			goto error_return;
		}
		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
		if ((code = xfs_trans_reserve(tp, 0,
					     XFS_ITRUNCATE_LOG_RES(mp), 0,
					     XFS_TRANS_PERM_LOG_RES,
					     XFS_ITRUNCATE_LOG_COUNT))) {
			xfs_trans_cancel(tp, 0);
			if (need_iolock)
				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return code;
		}
		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
		xfs_ilock(ip, XFS_ILOCK_EXCL);
	}

	if (tp) {
		xfs_trans_ijoin(tp, ip, lock_flags);
		xfs_trans_ihold(tp, ip);
	}

	/* determine whether mandatory locking mode changes */
	mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
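	/* MANDLOCK: setgid set with group-execute clear enables mandatory locking. */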

	/*
	 * Truncate file.  Must have write permission and not be a directory.
	 */
	if (mask & XFS_AT_SIZE) {
		if (vap->va_size > ip->i_d.di_size) {
			xfs_igrow_finish(tp, ip, vap->va_size,
			    !(flags & ATTR_DMI));
		} else if ((vap->va_size <= ip->i_d.di_size) ||
			   ((vap->va_size == 0) && ip->i_d.di_nextents)) {
			/*
			 * signal a sync transaction unless
			 * we're truncating an already unlinked
			 * file on a wsync filesystem
			 */
			code = xfs_itruncate_finish(&tp, ip,
					    (xfs_fsize_t)vap->va_size,
					    XFS_DATA_FORK,
					    ((ip->i_d.di_nlink != 0 ||
					      !(mp->m_flags & XFS_MOUNT_WSYNC))
					     ? 1 : 0));
			if (code) {
				goto abort_return;
			}
		}
		/*
		 * Have to do this even if the file's size doesn't change.
		 */
		timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
	}

	/*
	 * Change file access modes.
	 */
	if (mask & XFS_AT_MODE) {
		ip->i_d.di_mode &= S_IFMT;
		ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;

		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		timeflags |= XFS_ICHGTIME_CHG;
	}

	/*
	 * Change file ownership.  Must be the owner or privileged.
	 * If the system was configured with the "restricted_chown"
	 * option, the owner is not permitted to give away the file,
	 * and can change the group id only to a group of which he
	 * or she is a member.
	 */
	if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
		/*
		 * CAP_FSETID overrides the following restrictions:
		 *
		 * The set-user-ID and set-group-ID bits of a file will be
		 * cleared upon successful return from chown()
		 */
		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
		    !capable(CAP_FSETID)) {
			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
		}

		/*
		 * Change the ownerships and register quota modifications
		 * in the transaction.
		 */
		if (iuid != uid) {
			if (XFS_IS_UQUOTA_ON(mp)) {
				ASSERT(mask & XFS_AT_UID);
				ASSERT(udqp);
				olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
							&ip->i_udquot, udqp);
			}
			ip->i_d.di_uid = uid;
		}
		if (igid != gid) {
			if (XFS_IS_GQUOTA_ON(mp)) {
				ASSERT(!XFS_IS_PQUOTA_ON(mp));
				ASSERT(mask & XFS_AT_GID);
				ASSERT(gdqp);
				olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
							&ip->i_gdquot, gdqp);
			}
			ip->i_d.di_gid = gid;
		}
		if (iprojid != projid) {
			if (XFS_IS_PQUOTA_ON(mp)) {
				ASSERT(!XFS_IS_GQUOTA_ON(mp));
				ASSERT(mask & XFS_AT_PROJID);
				ASSERT(gdqp);
				olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
							&ip->i_gdquot, gdqp);
			}
			ip->i_d.di_projid = projid;
			/*
			 * We may have to rev the inode as well as
			 * the superblock version number since projids didn't
			 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
			 */
			if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
				xfs_bump_ino_vers2(tp, ip);
		}

		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		timeflags |= XFS_ICHGTIME_CHG;
	}


	/*
	 * Change file access or modified times.
	 */
	if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
		if (mask & XFS_AT_ATIME) {
			ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
			ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
			ip->i_update_core = 1;
			/* timeflags &= ~XFS_ICHGTIME_ACC; */
		}
		if (mask & XFS_AT_MTIME) {
			ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
			ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
			timeflags &= ~XFS_ICHGTIME_MOD;
			timeflags |= XFS_ICHGTIME_CHG;
		}
		if (tp && (flags & ATTR_UTIME))
			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	}

	/*
	 * Change XFS-added attributes.
	 */
	if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
		if (mask & XFS_AT_EXTSIZE) {
			/*
			 * Converting bytes to fs blocks.
			 */
			ip->i_d.di_extsize = vap->va_extsize >>
				mp->m_sb.sb_blocklog;
		}
		if (mask & XFS_AT_XFLAGS) {
			uint	di_flags;

			/* can't set PREALLOC this way, just preserve it */
			di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
			if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
				di_flags |= XFS_DIFLAG_IMMUTABLE;
			if (vap->va_xflags & XFS_XFLAG_APPEND)
				di_flags |= XFS_DIFLAG_APPEND;
			if (vap->va_xflags & XFS_XFLAG_SYNC)
				di_flags |= XFS_DIFLAG_SYNC;
			if (vap->va_xflags & XFS_XFLAG_NOATIME)
				di_flags |= XFS_DIFLAG_NOATIME;
			if (vap->va_xflags & XFS_XFLAG_NODUMP)
				di_flags |= XFS_DIFLAG_NODUMP;
			if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
				di_flags |= XFS_DIFLAG_PROJINHERIT;
			if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
				if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_RTINHERIT;
				if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
					di_flags |= XFS_DIFLAG_NOSYMLINKS;
				if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
			} else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
				if (vap->va_xflags & XFS_XFLAG_REALTIME) {
					di_flags |= XFS_DIFLAG_REALTIME;
					ip->i_iocore.io_flags |= XFS_IOCORE_RT;
				} else {
					ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
				}
				if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
					di_flags |= XFS_DIFLAG_EXTSIZE;
			}
			ip->i_d.di_flags = di_flags;
		}
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		timeflags |= XFS_ICHGTIME_CHG;
	}

	/*
	 * Change file inode change time only if XFS_AT_CTIME set
	 * AND we have been called by a DMI function.
	 */

	if ((flags & ATTR_DMI) && (mask & XFS_AT_CTIME)) {
		ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
		ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
		ip->i_update_core = 1;
		timeflags &= ~XFS_ICHGTIME_CHG;
	}

	/*
	 * Send out timestamp changes that need to be set to the
	 * current time.  Not done when called by a DMI function.
	 */
	if (timeflags && !(flags & ATTR_DMI))
		xfs_ichgtime(ip, timeflags);

	XFS_STATS_INC(xs_ig_attrchg);

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 * This is slightly sub-optimal in that truncates require
	 * two sync transactions instead of one for wsync filesystems.
	 * One for the truncate and one for the timestamps since we
	 * don't want to change the timestamps unless we're sure the
	 * truncate worked.  Truncates are less than 1% of the laddis
	 * mix so this probably isn't worth the trouble to optimize.
	 */
	code = 0;
	if (tp) {
		if (mp->m_flags & XFS_MOUNT_WSYNC)
			xfs_trans_set_sync(tp);

		code = xfs_trans_commit(tp, commit_flags, NULL);
	}

	/*
	 * If the (regular) file's mandatory locking mode changed, then
	 * notify the vnode.  We do this under the inode lock to prevent
	 * racing calls to vop_vnode_change.
	 */
	mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
	if (mandlock_before != mandlock_after) {
		XVOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
				 mandlock_after);
	}

	xfs_iunlock(ip, lock_flags);

	/*
	 * Release any dquot(s) the inode had kept before chown.
	 */
	XFS_QM_DQRELE(mp, olddquot1);
	XFS_QM_DQRELE(mp, olddquot2);
	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);

	if (code) {
		return code;
	}

	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
	    !(flags & ATTR_DMI)) {
		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
					NULL, DM_RIGHT_NULL, NULL, NULL,
					0, 0, AT_DELAY_FLAG(flags));
	}
	return 0;

 abort_return:
	commit_flags |= XFS_TRANS_ABORT;
	/* FALLTHROUGH */
 error_return:
	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);
	if (tp) {
		xfs_trans_cancel(tp, commit_flags);
	}
	if (lock_flags != 0) {
		xfs_iunlock(ip, lock_flags);
	}
	return code;
}


/*
 * xfs_access
 * Null conversion from vnode mode bits to inode mode bits, as in efs.
 */
STATIC int
xfs_access(
	bhv_desc_t	*bdp,
	int		mode,
	cred_t		*credp)
{
	xfs_inode_t	*ip;
	int		error;

	vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
					       (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	error = xfs_iaccess(ip, mode, credp);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}

/*
 * xfs_readlink
 */
STATIC int
xfs_readlink(
	bhv_desc_t	*bdp,
	uio_t		*uiop,
	int		ioflags,
	cred_t		*credp)
{
	xfs_inode_t	*ip;
	int		count;
	xfs_off_t	offset;
	int		pathlen;
	xfs_vnode_t	*vp;
	int		error = 0;
	xfs_mount_t	*mp;
	int		nmaps;
	xfs_bmbt_irec_t	mval[SYMLINK_MAPS];
	xfs_daddr_t	d;
	int		byte_cnt;
	int		n;
	xfs_buf_t	*bp;

	vp = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);
	mp = ip->i_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	xfs_ilock(ip, XFS_ILOCK_SHARED);

	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);

	offset = uiop->uio_offset;
	count = uiop->uio_resid;

	if (offset < 0) {
		error = XFS_ERROR(EINVAL);
		goto error_return;
	}
	if (count <= 0) {
		error = 0;
		goto error_return;
	}

	/*
	 * See if the symlink is stored inline.
	 */
	pathlen = (int)ip->i_d.di_size;

	if (ip->i_df.if_flags & XFS_IFINLINE) {
		error = uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
	} else {
		/*
		 * Symlink not inline.  Call bmap to get it in.
		 */
		nmaps = SYMLINK_MAPS;

		error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
				  0, NULL, 0, mval, &nmaps, NULL, NULL);

		if (error) {
			goto error_return;
		}

		for (n = 0; n < nmaps; n++) {
			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
			bp = xfs_buf_read(mp->m_ddev_targp, d,
				      BTOBB(byte_cnt), 0);
			error = XFS_BUF_GETERROR(bp);
			if (error) {
				xfs_ioerror_alert("xfs_readlink",
					  ip->i_mount, bp, XFS_BUF_ADDR(bp));
				xfs_buf_relse(bp);
				goto error_return;
			}
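			/*
			 * The last mapping is rounded up to whole blocks;
			 * copy out no more than the remaining path bytes.
			 */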
			if (pathlen < byte_cnt)
				byte_cnt = pathlen;
			pathlen -= byte_cnt;

			error = uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
			xfs_buf_relse(bp);
		}

	}

error_return:
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}


/*
 * xfs_fsync
 *
 * This is called to sync the inode and its data out to disk.
 * We need to hold the I/O lock while flushing the data, and
 * the inode lock while flushing the inode.  The inode lock CANNOT
 * be held while flushing the data, so acquire after we're done
 * with that.
 */
STATIC int
xfs_fsync(
	bhv_desc_t	*bdp,
	int		flag,
	cred_t		*credp,
	xfs_off_t	start,
	xfs_off_t	stop)
{
	xfs_inode_t	*ip;
	xfs_trans_t	*tp;
	int		error;
	int		log_flushed = 0, changed = 1;

	vn_trace_entry(BHV_TO_VNODE(bdp),
			__FUNCTION__, (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);

	ASSERT(start >= 0 && stop >= -1);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return XFS_ERROR(EIO);

	/*
	 * We always need to make sure that the required inode state
	 * is safe on disk.  The vnode might be clean, but we still
	 * may need to force the log because of committed transactions
	 * that haven't hit the disk yet.
	 * Likewise, there could be unflushed non-transactional
	 * changes to the inode core that have to go to disk.
	 *
	 * The following code depends on one assumption:  that
	 * any transaction that changes an inode logs the core
	 * because it has to change some field in the inode core
	 * (typically nextents or nblocks).  That assumption
	 * implies that any transactions against an inode will
	 * catch any non-transactional updates.  If inode-altering
	 * transactions exist that violate this assumption, the
	 * code breaks.  Right now, it figures that if the involved
	 * update_* field is clear and the inode is unpinned, the
	 * inode is clean.  Either it's been flushed or it's been
	 * committed and the commit has hit the disk unpinning the inode.
	 * (Note that xfs_inode_item_format() called at commit clears
	 * the update_* fields.)
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);

	/* If we are flushing data then we care about update_size
	 * being set, otherwise we care about update_core
	 */
	if ((flag & FSYNC_DATA) ?
			(ip->i_update_size == 0) :
			(ip->i_update_core == 0)) {
		/*
		 * Timestamps/size haven't changed since last inode
		 * flush or inode transaction commit.  That means
		 * either nothing got written or a transaction
		 * committed which caught the updates.  If the
		 * latter happened and the transaction hasn't
1105		 * latter happened and the transaction hasn't
1106		 * hit the disk yet, the inode will be still
1107		 * be pinned.  If it is, force the log.
1108		 */
1109
1110		xfs_iunlock(ip, XFS_ILOCK_SHARED);
1111
1112		if (xfs_ipincount(ip)) {
1113			_xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1114				      XFS_LOG_FORCE |
1115				      ((flag & FSYNC_WAIT)
1116				       ? XFS_LOG_SYNC : 0),
1117				      &log_flushed);
1118		} else {
1119			/*
1120			 * If the inode is not pinned and nothing
1121			 * has changed we don't need to flush the
1122			 * cache.
1123			 */
1124			changed = 0;
1125		}
1126		error = 0;
1127	} else	{
1128		/*
1129		 * Kick off a transaction to log the inode
1130		 * core to get the updates.  Make it
1131		 * sync if FSYNC_WAIT is passed in (which
1132		 * is done by everybody but specfs).  The
1133		 * sync transaction will also force the log.
1134		 */
1135		xfs_iunlock(ip, XFS_ILOCK_SHARED);
1136		tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1137		if ((error = xfs_trans_reserve(tp, 0,
1138				XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1139				0, 0, 0)))  {
1140			xfs_trans_cancel(tp, 0);
1141			return error;
1142		}
1143		xfs_ilock(ip, XFS_ILOCK_EXCL);
1144
1145		/*
1146		 * Note - it's possible that we might have pushed
1147		 * ourselves out of the way during trans_reserve
1148		 * which would flush the inode.	 But there's no
1149		 * guarantee that the inode buffer has actually
1150		 * gone out yet (it's delwri).	Plus the buffer
1151		 * could be pinned anyway if it's part of an
1152		 * inode in another recent transaction.	 So we
1153		 * play it safe and fire off the transaction anyway.
1154		 */
1155		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1156		xfs_trans_ihold(tp, ip);
1157		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1158		if (flag & FSYNC_WAIT)
1159			xfs_trans_set_sync(tp);
1160		error = _xfs_trans_commit(tp, 0, NULL, &log_flushed);
1161
1162		xfs_iunlock(ip, XFS_ILOCK_EXCL);
1163	}
1164
1165	if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
1166		/*
1167		 * If the log write didn't issue an ordered tag we need
1168		 * to flush the disk cache for the data device now.
1169		 */
1170		if (!log_flushed)
1171			xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
1172
1173		/*
1174		 * If this inode is on the RT dev we need to flush that
1175		 * cache as well.
1176		 */
1177		if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
1178			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
1179	}
1180
1181	return error;
1182}
1183
1184/*
1185 * This is called by xfs_inactive to free any blocks beyond eof,
1186 * when the link count isn't zero.
1187 */
1188STATIC int
1189xfs_inactive_free_eofblocks(
1190	xfs_mount_t	*mp,
1191	xfs_inode_t	*ip)
1192{
1193	xfs_trans_t	*tp;
1194	int		error;
1195	xfs_fileoff_t	end_fsb;
1196	xfs_fileoff_t	last_fsb;
1197	xfs_filblks_t	map_len;
1198	int		nimaps;
1199	xfs_bmbt_irec_t	imap;
1200
1201	/*
1202	 * Figure out if there are any blocks beyond the end
1203	 * of the file.  If not, then there is nothing to do.
1204	 */
1205	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size));
1206	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1207	map_len = last_fsb - end_fsb;
1208	if (map_len <= 0)
1209		return 0;
1210
1211	nimaps = 1;
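	/*
	 * Probe the space past EOF with a single read-only mapping;
	 * a hole with no delayed allocations means there is nothing
	 * to free.
	 */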
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
			  NULL, 0, &imap, &nimaps, NULL, NULL);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (!error && (nimaps != 0) &&
	    (imap.br_startblock != HOLESTARTBLOCK ||
	     ip->i_delayed_blks)) {
		/*
		 * Attach the dquots to the inode up front.
		 */
		if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
			return error;

		/*
		 * There are blocks after the end of file.
		 * Free them up now by truncating the file to
		 * its current size.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);

		/*
		 * Do the xfs_itruncate_start() call before
		 * reserving any log space because
		 * itruncate_start will call into the buffer
		 * cache and we can't
		 * do that within a transaction.
		 */
		xfs_ilock(ip, XFS_IOLOCK_EXCL);
		xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
				    ip->i_d.di_size);

		error = xfs_trans_reserve(tp, 0,
					  XFS_ITRUNCATE_LOG_RES(mp),
					  0, XFS_TRANS_PERM_LOG_RES,
					  XFS_ITRUNCATE_LOG_COUNT);
		if (error) {
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return error;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip,
				XFS_IOLOCK_EXCL |
				XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);

		error = xfs_itruncate_finish(&tp, ip,
					     ip->i_d.di_size,
					     XFS_DATA_FORK,
					     0);
		/*
		 * If we get an error at this point we
		 * simply don't bother truncating the file.
		 */
		if (error) {
			xfs_trans_cancel(tp,
					 (XFS_TRANS_RELEASE_LOG_RES |
					  XFS_TRANS_ABORT));
		} else {
			error = xfs_trans_commit(tp,
						XFS_TRANS_RELEASE_LOG_RES,
						NULL);
		}
		xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	}
	return error;
}

/*
 * Free a symlink that has blocks associated with it.
 */
STATIC int
xfs_inactive_symlink_rmt(
	xfs_inode_t	*ip,
	xfs_trans_t	**tpp)
{
	xfs_buf_t	*bp;
	int		committed;
	int		done;
	int		error;
	xfs_fsblock_t	first_block;
	xfs_bmap_free_t	free_list;
	int		i;
	xfs_mount_t	*mp;
	xfs_bmbt_irec_t	mval[SYMLINK_MAPS];
	int		nmaps;
	xfs_trans_t	*ntp;
	int		size;
	xfs_trans_t	*tp;

	tp = *tpp;
	mp = ip->i_mount;
	ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
	/*
	 * We're freeing a symlink that has some
	 * blocks allocated to it.  Free the
	 * blocks here.  We know that we've got
	 * either 1 or 2 extents and that we can
	 * free them all in one bunmapi call.
	 */
	ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		xfs_trans_cancel(tp, 0);
		*tpp = NULL;
		return error;
	}
	/*
	 * Lock the inode, fix the size, and join it to the transaction.
	 * Hold it so in the normal path, we still have it locked for
	 * the second transaction.  In the error paths we need it
	 * held so the cancel won't rele it, see below.
	 */
	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	size = (int)ip->i_d.di_size;
	ip->i_d.di_size = 0;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	/*
	 * Find the block(s) so we can inval and unmap them.
	 */
	done = 0;
	XFS_BMAP_INIT(&free_list, &first_block);
	nmaps = sizeof(mval) / sizeof(mval[0]);
	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
			&free_list, NULL)))
		goto error0;
	/*
	 * Invalidate the block(s).
	 */
	for (i = 0; i < nmaps; i++) {
		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
			XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
		xfs_trans_binval(tp, bp);
	}
	/*
	 * Unmap the dead block(s) to the free_list.
	 */
	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
			&first_block, &free_list, NULL, &done)))
		goto error1;
	ASSERT(done);
	/*
	 * Commit the first transaction.  This logs the EFI and the inode.
	 */
	if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed)))
		goto error1;
	/*
	 * The transaction must have been committed, since there were
	 * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
	 * The new tp has the extent freeing and EFDs.
	 */
	ASSERT(committed);
	/*
	 * The first xact was committed, so add the inode to the new one.
	 * Mark it dirty so it will be logged and moved forward in the log as
	 * part of every commit.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	/*
	 * Get a new, empty transaction to return to our caller.
	 */
	ntp = xfs_trans_dup(tp);
	/*
	 * Commit the transaction containing extent freeing and EFDs.
	 * If we get an error on the commit here or on the reserve below,
	 * we need to unlock the inode since the new transaction doesn't
	 * have the inode attached.
	 */
	error = xfs_trans_commit(tp, 0, NULL);
	tp = ntp;
	if (error) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		goto error0;
	}
	/*
	 * Remove the memory for extent descriptions (just bookkeeping).
	 */
	if (ip->i_df.if_bytes)
		xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
	ASSERT(ip->i_df.if_bytes == 0);
	/*
	 * Put an itruncate log reservation in the new transaction
	 * for our caller.
	 */
	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		goto error0;
	}
	/*
	 * Return with the inode locked but not joined to the transaction.
	 */
	*tpp = tp;
	return 0;

 error1:
	xfs_bmap_cancel(&free_list);
 error0:
	/*
	 * Have to come here with the inode locked and either
	 * (held and in the transaction) or (not in the transaction).
	 * If the inode isn't held then cancel would iput it, but
	 * that's wrong since this is inactive and the vnode ref
	 * count is 0 already.
	 * Cancel won't do anything to the inode if held, but it still
	 * needs to be locked until the cancel is done, if it was
	 * joined to the transaction.
	 */
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	*tpp = NULL;
	return error;

}

STATIC int
xfs_inactive_symlink_local(
	xfs_inode_t	*ip,
	xfs_trans_t	**tpp)
{
	int		error;

	ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
	/*
	 * We're freeing a symlink which fit into
	 * the inode.  Just free the memory used
	 * to hold the old symlink.
	 */
	error = xfs_trans_reserve(*tpp, 0,
				  XFS_ITRUNCATE_LOG_RES(ip->i_mount),
				  0, XFS_TRANS_PERM_LOG_RES,
				  XFS_ITRUNCATE_LOG_COUNT);

	if (error) {
		xfs_trans_cancel(*tpp, 0);
		*tpp = NULL;
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);

	/*
	 * Zero length symlinks _can_ exist.
	 */
	if (ip->i_df.if_bytes > 0) {
		xfs_idata_realloc(ip,
				  -(ip->i_df.if_bytes),
				  XFS_DATA_FORK);
		ASSERT(ip->i_df.if_bytes == 0);
	}
	return 0;
}

/*
 * Tear down the attribute fork during inode inactivation: commit the
 * caller's transaction, remove the extended attributes, and hand back
 * a fresh transaction reserved for freeing the inode.
 */
STATIC int
xfs_inactive_attrs(
	xfs_inode_t	*ip,
	xfs_trans_t	**tpp)
{
	xfs_trans_t	*tp;
	int		error;
	xfs_mount_t	*mp;

	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
	tp = *tpp;
	mp = ip->i_mount;
	ASSERT(ip->i_d.di_forkoff != 0);
	xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	error = xfs_attr_inactive(ip);
	if (error) {
		*tpp = NULL;
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
		return error; /* goto out */
	}

	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
	error = xfs_trans_reserve(tp, 0,
				  XFS_IFREE_LOG_RES(mp),
				  0, XFS_TRANS_PERM_LOG_RES,
				  XFS_INACTIVE_LOG_COUNT);
	if (error) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		xfs_trans_cancel(tp, 0);
		*tpp = NULL;
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_idestroy_fork(ip, XFS_ATTR_FORK);

	ASSERT(ip->i_d.di_anextents == 0);

	*tpp = tp;
	return 0;
}

STATIC int
xfs_release(
	bhv_desc_t	*bdp)
{
	xfs_inode_t	*ip;
	xfs_vnode_t	*vp;
	xfs_mount_t	*mp;
	int		error;

	vp = BHV_TO_VNODE(bdp);
	ip = XFS_BHVTOI(bdp);

	if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) {
		return 0;
	}

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
		return 0;

#ifdef HAVE_REFCACHE
	/* If we are in the NFS reference cache then don't do this now */
	if (ip->i_refcache)
		return 0;
#endif

	mp = ip->i_mount;

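	/*
	 * On last close of a regular file that still has data blocks or
	 * dirty pages, trim any speculative allocation beyond EOF, unless
	 * the space was explicitly preallocated or the file is append-only.
	 */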
	if (ip->i_d.di_nlink != 0) {
		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
		       ip->i_delayed_blks > 0)) &&
		     (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
		    (!(ip->i_d.di_flags &
				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
			if ((error = xfs_inactive_free_eofblocks(mp, ip)))
				return error;

#ifdef RMC			/* Update linux inode block count after free above */
			vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
				ip->i_d.di_nblocks + ip->i_delayed_blks);
#endif
		}
	}

	return 0;
}

/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero.  If the file has been unlinked, then it must
 * now be truncated.  Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 */
STATIC int
xfs_inactive(
	bhv_desc_t	*bdp,
	cred_t		*credp)
{
	xfs_inode_t	*ip;
	xfs_vnode_t	*vp;

	xfs_bmap_free_t	free_list;
	xfs_fsblock_t	first_block;
	int		committed;
	xfs_trans_t	*tp;
	xfs_mount_t	*mp;
	int		error;
	int		truncate;

	vp = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
		ASSERT(ip->i_df.if_real_bytes == 0);
		ASSERT(ip->i_df.if_broot_bytes == 0);
		return VN_INACTIVE_CACHE;
	}

	/*
	 * Only do a truncate if it's a regular file with
	 * some actual space in it.  It's OK to look at the
	 * inode's fields without the lock because we're the
	 * only one with a reference to the inode.
	 */
	truncate = ((ip->i_d.di_nlink == 0) &&
	    ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0) ||
	     (ip->i_delayed_blks > 0)) &&
	    ((ip->i_d.di_mode & S_IFMT) == S_IFREG));

	mp = ip->i_mount;

	if (ip->i_d.di_nlink == 0 &&
	    DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
		(void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
	}

	error = 0;

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
		goto out;

	if (ip->i_d.di_nlink != 0) {
		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
		       ip->i_delayed_blks > 0)) &&
		     (ip->i_df.if_flags & XFS_IFEXTENTS) &&
		     (!(ip->i_d.di_flags &
				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
		      (ip->i_delayed_blks != 0)))) {
			if ((error = xfs_inactive_free_eofblocks(mp, ip)))
				return VN_INACTIVE_CACHE;
#ifdef RMC
			/* Update linux inode block count after free above */
			vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
				ip->i_d.di_nblocks + ip->i_delayed_blks);
#endif
		}
		goto out;
	}

	ASSERT(ip->i_d.di_nlink == 0);

	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
		return VN_INACTIVE_CACHE;

	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
	if (truncate) {
		/*
		 * Do the xfs_itruncate_start() call before
		 * reserving any log space because itruncate_start
		 * will call into the buffer cache and we can't
		 * do that within a transaction.
		 */
		xfs_ilock(ip, XFS_IOLOCK_EXCL);

		xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);

		error = xfs_trans_reserve(tp, 0,
					  XFS_ITRUNCATE_LOG_RES(mp),
					  0, XFS_TRANS_PERM_LOG_RES,
					  XFS_ITRUNCATE_LOG_COUNT);
		if (error) {
			/* Don't call itruncate_cleanup */
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return VN_INACTIVE_CACHE;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);

		/*
		 * normally, we have to run xfs_itruncate_finish sync.
		 * But if filesystem is wsync and we're in the inactive
		 * path, then we know that nlink == 0, and that the
		 * xaction that made nlink == 0 is permanently committed
		 * since xfs_remove runs as a synchronous transaction.
		 */
		error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
				(!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));

		if (error) {
			xfs_trans_cancel(tp,
				XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
			xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
			return VN_INACTIVE_CACHE;
		}
	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {

		/*
		 * If we get an error while cleaning up a
		 * symlink we bail out.
		 */
		error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
			xfs_inactive_symlink_rmt(ip, &tp) :
			xfs_inactive_symlink_local(ip, &tp);

		if (error) {
			ASSERT(tp == NULL);
			return VN_INACTIVE_CACHE;
		}

		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);
	} else {
		error = xfs_trans_reserve(tp, 0,
					  XFS_IFREE_LOG_RES(mp),
					  0, XFS_TRANS_PERM_LOG_RES,
					  XFS_INACTIVE_LOG_COUNT);
		if (error) {
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			return VN_INACTIVE_CACHE;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);
	}

	/*
	 * If there are attributes associated with the file
	 * then blow them away now.  The code calls a routine
	 * that recursively deconstructs the attribute fork.
	 * We need to just commit the current transaction
	 * because we can't use it for xfs_attr_inactive().
	 */
	if (ip->i_d.di_anextents > 0) {
		error = xfs_inactive_attrs(ip, &tp);
		/*
		 * If we got an error, the transaction is already
		 * cancelled, and the inode is unlocked. Just get out.
		 */
		if (error)
			return VN_INACTIVE_CACHE;
	} else if (ip->i_afp) {
		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
	}

	/*
	 * Free the inode.
	 */
	XFS_BMAP_INIT(&free_list, &first_block);
	error = xfs_ifree(tp, ip, &free_list);
	if (error) {
		/*
		 * If we fail to free the inode, shut down.  The cancel
		 * might do that, we need to make sure.  Otherwise the
		 * inode might be lost for a long time or forever.
		 */
		if (!XFS_FORCED_SHUTDOWN(mp)) {
			cmn_err(CE_NOTE,
		"xfs_inactive:	xfs_ifree() returned an error = %d on %s",
				error, mp->m_fsname);
			xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
		}
		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
	} else {
		/*
		 * Credit the quota account(s). The inode is gone.
		 */
		XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

		/*
		 * Just ignore errors at this point.  There is
		 * nothing we can do except to try to keep going.
		 */
		(void) xfs_bmap_finish(&tp, &free_list, first_block,
				       &committed);
		(void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
	}
	/*
	 * Release the dquots held by inode, if any.
	 */
	XFS_QM_DQDETACH(mp, ip);

	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);

 out:
	return VN_INACTIVE_CACHE;
}


/*
 * xfs_lookup
 */
STATIC int
xfs_lookup(
	bhv_desc_t		*dir_bdp,
	vname_t			*dentry,
	xfs_vnode_t		**vpp,
	int			flags,
	xfs_vnode_t		*rdir,
	cred_t			*credp)
{
	xfs_inode_t		*dp, *ip;
	xfs_ino_t		e_inum;
	int			error;
	uint			lock_mode;
	xfs_vnode_t		*dir_vp;

	dir_vp = BHV_TO_VNODE(dir_bdp);
	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);

	dp = XFS_BHVTOI(dir_bdp);

	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
		return XFS_ERROR(EIO);

	lock_mode = xfs_ilock_map_shared(dp);
	error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
	if (!error) {
		*vpp = XFS_ITOV(ip);
		ITRACE(ip);
	}
	xfs_iunlock_map_shared(dp, lock_mode);
	return error;
}


/*
 * xfs_create (create a new file).
 */
STATIC int
xfs_create(
	bhv_desc_t		*dir_bdp,
	vname_t			*dentry,
	xfs_vattr_t		*vap,
	xfs_vnode_t		**vpp,
	cred_t			*credp)
{
	char			*name = VNAME(dentry);
	xfs_vnode_t		*dir_vp;
	xfs_inode_t		*dp, *ip;
	xfs_vnode_t		*vp=NULL;
	xfs_trans_t		*tp;
	xfs_mount_t		*mp;
	xfs_dev_t		rdev;
	int			error;
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	boolean_t		dp_joined_to_trans;
	int			dm_event_sent = 0;
	uint			cancel_flags;
	int			committed;
	xfs_prid_t		prid;
	struct xfs_dquot	*udqp, *gdqp;
	uint			resblks;
	int			dm_di_mode;
	int			namelen;

	ASSERT(!*vpp);
	dir_vp = BHV_TO_VNODE(dir_bdp);
	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);

	dp = XFS_BHVTOI(dir_bdp);
	mp = dp->i_mount;

	dm_di_mode = vap->va_mode;
	namelen = VNAMELEN(dentry);

	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
				dir_vp, DM_RIGHT_NULL, NULL,
				DM_RIGHT_NULL, name, NULL,
				dm_di_mode, 0, 0);

		if (error)
			return error;
		dm_event_sent = 1;
	}

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/* Return through std_return after this point. */

	udqp = gdqp = NULL;

	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
		prid = dp->i_d.di_projid;
	else if (vap->va_mask & XFS_AT_PROJID)
		prid = (xfs_prid_t)vap->va_projid;
	else
		prid = (xfs_prid_t)dfltprid;

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = XFS_QM_DQVOPALLOC(mp, dp,
			current_fsuid(credp), current_fsgid(credp), prid,
			XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
	if (error)
		goto std_return;

	ip = NULL;
	dp_joined_to_trans = B_FALSE;

	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
	resblks = XFS_CREATE_SPACE_RES(mp, namelen);
	/*
	 * Initially assume that the file does not exist and
	 * reserve the resources for that case.  If that is not
	 * the case we'll drop the one we have and get a more
	 * appropriate transaction later.
	 */
	error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
	if (error == ENOSPC) {
		resblks = 0;
		error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
				XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
	}
	if (error) {
		cancel_flags = 0;
		dp = NULL;
		goto error_return;
	}

	xfs_ilock(dp, XFS_ILOCK_EXCL);

	XFS_BMAP_INIT(&free_list, &first_block);

	ASSERT(ip == NULL);

	/*
	 * Reserve disk quota and the inode.
	 */
	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
	if (error)
		goto error_return;

	if (resblks == 0 &&
	    (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
		goto error_return;
	rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
	error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
			rdev, credp, prid, resblks > 0,
			&ip, &committed);
	if (error) {
		if (error == ENOSPC)
			goto error_return;
		goto abort_return;
	}
	ITRACE(ip);

	/*
	 * At this point, we've gotten a newly allocated inode.
	 * It is locked (and joined to the transaction).
	 */

	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));

	/*
	 * Now we join the directory inode to the transaction.
	 * We do not do it earlier because xfs_dir_ialloc
	 * might commit the previous transaction (and release
	 * all the locks).
	 */

	VN_HOLD(dir_vp);
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	dp_joined_to_trans = B_TRUE;

1981	error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
1982		&first_block, &free_list,
1983		resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1984	if (error) {
1985		ASSERT(error != ENOSPC);
1986		goto abort_return;
1987	}
1988	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1989	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1990
1991	/*
1992	 * If this is a synchronous mount, make sure that the
1993	 * create transaction goes to disk before returning to
1994	 * the user.
1995	 */
1996	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1997		xfs_trans_set_sync(tp);
1998	}
1999
2000	dp->i_gen++;
2001
2002	/*
2003	 * Attach the dquot(s) to the inodes and modify them incore.
2004	 * The ids of the inode couldn't have changed since the new
2005	 * inode has been locked ever since it was created.
2006	 */
2007	XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2008
2009	/*
2010	 * xfs_trans_commit normally decrements the vnode ref count
2011	 * when it unlocks the inode. Since we want to return the
2012	 * vnode to the caller, we bump the vnode ref count now.
2013	 */
2014	IHOLD(ip);
2015	vp = XFS_ITOV(ip);
2016
2017	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2018	if (error) {
2019		xfs_bmap_cancel(&free_list);
2020		goto abort_rele;
2021	}
2022
2023	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2024	if (error) {
2025		IRELE(ip);
2026		tp = NULL;
2027		goto error_return;
2028	}
2029
2030	XFS_QM_DQRELE(mp, udqp);
2031	XFS_QM_DQRELE(mp, gdqp);
2032
2033	/*
2034	 * Propagate the fact that the vnode changed after the
2035	 * xfs_inode locks have been released.
2036	 */
2037	XVOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);
2038
2039	*vpp = vp;
2040
2041	/* Fallthrough to std_return with error = 0  */
2042
2043std_return:
2044	if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
2045			DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2046							DM_EVENT_POSTCREATE)) {
2047		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2048			dir_vp, DM_RIGHT_NULL,
2049			*vpp ? vp:NULL,
2050			DM_RIGHT_NULL, name, NULL,
2051			dm_di_mode, error, 0);
2052	}
2053	return error;
2054
2055 abort_return:
2056	cancel_flags |= XFS_TRANS_ABORT;
2057	/* FALLTHROUGH */
2058
2059 error_return:
2060	if (tp != NULL)
2061		xfs_trans_cancel(tp, cancel_flags);
2062
2063	if (!dp_joined_to_trans && (dp != NULL))
2064		xfs_iunlock(dp, XFS_ILOCK_EXCL);
2065	XFS_QM_DQRELE(mp, udqp);
2066	XFS_QM_DQRELE(mp, gdqp);
2067
2068	goto std_return;
2069
2070 abort_rele:
2071	/*
2072	 * Wait until after the current transaction is aborted to
2073	 * release the inode.  This prevents recursive transactions
2074	 * and deadlocks from xfs_inactive.
2075	 */
2076	cancel_flags |= XFS_TRANS_ABORT;
2077	xfs_trans_cancel(tp, cancel_flags);
2078	IRELE(ip);
2079
2080	XFS_QM_DQRELE(mp, udqp);
2081	XFS_QM_DQRELE(mp, gdqp);
2082
2083	goto std_return;
2084}
2085
2086#ifdef DEBUG
2087/*
2088 * Some counters to see if (and how often) we are hitting some deadlock
2089 * prevention code paths.
2090 */
2091
2092int xfs_rm_locks;
2093int xfs_rm_lock_delays;
2094int xfs_rm_attempts;
2095#endif
2096
2097/*
2098 * The following routine will lock the inodes associated with the
2099 * directory and the named entry in the directory. The locks are
2100 * acquired in increasing inode number.
2101 *
2102 * If the entry is "..", then only the directory is locked. The
2103 * vnode ref count will still include that from the .. entry in
2104 * this case.
2105 *
2106 * There is a deadlock we need to worry about. If the locked directory is
2107 * in the AIL, it might be blocking up the log. The next inode we lock
2108 * could already be locked by another thread waiting for log space (e.g.
2109 * a permanent log reservation with a long running transaction (see
2110 * xfs_itruncate_finish)). To solve this, we must check if the directory
2111 * is in the AIL and use lock_nowait. If we can't lock, we need to
2112 * drop the inode lock on the directory and try again. xfs_iunlock will
2113 * potentially push the tail if we were holding up the log.
2114 */
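/*
 * Illustrative scenario: thread A holds the directory locked while the
 * directory's log item sits at the tail of the AIL; thread B holds the
 * entry's inode locked inside a permanent-reservation transaction and
 * is sleeping for log space, which cannot be freed until the directory
 * is flushed.  If A blocked in xfs_ilock() on the entry's inode, the
 * two threads would deadlock; xfs_ilock_nowait() plus the
 * unlock-and-retry below breaks the cycle.
 */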
2115STATIC int
2116xfs_lock_dir_and_entry(
2117	xfs_inode_t	*dp,
2118	vname_t		*dentry,
2119	xfs_inode_t	*ip)	/* inode of entry 'name' */
2120{
2121	int		attempts;
2122	xfs_ino_t	e_inum;
2123	xfs_inode_t	*ips[2];
2124	xfs_log_item_t	*lp;
2125
2126#ifdef DEBUG
2127	xfs_rm_locks++;
2128#endif
2129	attempts = 0;
2130
2131again:
2132	xfs_ilock(dp, XFS_ILOCK_EXCL);
2133
2134	e_inum = ip->i_ino;
2135
2136	ITRACE(ip);
2137
2138	/*
2139	 * We want to lock in increasing inum. Since we've already
2140	 * acquired the lock on the directory, we may need to release
2141	 * it if the inum of the entry turns out to be less.
2142	 */
2143	if (e_inum > dp->i_ino) {
2144		/*
2145		 * We are already in the right order, so just
2146		 * lock on the inode of the entry.
2147		 * We need to use nowait if dp is in the AIL.
2148		 */
2149
2150		lp = (xfs_log_item_t *)dp->i_itemp;
2151		if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2152			if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2153				attempts++;
2154#ifdef DEBUG
2155				xfs_rm_attempts++;
2156#endif
2157
2158				/*
2159				 * Unlock dp and try again.
2160				 * xfs_iunlock will try to push the tail
2161				 * if the inode is in the AIL.
2162				 */
2163
2164				xfs_iunlock(dp, XFS_ILOCK_EXCL);
2165
2166				if ((attempts % 5) == 0) {
2167					delay(1); /* Don't just spin the CPU */
2168#ifdef DEBUG
2169					xfs_rm_lock_delays++;
2170#endif
2171				}
2172				goto again;
2173			}
2174		} else {
2175			xfs_ilock(ip, XFS_ILOCK_EXCL);
2176		}
2177	} else if (e_inum < dp->i_ino) {
2178		xfs_iunlock(dp, XFS_ILOCK_EXCL);
2179
2180		ips[0] = ip;
2181		ips[1] = dp;
2182		xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2183	}
2184	/*
2185	 * else e_inum == dp->i_ino: this can happen if we're asked to lock
2186	 * /x/.. -- the entry "..", being the parent itself, is already locked.
2187	 */
2188
2189	return 0;
2190}
2191
2192#ifdef DEBUG
2193int xfs_locked_n;
2194int xfs_small_retries;
2195int xfs_middle_retries;
2196int xfs_lots_retries;
2197int xfs_lock_delays;
2198#endif
2199
2200/*
2201 * The following routine will lock n inodes in exclusive mode.
2202 * We assume the caller calls us with the inodes in i_ino order.
2203 *
2204 * We need to detect deadlock where an inode that we lock
2205 * is in the AIL and we start waiting for another inode that is locked
2206 * by a thread in a long running transaction (such as truncate). This can
2207 * result in deadlock since the long running trans might need to wait
2208 * for the inode we just locked in order to push the tail and free space
2209 * in the log.
2210 */
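/*
 * Illustrative example: with ips[] = { A, B, C } in i_ino order, A is
 * locked blocking.  If A's log item is then found in the AIL, B and C
 * are only trylocked; should the trylock of C fail, B and A are
 * unlocked and the scan restarts from ips[0], giving xfs_iunlock() a
 * chance to push the AIL tail in the meantime.
 */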
2211void
2212xfs_lock_inodes(
2213	xfs_inode_t	**ips,
2214	int		inodes,
2215	int		first_locked,
2216	uint		lock_mode)
2217{
2218	int		attempts = 0, i, j, try_lock;
2219	xfs_log_item_t	*lp;
2220
2221	ASSERT(ips && (inodes >= 2)); /* we need at least two */
2222
2223	if (first_locked) {
2224		try_lock = 1;
2225		i = 1;
2226	} else {
2227		try_lock = 0;
2228		i = 0;
2229	}
2230
2231again:
2232	for (; i < inodes; i++) {
2233		ASSERT(ips[i]);
2234
2235		if (i && (ips[i] == ips[i-1]))	/* Already locked */
2236			continue;
2237
2238		/*
2239		 * If try_lock is not set yet, make sure all locked inodes
2240		 * are not in the AIL.
2241		 * If any are, set try_lock to be used later.
2242		 */
2243
2244		if (!try_lock) {
2245			for (j = (i - 1); j >= 0 && !try_lock; j--) {
2246				lp = (xfs_log_item_t *)ips[j]->i_itemp;
2247				if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2248					try_lock++;
2249				}
2250			}
2251		}
2252
2253		/*
2254		 * If any of the inodes we have already locked is in the AIL,
2255		 * we must TRY to get the second and subsequent locks. If
2256		 * we can't get one, we must release all we have
2257		 * and try again.
2258		 */
2259
2260		if (try_lock) {
2261			/*
2262			 * try_lock means we have an inode locked that
2263			 * is in the AIL; it can never be set when i is 0.
2264			 */
2266			ASSERT(i != 0);
2267			if (!xfs_ilock_nowait(ips[i], lock_mode)) {
2268				attempts++;
2269
2270				/*
2271				 * Unlock all previous guys and try again.
2272				 * xfs_iunlock will try to push the tail
2273				 * if the inode is in the AIL.
2274				 */
2275
2276				for (j = i - 1; j >= 0; j--) {
2277
2278					/*
2279					 * Check to see if we've already
2280					 * unlocked this one.
2281					 * Not the first one going back,
2282					 * and the inode ptr is the same.
2283					 */
2284					if ((j != (i - 1)) && ips[j] == ips[j+1])
2286						continue;
2287
2288					xfs_iunlock(ips[j], lock_mode);
2289				}
2290
2291				if ((attempts % 5) == 0) {
2292					delay(1); /* Don't just spin the CPU */
2293#ifdef DEBUG
2294					xfs_lock_delays++;
2295#endif
2296				}
2297				i = 0;
2298				try_lock = 0;
2299				goto again;
2300			}
2301		} else {
2302			xfs_ilock(ips[i], lock_mode);
2303		}
2304	}
2305
2306#ifdef DEBUG
2307	if (attempts) {
2308		if (attempts < 5) xfs_small_retries++;
2309		else if (attempts < 100) xfs_middle_retries++;
2310		else xfs_lots_retries++;
2311	} else {
2312		xfs_locked_n++;
2313	}
2314#endif
2315}
2316
2317#ifdef	DEBUG
2318#define	REMOVE_DEBUG_TRACE(x)	do { remove_which_error_return = (x); } while (0)
2319int remove_which_error_return = 0;
2320#else /* ! DEBUG */
2321#define	REMOVE_DEBUG_TRACE(x)
2322#endif	/* ! DEBUG */
2323
2324extern int xfs_remove(bhv_desc_t *, bhv_desc_t *, vname_t *, cred_t *);
2325/*
2326 * xfs_remove
2327 *
2328 */
2329int
2330xfs_remove(
2331	bhv_desc_t		*dir_bdp,
2332	bhv_desc_t		*vp_bdp,
2333	vname_t			*dentry,
2334	cred_t			*credp)
2335{
2336	xfs_vnode_t		*dir_vp;
2337	xfs_vnode_t		*xvp;
2338	char			*name = VNAME(dentry);
2339	xfs_inode_t             *dp, *ip;
2340	xfs_trans_t             *tp = NULL;
2341	xfs_mount_t		*mp;
2342	int                     error = 0;
2343	xfs_bmap_free_t         free_list;
2344	xfs_fsblock_t           first_block;
2345	int			cancel_flags;
2346	int			committed;
2347	int			dm_di_mode = 0;
2348	int			link_zero;
2349	uint			resblks;
2350	int			namelen;
2351
2352	dir_vp = BHV_TO_VNODE(dir_bdp);
2353	xvp = BHV_TO_VNODE(vp_bdp);
2354
2355#ifdef DEBUG
	printf("xfs_remove: dvp %p vp %p\n", dir_vp, xvp);
#endif
2356	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2357
2358	dp = XFS_BHVTOI(dir_bdp);
2359	mp = dp->i_mount;
2360
2361	if (XFS_FORCED_SHUTDOWN(mp))
2362		return XFS_ERROR(EIO);
2363
2364	namelen = VNAMELEN(dentry);
2365
2366	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2367		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2368					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2369					name, NULL, 0, 0, 0);
2370		if (error)
2371			return error;
2372	}
2373
2374	/* From this point on, return through std_return */
2375	ip = NULL;
2376
2377	/*
2378	 * We need to get a reference to ip before we get our log
2379	 * reservation. The reason for this is that we cannot call
2380	 * xfs_iget for an inode for which we do not have a reference
2381	 * once we've acquired a log reservation. This is because the
2382	 * inode we are trying to get might be in xfs_inactive going
2383	 * for a log reservation. Since we'll have to wait for the
2384	 * inactive code to complete before returning from xfs_iget,
2385	 * we need to make sure that we don't have log space reserved
2386	 * when we call xfs_iget.  Instead we get an unlocked reference
2387	 * to the inode before getting our log reservation.
2388	 */
2389#ifdef RMC
2390	error = xfs_get_dir_entry(dentry, &ip);
2391	if (error) {
2392		REMOVE_DEBUG_TRACE(__LINE__);
2393		goto std_return;
2394	}
2395#else
2396	/* FreeBSD has already done the lookup; take our own hold. */
2397	ip = xvp->v_inode;
2398	VN_HOLD(xvp);
2399#endif
2400
2401	dm_di_mode = ip->i_d.di_mode;
2402
2403	vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2404
2405	ITRACE(ip);
2406
2407	error = XFS_QM_DQATTACH(mp, dp, 0);
2408	if (!error && dp != ip)
2409		error = XFS_QM_DQATTACH(mp, ip, 0);
2410	if (error) {
2411		REMOVE_DEBUG_TRACE(__LINE__);
2412		IRELE(ip);
2413		goto std_return;
2414	}
2415
2416	tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2417	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2418	/*
2419	 * We try to get the real space reservation first, allowing
2420	 * for directory btree deletion(s) implying possible bmap
2421	 * insert(s).  If we can't get the space reservation we use
2422	 * 0 instead; the directory code then avoids any bmap btree
2423	 * insert by trimming the LAST block from the directory
2424	 * whenever such an insert would otherwise be needed.
2426	 */
2427	resblks = XFS_REMOVE_SPACE_RES(mp);
2428	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2429			XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2430	if (error == ENOSPC) {
2431		resblks = 0;
2432		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2433				XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2434	}
2435	if (error) {
2436		ASSERT(error != ENOSPC);
2437		REMOVE_DEBUG_TRACE(__LINE__);
2438		xfs_trans_cancel(tp, 0);
2439		IRELE(ip);
2440		return error;
2441	}
2442
2443	error = xfs_lock_dir_and_entry(dp, dentry, ip);
2444	if (error) {
2445		REMOVE_DEBUG_TRACE(__LINE__);
2446		xfs_trans_cancel(tp, cancel_flags);
2447		IRELE(ip);
2448		goto std_return;
2449	}
2450
2451	/*
2452	 * At this point, we've gotten both the directory and the entry
2453	 * inodes locked.
2454	 */
2455	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2456	if (dp != ip) {
2457		/*
2458		 * Increment vnode ref count only in this case since
2459		 * there's an extra vnode reference in the case where
2460		 * dp == ip.
2461		 */
2462		IHOLD(dp);
2463		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2464	}
2465
2466	/*
2467	 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2468	 */
2469	XFS_BMAP_INIT(&free_list, &first_block);
2470	error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
2471		&first_block, &free_list, 0);
2472	if (error) {
2473		ASSERT(error != ENOENT);
2474		REMOVE_DEBUG_TRACE(__LINE__);
2475		goto error1;
2476	}
2477	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2478
2479	dp->i_gen++;
2480	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2481
2482	error = xfs_droplink(tp, ip);
2483	if (error) {
2484		REMOVE_DEBUG_TRACE(__LINE__);
2485		goto error1;
2486	}
2487
2488	/* Determine if this is the last link while
2489	 * we are in the transaction.
2490	 */
2491	link_zero = (ip->i_d.di_nlink == 0);
2492
2493	/*
2494	 * Take an extra ref on the inode so that it doesn't
2495	 * go to xfs_inactive() from within the commit.
2496	 */
2497	IHOLD(ip);
2498
2499	/*
2500	 * If this is a synchronous mount, make sure that the
2501	 * remove transaction goes to disk before returning to
2502	 * the user.
2503	 */
2504	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2505		xfs_trans_set_sync(tp);
2506	}
2507
2508	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2509	if (error) {
2510		REMOVE_DEBUG_TRACE(__LINE__);
2511		goto error_rele;
2512	}
2513
2514	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2515	if (error) {
2516		IRELE(ip);
2517		goto std_return;
2518	}
2519
2520	/*
2521	 * Before we drop our extra reference to the inode, purge it
2522	 * from the refcache if it is there.  By waiting until afterwards
2523	 * to do the IRELE, we ensure that we won't go inactive in the
2524	 * xfs_refcache_purge_ip routine (although that would be OK).
2525	 */
2526	xfs_refcache_purge_ip(ip);
2527
2528	vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2529
2530	/*
2531	 * Let interposed file systems know about removed links.
2532	 */
2533	XVOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
2534
2535	IRELE(ip);
2536
2537/*	Fall through to std_return with error = 0 */
2538 std_return:
2539	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2540						DM_EVENT_POSTREMOVE)) {
2541		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2542				dir_vp, DM_RIGHT_NULL,
2543				NULL, DM_RIGHT_NULL,
2544				name, NULL, dm_di_mode, error, 0);
2545	}
2546	return error;
2547
2548 error1:
2549	xfs_bmap_cancel(&free_list);
2550	cancel_flags |= XFS_TRANS_ABORT;
2551	xfs_trans_cancel(tp, cancel_flags);
2552	goto std_return;
2553
2554 error_rele:
2555	/*
2556	 * In this case make sure to not release the inode until after
2557	 * the current transaction is aborted.  Releasing it beforehand
2558	 * can cause us to go to xfs_inactive and start a recursive
2559	 * transaction which can easily deadlock with the current one.
2560	 */
2561	xfs_bmap_cancel(&free_list);
2562	cancel_flags |= XFS_TRANS_ABORT;
2563	xfs_trans_cancel(tp, cancel_flags);
2564
2565	/*
2566	 * Before we drop our extra reference to the inode, purge it
2567	 * from the refcache if it is there.  By waiting until afterwards
2568	 * to do the IRELE, we ensure that we won't go inactive in the
2569	 * xfs_refcache_purge_ip routine (although that would be OK).
2570	 */
2571	xfs_refcache_purge_ip(ip);
2572
2573	IRELE(ip);
2574
2575	goto std_return;
2576}
2577
2578
2579/*
2580 * xfs_link
2581 *
2582 */
2583STATIC int
2584xfs_link(
2585	bhv_desc_t		*target_dir_bdp,
2586	xfs_vnode_t		*src_vp,
2587	vname_t			*dentry,
2588	cred_t			*credp)
2589{
2590	xfs_inode_t		*tdp, *sip;
2591	xfs_trans_t		*tp;
2592	xfs_mount_t		*mp;
2593	xfs_inode_t		*ips[2];
2594	int			error;
2595	xfs_bmap_free_t         free_list;
2596	xfs_fsblock_t           first_block;
2597	int			cancel_flags;
2598	int			committed;
2599	xfs_vnode_t		*target_dir_vp;
2600	int			resblks;
2601	char			*target_name = VNAME(dentry);
2602	int			target_namelen;
2603
2604	target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2605	vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2606	vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2607
2608	target_namelen = VNAMELEN(dentry);
2609	if (VN_ISDIR(src_vp))
2610		return XFS_ERROR(EPERM);
2611
2612	sip = xfs_vtoi(src_vp);
2613	tdp = XFS_BHVTOI(target_dir_bdp);
2614	mp = tdp->i_mount;
2615	if (XFS_FORCED_SHUTDOWN(mp))
2616		return XFS_ERROR(EIO);
2617
2618	if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
2619		error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2620					target_dir_vp, DM_RIGHT_NULL,
2621					src_vp, DM_RIGHT_NULL,
2622					target_name, NULL, 0, 0, 0);
2623		if (error)
2624			return error;
2625	}
2626
2627	/* Return through std_return after this point. */
2628
2629	error = XFS_QM_DQATTACH(mp, sip, 0);
2630	if (!error && sip != tdp)
2631		error = XFS_QM_DQATTACH(mp, tdp, 0);
2632	if (error)
2633		goto std_return;
2634
2635	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2636	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2637	resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2638	error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2639			XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2640	if (error == ENOSPC) {
2641		resblks = 0;
2642		error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2643				XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2644	}
2645	if (error) {
2646		cancel_flags = 0;
2647		goto error_return;
2648	}
2649
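	/*
	 * Order the two inodes by ascending inode number before
	 * locking both, matching the ordering used by
	 * xfs_lock_inodes() and so avoiding ABBA deadlocks.
	 */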
2650	if (sip->i_ino < tdp->i_ino) {
2651		ips[0] = sip;
2652		ips[1] = tdp;
2653	} else {
2654		ips[0] = tdp;
2655		ips[1] = sip;
2656	}
2657
2658	xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2659
2660	/*
2661	 * Increment vnode ref counts since xfs_trans_commit &
2662	 * xfs_trans_cancel will both unlock the inodes and
2663	 * decrement the associated ref counts.
2664	 */
2665	VN_HOLD(src_vp);
2666	VN_HOLD(target_dir_vp);
2667	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2668	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2669
2670	/*
2671	 * If the source has too many links, we can't make any more to it.
2672	 */
2673	if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2674		error = XFS_ERROR(EMLINK);
2675		goto error_return;
2676	}
2677
2678	/*
2679	 * If we are using project inheritance, we only allow hard link
2680	 * creation in our tree when the project IDs are the same; else
2681	 * the tree quota mechanism could be circumvented.
2682	 */
2683	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2684		     (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2685		error = XFS_ERROR(EPERM);
2686		goto error_return;
2687	}
2688
2689	if (resblks == 0 &&
2690	    (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
2691			target_namelen)))
2692		goto error_return;
2693
2694	XFS_BMAP_INIT(&free_list, &first_block);
2695
2696	error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
2697				   sip->i_ino, &first_block, &free_list,
2698				   resblks);
2699	if (error)
2700		goto abort_return;
2701	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2702	tdp->i_gen++;
2703	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2704
2705	error = xfs_bumplink(tp, sip);
2706	if (error) {
2707		goto abort_return;
2708	}
2709
2710	/*
2711	 * If this is a synchronous mount, make sure that the
2712	 * link transaction goes to disk before returning to
2713	 * the user.
2714	 */
2715	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2716		xfs_trans_set_sync(tp);
2717	}
2718
2719	error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
2720	if (error) {
2721		xfs_bmap_cancel(&free_list);
2722		goto abort_return;
2723	}
2724
2725	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2726	if (error) {
2727		goto std_return;
2728	}
2729
2730	/* Fall through to std_return with error = 0. */
2731std_return:
2732	if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
2733						DM_EVENT_POSTLINK)) {
2734		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2735				target_dir_vp, DM_RIGHT_NULL,
2736				src_vp, DM_RIGHT_NULL,
2737				target_name, NULL, 0, error, 0);
2738	}
2739	return error;
2740
2741 abort_return:
2742	cancel_flags |= XFS_TRANS_ABORT;
2743	/* FALLTHROUGH */
2744
2745 error_return:
2746	xfs_trans_cancel(tp, cancel_flags);
2747	goto std_return;
2748}
2749/*
2750 * xfs_mkdir
2751 *
2752 */
2753STATIC int
2754xfs_mkdir(
2755	bhv_desc_t		*dir_bdp,
2756	vname_t			*dentry,
2757	xfs_vattr_t		*vap,
2758	xfs_vnode_t		**vpp,
2759	cred_t			*credp)
2760{
2761	char			*dir_name = VNAME(dentry);
2762	xfs_inode_t             *dp;
2763	xfs_inode_t		*cdp;	/* inode of created dir */
2764	xfs_vnode_t		*cvp;	/* vnode of created dir */
2765	xfs_trans_t		*tp;
2766	xfs_mount_t		*mp;
2767	int			cancel_flags;
2768	int			error;
2769	int			committed;
2770	xfs_bmap_free_t         free_list;
2771	xfs_fsblock_t           first_block;
2772	xfs_vnode_t		*dir_vp;
2773	boolean_t		dp_joined_to_trans;
2774	boolean_t		created = B_FALSE;
2775	int			dm_event_sent = 0;
2776	xfs_prid_t		prid;
2777	struct xfs_dquot	*udqp, *gdqp;
2778	uint			resblks;
2779	int			dm_di_mode;
2780	int			dir_namelen;
2781
2782	dir_vp = BHV_TO_VNODE(dir_bdp);
2783	dp = XFS_BHVTOI(dir_bdp);
2784	mp = dp->i_mount;
2785
2786	if (XFS_FORCED_SHUTDOWN(mp))
2787		return XFS_ERROR(EIO);
2788
2789	dir_namelen = VNAMELEN(dentry);
2790
2791	tp = NULL;
2792	dp_joined_to_trans = B_FALSE;
2793	dm_di_mode = vap->va_mode;
2794
2795	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
2796		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2797					dir_vp, DM_RIGHT_NULL, NULL,
2798					DM_RIGHT_NULL, dir_name, NULL,
2799					dm_di_mode, 0, 0);
2800		if (error)
2801			return error;
2802		dm_event_sent = 1;
2803	}
2804
2805	/* Return through std_return after this point. */
2806
2807	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2808
2810	udqp = gdqp = NULL;
2811
2812	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2813		prid = dp->i_d.di_projid;
2814	else if (vap->va_mask & XFS_AT_PROJID)
2815		prid = (xfs_prid_t)vap->va_projid;
2816	else
2817		prid = (xfs_prid_t)dfltprid;
2818
2819	/*
2820	 * Make sure that we have allocated dquot(s) on disk.
2821	 */
2822	error = XFS_QM_DQVOPALLOC(mp, dp,
2823			current_fsuid(credp), current_fsgid(credp), prid,
2824			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2825	if (error)
2826		goto std_return;
2827
2828	tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2829	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2830	resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2831	error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2832				  XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2833	if (error == ENOSPC) {
2834		resblks = 0;
2835		error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2836					  XFS_TRANS_PERM_LOG_RES,
2837					  XFS_MKDIR_LOG_COUNT);
2838	}
2839	if (error) {
2840		cancel_flags = 0;
2841		dp = NULL;
2842		goto error_return;
2843	}
2844
2845	xfs_ilock(dp, XFS_ILOCK_EXCL);
2846
2847	/*
2848	 * Check for directory link count overflow.
2849	 */
2850	if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2851		error = XFS_ERROR(EMLINK);
2852		goto error_return;
2853	}
2854
2855	/*
2856	 * Reserve disk quota and the inode.
2857	 */
2858	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2859	if (error)
2860		goto error_return;
2861
2862	if (resblks == 0 &&
2863	    (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
2864		goto error_return;
2865	/*
2866	 * create the directory inode.
2867	 */
2868	error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
2869			0, credp, prid, resblks > 0,
2870		&cdp, NULL);
2871	if (error) {
2872		if (error == ENOSPC)
2873			goto error_return;
2874		goto abort_return;
2875	}
2876	ITRACE(cdp);
2877
2878	/*
2879	 * Now we add the directory inode to the transaction.
2880	 * We waited until now since xfs_dir_ialloc might start
2881	 * a new transaction.  Had we joined the transaction
2882	 * earlier, the locks might have gotten released.
2883	 */
2884	VN_HOLD(dir_vp);
2885	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2886	dp_joined_to_trans = B_TRUE;
2887
2888	XFS_BMAP_INIT(&free_list, &first_block);
2889
2890	error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
2891			cdp->i_ino, &first_block, &free_list,
2892			resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2893	if (error) {
2894		ASSERT(error != ENOSPC);
2895		goto error1;
2896	}
2897	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2898
2899	/*
2900	 * Bump the in memory version number of the parent directory
2901	 * so that other processes accessing it will recognize that
2902	 * the directory has changed.
2903	 */
2904	dp->i_gen++;
2905
2906	error = XFS_DIR_INIT(mp, tp, cdp, dp);
2907	if (error) {
2908		goto error2;
2909	}
2910
2911	cdp->i_gen = 1;
2912	error = xfs_bumplink(tp, dp);
2913	if (error) {
2914		goto error2;
2915	}
2916
2917	cvp = XFS_ITOV(cdp);
2918
2919	created = B_TRUE;
2920
2921	*vpp = cvp;
2922	IHOLD(cdp);
2923
2924	/*
2925	 * Attach the dquots to the new inode and modify the icount incore.
2926	 */
2927	XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2928
2929	/*
2930	 * If this is a synchronous mount, make sure that the
2931	 * mkdir transaction goes to disk before returning to
2932	 * the user.
2933	 */
2934	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2935		xfs_trans_set_sync(tp);
2936	}
2937
2938	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2939	if (error) {
2940		IRELE(cdp);
2941		goto error2;
2942	}
2943
2944	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2945	XFS_QM_DQRELE(mp, udqp);
2946	XFS_QM_DQRELE(mp, gdqp);
2947	if (error) {
2948		IRELE(cdp);
2949	}
2950
2951	/* Fall through to std_return with error = 0 or errno from
2952	 * xfs_trans_commit. */
2953
2954std_return:
2955	if ( (created || (error != 0 && dm_event_sent != 0)) &&
2956			DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2957						DM_EVENT_POSTCREATE)) {
2958		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2959					dir_vp, DM_RIGHT_NULL,
2960					created ? XFS_ITOV(cdp):NULL,
2961					DM_RIGHT_NULL,
2962					dir_name, NULL,
2963					dm_di_mode, error, 0);
2964	}
2965	return error;
2966
2967 error2:
2968 error1:
2969	xfs_bmap_cancel(&free_list);
2970 abort_return:
2971	cancel_flags |= XFS_TRANS_ABORT;
2972 error_return:
2973	xfs_trans_cancel(tp, cancel_flags);
2974	XFS_QM_DQRELE(mp, udqp);
2975	XFS_QM_DQRELE(mp, gdqp);
2976
2977	if (!dp_joined_to_trans && (dp != NULL)) {
2978		xfs_iunlock(dp, XFS_ILOCK_EXCL);
2979	}
2980
2981	goto std_return;
2982}
2983
2984
2985/*
2986 * xfs_rmdir
2987 *
2988 */
2989STATIC int
2990xfs_rmdir(
2991	bhv_desc_t		*dir_bdp,
2992	vname_t			*dentry,
2993	cred_t			*credp)
2994{
2995	char			*name = VNAME(dentry);
2996	xfs_inode_t             *dp;
2997	xfs_inode_t             *cdp;   /* child directory */
2998	xfs_trans_t             *tp;
2999	xfs_mount_t		*mp;
3000	int                     error;
3001	xfs_bmap_free_t         free_list;
3002	xfs_fsblock_t           first_block;
3003	int			cancel_flags;
3004	int			committed;
3005	xfs_vnode_t		*dir_vp;
3006	int			dm_di_mode = 0;
3007	int			last_cdp_link;
3008	int			namelen;
3009	uint			resblks;
3010
3011	dir_vp = BHV_TO_VNODE(dir_bdp);
3012	dp = XFS_BHVTOI(dir_bdp);
3013	mp = dp->i_mount;
3014
3015	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3016
3017	if (XFS_FORCED_SHUTDOWN(mp))
3018		return XFS_ERROR(EIO);
3019	namelen = VNAMELEN(dentry);
3020
3021	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3022		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3023					dir_vp, DM_RIGHT_NULL,
3024					NULL, DM_RIGHT_NULL,
3025					name, NULL, 0, 0, 0);
3026		if (error)
3027			return XFS_ERROR(error);
3028	}
3029
3030	/* Return through std_return after this point. */
3031
3032	cdp = NULL;
3033
3034	/*
3035	 * We need to get a reference to cdp before we get our log
3036	 * reservation.  The reason for this is that we cannot call
3037	 * xfs_iget for an inode for which we do not have a reference
3038	 * once we've acquired a log reservation.  This is because the
3039	 * inode we are trying to get might be in xfs_inactive going
3040	 * for a log reservation.  Since we'll have to wait for the
3041	 * inactive code to complete before returning from xfs_iget,
3042	 * we need to make sure that we don't have log space reserved
3043	 * when we call xfs_iget.  Instead we get an unlocked reference
3044	 * to the inode before getting our log reservation.
3045	 */
3046	error = xfs_get_dir_entry(dentry, &cdp);
3047	if (error) {
3048		REMOVE_DEBUG_TRACE(__LINE__);
3049		goto std_return;
3050	}
3052	dm_di_mode = cdp->i_d.di_mode;
3053
3054	/*
3055	 * Get the dquots for the inodes.
3056	 */
3057	error = XFS_QM_DQATTACH(mp, dp, 0);
3058	if (!error && dp != cdp)
3059		error = XFS_QM_DQATTACH(mp, cdp, 0);
3060	if (error) {
3061		IRELE(cdp);
3062		REMOVE_DEBUG_TRACE(__LINE__);
3063		goto std_return;
3064	}
3065
3066	tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3067	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3068	/*
3069	 * We try to get the real space reservation first, allowing
3070	 * for directory btree deletion(s) implying possible bmap
3071	 * insert(s).  If we can't get the space reservation we use
3072	 * 0 instead; the directory code then avoids any bmap btree
3073	 * insert by trimming the LAST block from the directory
3074	 * whenever such an insert would otherwise be needed.
3076	 */
3077	resblks = XFS_REMOVE_SPACE_RES(mp);
3078	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3079			XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3080	if (error == ENOSPC) {
3081		resblks = 0;
3082		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3083				XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3084	}
3085	if (error) {
3086		ASSERT(error != ENOSPC);
3087		cancel_flags = 0;
3088		IRELE(cdp);
3089		goto error_return;
3090	}
3091	XFS_BMAP_INIT(&free_list, &first_block);
3092
3093	/*
3094	 * Now lock the child directory inode and the parent directory
3095	 * inode in the proper order.  This will take care of validating
3096	 * that the directory entry for the child directory inode has
3097	 * not changed while we were obtaining a log reservation.
3098	 */
3099	error = xfs_lock_dir_and_entry(dp, dentry, cdp);
3100	if (error) {
3101		xfs_trans_cancel(tp, cancel_flags);
3102		IRELE(cdp);
3103		goto std_return;
3104	}
3105
3106	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3107	if (dp != cdp) {
3108		/*
3109		 * Only increment the parent directory vnode count if
3110		 * we didn't bump it in looking up cdp.  The only time
3111		 * we don't bump it is when we're looking up ".".
3112		 */
3113		VN_HOLD(dir_vp);
3114	}
3115
3116	ITRACE(cdp);
3117	xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3118
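	/*
	 * An empty directory carries exactly two links: one from its
	 * parent's entry for it and one from its own "." entry; more
	 * than two means it still contains subdirectories.
	 */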
3119	ASSERT(cdp->i_d.di_nlink >= 2);
3120	if (cdp->i_d.di_nlink != 2) {
3121		error = XFS_ERROR(ENOTEMPTY);
3122		goto error_return;
3123	}
3124	if (!XFS_DIR_ISEMPTY(mp, cdp)) {
3125		error = XFS_ERROR(ENOTEMPTY);
3126		goto error_return;
3127	}
3128
3129	error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
3130		&first_block, &free_list, resblks);
3131	if (error) {
3132		goto error1;
3133	}
3134
3135	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3136
3137	/*
3138	 * Bump the in-memory generation count on the parent
3139	 * directory so that others can tell that it has changed.
3140	 */
3141	dp->i_gen++;
3142
3143	/*
3144	 * Drop the link from cdp's "..".
3145	 */
3146	error = xfs_droplink(tp, dp);
3147	if (error) {
3148		goto error1;
3149	}
3150
3151	/*
3152	 * Drop the link from dp to cdp.
3153	 */
3154	error = xfs_droplink(tp, cdp);
3155	if (error) {
3156		goto error1;
3157	}
3158
3159	/*
3160	 * Drop the "." link from cdp to self.
3161	 */
3162	error = xfs_droplink(tp, cdp);
3163	if (error) {
3164		goto error1;
3165	}
3166
3167	/* Determine these before committing transaction */
3168	last_cdp_link = (cdp->i_d.di_nlink == 0);
3169
3170	/*
3171	 * Take an extra ref on the child vnode so that it
3172	 * does not go to xfs_inactive() from within the commit.
3173	 */
3174	IHOLD(cdp);
3175
3176	/*
3177	 * If this is a synchronous mount, make sure that the
3178	 * rmdir transaction goes to disk before returning to
3179	 * the user.
3180	 */
3181	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3182		xfs_trans_set_sync(tp);
3183	}
3184
3185	error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
3186	if (error) {
3187		xfs_bmap_cancel(&free_list);
3188		xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3189				 XFS_TRANS_ABORT));
3190		IRELE(cdp);
3191		goto std_return;
3192	}
3193
3194	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3195	if (error) {
3196		IRELE(cdp);
3197		goto std_return;
3198	}
3199
3200
3201	/*
3202	 * Let interposed file systems know about removed links.
3203	 */
3204	XVOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3205
3206	IRELE(cdp);
3207
3208	/* Fall through to std_return with error = 0 or the errno
3209	 * from xfs_trans_commit. */
3210 std_return:
3211	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
3212		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3213					dir_vp, DM_RIGHT_NULL,
3214					NULL, DM_RIGHT_NULL,
3215					name, NULL, dm_di_mode,
3216					error, 0);
3217	}
3218	return error;
3219
3220 error1:
3221	xfs_bmap_cancel(&free_list);
3222	cancel_flags |= XFS_TRANS_ABORT;
3223	/* FALLTHROUGH */
3224
3225 error_return:
3226	xfs_trans_cancel(tp, cancel_flags);
3227	goto std_return;
3228}
3229
3230
3231/*
3232 * xfs_readdir
3233 *
3234 * Read dp's entries starting at uiop->uio_offset and translate them
3235 * into struct dirents copied out through uiop.
3236 */
3237STATIC int
3238xfs_readdir(
3239	bhv_desc_t	*dir_bdp,
3240	uio_t		*uiop,
3241	cred_t		*credp,
3242	int		*eofp)
3243{
3244	xfs_inode_t	*dp;
3245	xfs_trans_t	*tp = NULL;
3246	int		error = 0;
3247	uint		lock_mode;
3248
3249	vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3250					       (inst_t *)__return_address);
3251	dp = XFS_BHVTOI(dir_bdp);
3252
3253	if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
3254		return XFS_ERROR(EIO);
3255	}
3256
3257	lock_mode = xfs_ilock_map_shared(dp);
3258	error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
3259	xfs_iunlock_map_shared(dp, lock_mode);
3260	return error;
3261}
3262
3263
3264/*
3265 * xfs_symlink
3266 *
3267 */
3268STATIC int
3269xfs_symlink(
3270	bhv_desc_t		*dir_bdp,
3271	vname_t			*dentry,
3272	xfs_vattr_t		*vap,
3273	char			*target_path,
3274	xfs_vnode_t		**vpp,
3275	cred_t			*credp)
3276{
3277	xfs_trans_t		*tp;
3278	xfs_mount_t		*mp;
3279	xfs_inode_t		*dp;
3280	xfs_inode_t		*ip;
3281	int			error;
3282	int			pathlen;
3283	xfs_bmap_free_t		free_list;
3284	xfs_fsblock_t		first_block;
3285	boolean_t		dp_joined_to_trans;
3286	xfs_vnode_t		*dir_vp;
3287	uint			cancel_flags;
3288	int			committed;
3289	xfs_fileoff_t		first_fsb;
3290	xfs_filblks_t		fs_blocks;
3291	int			nmaps;
3292	xfs_bmbt_irec_t		mval[SYMLINK_MAPS];
3293	xfs_daddr_t		d;
3294	char			*cur_chunk;
3295	int			byte_cnt;
3296	int			n;
3297	xfs_buf_t		*bp;
3298	xfs_prid_t		prid;
3299	struct xfs_dquot	*udqp, *gdqp;
3300	uint			resblks;
3301	char			*link_name = VNAME(dentry);
3302	int			link_namelen;
3303	struct	thread 		*current = curthread;
3304
3305	*vpp = NULL;
3306	dir_vp = BHV_TO_VNODE(dir_bdp);
3307	dp = XFS_BHVTOI(dir_bdp);
3308	dp_joined_to_trans = B_FALSE;
3309	error = 0;
3310	ip = NULL;
3311	tp = NULL;
3312
3313	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3314
3315	mp = dp->i_mount;
3316
3317	if (XFS_FORCED_SHUTDOWN(mp))
3318		return XFS_ERROR(EIO);
3319
3320	link_namelen = VNAMELEN(dentry);
3321
3322	/*
3323	 * Check component lengths of the target path name.
3324	 */
3325	pathlen = strlen(target_path);
3326	if (pathlen >= MAXPATHLEN)      /* total string too long */
3327		return XFS_ERROR(ENAMETOOLONG);
3328	if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3329		int len, total;
3330		char *path;
3331
3332		for (total = 0, path = target_path; total < pathlen;) {
3333			/*
3334			 * Skip any slashes.
3335			 */
3336			while (*path == '/') {
3337				total++;
3338				path++;
3339			}
3340
3341			/*
3342			 * Count up to the next slash or end of path.
3343			 * Error out if the component is bigger than MAXNAMELEN.
3344			 */
3345			for (len = 0; *path != '/' && total < pathlen; total++, path++) {
3346				if (++len >= MAXNAMELEN) {
3347					error = ENAMETOOLONG;
3348					return error;
3349				}
3350			}
3351		}
3352	}
3353
3354	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
3355		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3356					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3357					link_name, target_path, 0, 0, 0);
3358		if (error)
3359			return error;
3360	}
3361
3362	/* Return through std_return after this point. */
3363
3364	udqp = gdqp = NULL;
3365
3366#ifdef XXXKAN
3367	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3368		prid = dp->i_d.di_projid;
3369	else if (vap->va_mask & XFS_AT_PROJID)
3370		prid = (xfs_prid_t)vap->va_projid;
3371	else
3372#endif
3373		prid = (xfs_prid_t)dfltprid;
3374
3375	/*
3376	 * Make sure that we have allocated dquot(s) on disk.
3377	 */
3378	error = XFS_QM_DQVOPALLOC(mp, dp,
3379				  current->td_ucred->cr_uid,
3380				  current->td_ucred->cr_groups[0],
3381				  prid,
3382				  XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3383	if (error)
3384		goto std_return;
3385
3386	tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3387	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3388	/*
3389	 * Will the symlink fit into the inode data fork?
3390	 * There can't be any attributes yet, so we get the whole variable part.
3391	 */
3392	if (pathlen <= XFS_LITINO(mp))
3393		fs_blocks = 0;
3394	else
3395		fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3396	resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3397	error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3398			XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3399	if (error == ENOSPC && fs_blocks == 0) {
3400		resblks = 0;
3401		error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3402				XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3403	}
3404	if (error) {
3405		cancel_flags = 0;
3406		dp = NULL;
3407		goto error_return;
3408	}
3409
3410	xfs_ilock(dp, XFS_ILOCK_EXCL);
3411
3412	/*
3413	 * Check whether the directory allows new symlinks or not.
3414	 */
3415	if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3416		error = XFS_ERROR(EPERM);
3417		goto error_return;
3418	}
3419
3420	/*
3421	 * Reserve disk quota : blocks and inode.
3422	 */
3423	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3424	if (error)
3425		goto error_return;
3426
3427	/*
3428	 * Check for ability to enter directory entry, if no space reserved.
3429	 */
3430	if (resblks == 0 &&
3431	    (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
3432		goto error_return;
3433	/*
3434	 * Initialize the bmap freelist prior to calling either
3435	 * bmapi or the directory create code.
3436	 */
3437	XFS_BMAP_INIT(&free_list, &first_block);
3438
3439	/*
3440	 * Allocate an inode for the symlink.
3441	 */
3442	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3443			       1, 0, credp, prid, resblks > 0, &ip, NULL);
3444	if (error) {
3445		if (error == ENOSPC)
3446			goto error_return;
3447		goto error1;
3448	}
3449	ITRACE(ip);
3450
3451	VN_HOLD(dir_vp);
3452	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3453	dp_joined_to_trans = B_TRUE;
3454
3455	/*
3456	 * Also attach the dquot(s) to it, if applicable.
3457	 */
3458	XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3459
3460	if (resblks)
3461		resblks -= XFS_IALLOC_SPACE_RES(mp);
3462	/*
3463	 * If the symlink will fit into the inode, write it inline.
3464	 */
3465	if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3466		xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3467		memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3468		ip->i_d.di_size = pathlen;
3469
3470		/*
3471		 * The inode was initially created in extent format.
3472		 */
3473		ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3474		ip->i_df.if_flags |= XFS_IFINLINE;
3475
3476		ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3477		xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3478
3479	} else {
3480		first_fsb = 0;
3481		nmaps = SYMLINK_MAPS;
3482
3483		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3484				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3485				  &first_block, resblks, mval, &nmaps,
3486				  &free_list, NULL);
3487		if (error) {
3488			goto error1;
3489		}
3490
3491		if (resblks)
3492			resblks -= fs_blocks;
3493		ip->i_d.di_size = pathlen;
3494		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3495
3496		cur_chunk = target_path;
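		/*
		 * Copy the target path into the newly mapped blocks,
		 * one bmapi mapping at a time; the final chunk may be
		 * shorter than a full mapping.
		 */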
3497		for (n = 0; n < nmaps; n++) {
3498			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3499			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3500			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3501					       BTOBB(byte_cnt), 0);
3502			ASSERT(bp && !XFS_BUF_GETERROR(bp));
3503			if (pathlen < byte_cnt) {
3504				byte_cnt = pathlen;
3505			}
3506			pathlen -= byte_cnt;
3507
3508			memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3509			cur_chunk += byte_cnt;
3510
3511			xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3512		}
3513	}
3514
3515	/*
3516	 * Create the directory entry for the symlink.
3517	 */
3518	error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
3519			ip->i_ino, &first_block, &free_list, resblks);
3520	if (error) {
3521		goto error1;
3522	}
3523	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3524	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3525
3526	/*
3527	 * Bump the in memory version number of the parent directory
3528	 * so that other processes accessing it will recognize that
3529	 * the directory has changed.
3530	 */
3531	dp->i_gen++;
3532
3533	/*
3534	 * If this is a synchronous mount, make sure that the
3535	 * symlink transaction goes to disk before returning to
3536	 * the user.
3537	 */
3538	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3539		xfs_trans_set_sync(tp);
3540	}
3541
3542	/*
3543	 * xfs_trans_commit normally decrements the vnode ref count
3544	 * when it unlocks the inode. Since we want to return the
3545	 * vnode to the caller, we bump the vnode ref count now.
3546	 */
3547	IHOLD(ip);
3548
3549	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
3550	if (error) {
3551		goto error2;
3552	}
3553	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3554	XFS_QM_DQRELE(mp, udqp);
3555	XFS_QM_DQRELE(mp, gdqp);
3556
3557	/* Fall through to std_return with error = 0 or errno from
3558	 * xfs_trans_commit	*/
3559std_return:
3560	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3561			     DM_EVENT_POSTSYMLINK)) {
3562		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3563					dir_vp, DM_RIGHT_NULL,
3564					error ? NULL : XFS_ITOV(ip),
3565					DM_RIGHT_NULL, link_name, target_path,
3566					0, error, 0);
3567	}
3568
3569	if (!error) {
3570		xfs_vnode_t *vp;
3571
3572		ASSERT(ip);
3573		vp = XFS_ITOV(ip);
3574		*vpp = vp;
3575	}
3576	return error;
3577
3578 error2:
3579	IRELE(ip);
3580 error1:
3581	xfs_bmap_cancel(&free_list);
3582	cancel_flags |= XFS_TRANS_ABORT;
3583 error_return:
3584	xfs_trans_cancel(tp, cancel_flags);
3585	XFS_QM_DQRELE(mp, udqp);
3586	XFS_QM_DQRELE(mp, gdqp);
3587
3588	if (!dp_joined_to_trans && (dp != NULL)) {
3589		xfs_iunlock(dp, XFS_ILOCK_EXCL);
3590	}
3591
3592	goto std_return;
3593}
3594
3595
3596/*
3597 * xfs_fid2
3598 *
3599 * A fid routine that takes a pointer to a previously allocated
3600 * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3601 */
3602STATIC int
3603xfs_fid2(
3604	bhv_desc_t	*bdp,
3605	fid_t		*fidp)
3606{
3607	xfs_inode_t	*ip;
3608	xfs_fid2_t	*xfid;
3609
3610	vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3611				       (inst_t *)__return_address);
3612	ASSERT(sizeof(xfs_fid_t) >= sizeof(xfs_fid2_t));
3613
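	/*
	 * Build a fixed-size handle: a length (which excludes the
	 * length field itself), zero padding, the 64-bit inode number
	 * and the inode generation.  The generation lets a stale
	 * handle be detected once the inode number has been reused.
	 */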
3614	xfid = (xfs_fid2_t *)fidp;
3615	ip = XFS_BHVTOI(bdp);
3616	xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3617	xfid->fid_pad = 0;
3618	/*
3619	 * use memcpy because the inode number is a long long and there's no
3620	 * assurance that xfid->fid_ino is properly aligned.
3621	 */
3622	memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3623	xfid->fid_gen = ip->i_d.di_gen;
3624
3625	return 0;
3626}
3627
3628
3629/*
3630 * xfs_rwlock
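 *
 * Map a vnode rwlock request onto the XFS iolock: write requests take
 * the iolock exclusive, read requests take it shared, and the TRY
 * variants use the non-blocking lock and return 0 on failure.
 * Directories are not locked here at all.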
3631 */
3632int
3633xfs_rwlock(
3634	bhv_desc_t	*bdp,
3635	vrwlock_t	locktype)
3636{
3637	xfs_inode_t	*ip;
3638	xfs_vnode_t	*vp;
3639
3640	vp = BHV_TO_VNODE(bdp);
3641	if (VN_ISDIR(vp))
3642		return 1;
3643	ip = XFS_BHVTOI(bdp);
3644	if (locktype == VRWLOCK_WRITE) {
3645		xfs_ilock(ip, XFS_IOLOCK_EXCL);
3646	} else if (locktype == VRWLOCK_TRY_READ) {
3647		return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3648	} else if (locktype == VRWLOCK_TRY_WRITE) {
3649		return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3650	} else {
3651		ASSERT((locktype == VRWLOCK_READ) ||
3652		       (locktype == VRWLOCK_WRITE_DIRECT));
3653		xfs_ilock(ip, XFS_IOLOCK_SHARED);
3654	}
3655
3656	return 1;
3657}
3658
3659
3660/*
3661 * xfs_rwunlock
3662 */
3663void
3664xfs_rwunlock(
3665	bhv_desc_t	*bdp,
3666	vrwlock_t	locktype)
3667{
3668	xfs_inode_t     *ip;
3669	xfs_vnode_t	*vp;
3670
3671	vp = BHV_TO_VNODE(bdp);
3672	if (VN_ISDIR(vp))
3673		return;
3674	ip = XFS_BHVTOI(bdp);
3675	if (locktype == VRWLOCK_WRITE) {
3676		/*
3677		 * In the write case, we may have added a new entry to
3678		 * the reference cache, which can leave a pointer to
3679		 * another inode to be released stored in this inode.
3680		 * If it is there, clear the pointer and release that
3681		 * inode after unlocking this one.
3682		 */
3683		xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3684	} else {
3685		ASSERT((locktype == VRWLOCK_READ) ||
3686		       (locktype == VRWLOCK_WRITE_DIRECT));
3687		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3688	}
3689	return;
3690}
3691
3692STATIC int
3693xfs_inode_flush(
3694	bhv_desc_t	*bdp,
3695	int		flags)
3696{
3697	xfs_inode_t	*ip;
3698	xfs_mount_t	*mp;
3699	xfs_inode_log_item_t *iip;
3700	int		error = 0;
3701
3702	ip = XFS_BHVTOI(bdp);
3703	mp = ip->i_mount;
3704	iip = ip->i_itemp;
3705
3706	if (XFS_FORCED_SHUTDOWN(mp))
3707		return XFS_ERROR(EIO);
3708
3709	/*
3710	 * Bypass inodes which have already been cleaned by
3711	 * the inode flush clustering code inside xfs_iflush
3712	 */
3713	if ((ip->i_update_core == 0) &&
3714	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3715		return 0;
3716
3717	if (flags & FLUSH_LOG) {
3718		if (iip && iip->ili_last_lsn) {
3719			xlog_t		*log = mp->m_log;
3720			xfs_lsn_t	sync_lsn;
3721			int		s, log_flags = XFS_LOG_FORCE;
3722
3723			s = GRANT_LOCK(log);
3724			sync_lsn = log->l_last_sync_lsn;
3725			GRANT_UNLOCK(log, s);
3726
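			/*
			 * Nothing to force if everything up to this
			 * inode's last-logged LSN has already been
			 * written to disk.
			 */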
3727			if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
3728				return 0;
3729
3730			if (flags & FLUSH_SYNC)
3731				log_flags |= XFS_LOG_SYNC;
3732			return xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3733		}
3734	}
3735
3736	/*
3737	 * We make this non-blocking if the inode is contended and
3738	 * return EAGAIN to indicate to the caller that the flush
3739	 * did not succeed.  This prevents the flush path from
3740	 * blocking on inodes that are inside another operation
3741	 * right now; they get caught later by xfs_sync.
3742	 */
3743	if (flags & FLUSH_INODE) {
3744		int	flush_flags;
3745
3746		if (xfs_ipincount(ip))
3747			return EAGAIN;
3748
3749		if (flags & FLUSH_SYNC) {
3750			xfs_ilock(ip, XFS_ILOCK_SHARED);
3751			xfs_iflock(ip);
3752		} else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3753			if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3754				xfs_iunlock(ip, XFS_ILOCK_SHARED);
3755				return EAGAIN;
3756			}
3757		} else {
3758			return EAGAIN;
3759		}
3760
3761		if (flags & FLUSH_SYNC)
3762			flush_flags = XFS_IFLUSH_SYNC;
3763		else
3764			flush_flags = XFS_IFLUSH_ASYNC;
3765
3766		error = xfs_iflush(ip, flush_flags);
3767		xfs_iunlock(ip, XFS_ILOCK_SHARED);
3768	}
3769
3770	return error;
3771}
3772
3773
3774int
3775xfs_set_dmattrs (
3776	bhv_desc_t	*bdp,
3777	u_int		evmask,
3778	u_int16_t	state,
3779	cred_t		*credp)
3780{
3781	xfs_inode_t     *ip;
3782	xfs_trans_t	*tp;
3783	xfs_mount_t	*mp;
3784	int		error;
3785
3786	if (!capable(CAP_SYS_ADMIN))
3787		return XFS_ERROR(EPERM);
3788
3789	ip = XFS_BHVTOI(bdp);
3790	mp = ip->i_mount;
3791
3792	if (XFS_FORCED_SHUTDOWN(mp))
3793		return XFS_ERROR(EIO);
3794
3795	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3796	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3797	if (error) {
3798		xfs_trans_cancel(tp, 0);
3799		return error;
3800	}
3801	xfs_ilock(ip, XFS_ILOCK_EXCL);
3802	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3803
3804	ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3805	ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3806
3807	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3808	IHOLD(ip);
3809	error = xfs_trans_commit(tp, 0, NULL);
3810
3811	return error;
3812}
3813
3814
3815/*
3816 * xfs_reclaim
3817 */
3818STATIC int
3819xfs_reclaim(
3820	bhv_desc_t	*bdp)
3821{
3822	xfs_inode_t	*ip;
3823	xfs_vnode_t	*vp;
3824
3825	vp = BHV_TO_VNODE(bdp);
3826	ip = XFS_BHVTOI(bdp);
3827
3828	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3829
3830	ASSERT(!VN_MAPPED(vp));
3831
3832	/* bad inode, get out here ASAP */
3833	if (VN_BAD(vp)) {
3834		xfs_ireclaim(ip);
3835		return 0;
3836	}
3837
3838	vn_iowait(vp);
3839
3840	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3841
3842	/*
3843	 * Make sure the atime in the XFS inode is correct before tearing
3844	 * down the vnode.
3845	 */
3846	xfs_synchronize_atime(ip);
3847
3848	vnode_destroy_vobject(vp->v_vnode);
3849
3850	/* If we have nothing to flush with this inode then complete the
3851	 * teardown now, otherwise break the link between the xfs inode
3852	 * and the vnode and clean up the xfs inode later. This
3853	 * avoids flushing the inode to disk during the delete operation
3854	 * itself.
3855	 */
3856	if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3857		xfs_ilock(ip, XFS_ILOCK_EXCL);
3858		xfs_iflock(ip);
3859		return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3860	} else {
3861		xfs_mount_t	*mp = ip->i_mount;
3862
3863		/* Protect sync from us */
3864		XFS_MOUNT_ILOCK(mp);
3865		vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3866		TAILQ_INSERT_TAIL(&mp->m_del_inodes, ip, i_reclaim);
3867		ip->i_flags |= XFS_IRECLAIMABLE;
3868		XFS_MOUNT_IUNLOCK(mp);
3869	}
3870	return 0;
3871}
3872
3873int
3874xfs_finish_reclaim(
3875	xfs_inode_t	*ip,
3876	int		locked,
3877	int		sync_mode)
3878{
3879	xfs_ihash_t	*ih = ip->i_hash;
3880	xfs_vnode_t	*vp = XFS_ITOV_NULL(ip);
3881	int		error;
3882
3883	if (vp && VN_BAD(vp))
3884		goto reclaim;
3885
3886	/* The hash lock here protects a thread in xfs_iget_core from
3887	 * racing with us on linking the inode back with a vnode.
3888	 * Once we have the XFS_IRECLAIM flag set it will not touch
3889	 * us.
3890	 */
3891	write_lock(&ih->ih_lock);
3892	if ((ip->i_flags & XFS_IRECLAIM) ||
3893	    (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) {
3894		write_unlock(&ih->ih_lock);
3895		if (locked) {
3896			xfs_ifunlock(ip);
3897			xfs_iunlock(ip, XFS_ILOCK_EXCL);
3898		}
3899		return 1;
3900	}
3901	ip->i_flags |= XFS_IRECLAIM;
3902	write_unlock(&ih->ih_lock);
3903
3904	/*
3905	 * If the inode is still dirty, then flush it out.  If the inode
3906	 * is not in the AIL, then it will be OK to flush it delwri as
3907	 * long as xfs_iflush() does not keep any references to the inode.
3908	 * We leave that decision up to xfs_iflush() since it has the
3909	 * knowledge of whether it's OK to simply do a delwri flush of
3910	 * the inode or whether we need to wait until the inode is
3911	 * pulled from the AIL.
3912	 * We get the flush lock regardless, though, just to make sure
3913	 * we don't free it while it is being flushed.
3914	 */
3915	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3916		if (!locked) {
3917			xfs_ilock(ip, XFS_ILOCK_EXCL);
3918			xfs_iflock(ip);
3919		}
3920
3921		if (ip->i_update_core ||
3922		    ((ip->i_itemp != NULL) &&
3923		     (ip->i_itemp->ili_format.ilf_fields != 0))) {
3924			error = xfs_iflush(ip, sync_mode);
3925			/*
3926			 * If we hit an error, typically because of filesystem
3927			 * shutdown, we don't need to let vn_reclaim know,
3928			 * because we're going to reclaim the inode anyway.
3929			 */
3930			if (error) {
3931				xfs_iunlock(ip, XFS_ILOCK_EXCL);
3932				goto reclaim;
3933			}
3934			xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3935		}
3936
3937		ASSERT(ip->i_update_core == 0);
3938		ASSERT(ip->i_itemp == NULL ||
3939		       ip->i_itemp->ili_format.ilf_fields == 0);
3940		xfs_iunlock(ip, XFS_ILOCK_EXCL);
3941	} else if (locked) {
3942		/*
3943		 * We are not interested in doing an iflush if we're
3944		 * in the process of shutting down the filesystem forcibly.
3945		 * So, just reclaim the inode.
3946		 */
3947		xfs_ifunlock(ip);
3948		xfs_iunlock(ip, XFS_ILOCK_EXCL);
3949	}
3950
3951 reclaim:
3952	xfs_ireclaim(ip);
3953	return 0;
3954}
3955
3956int
3957xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3958{
3959#ifdef RMC
3960	int		purged;
3961	xfs_inode_t	*ip, *n;
3962	int		done = 0;
3963
3964	while (!done) {
3965		purged = 0;
3966		XFS_MOUNT_ILOCK(mp);
3967		TAILQ_FOREACH_SAFE(ip, &mp->m_del_inodes, i_reclaim, n) {
3969			if (noblock) {
3970				if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3971					continue;
3972				if (xfs_ipincount(ip) ||
3973				    !xfs_iflock_nowait(ip)) {
3974					xfs_iunlock(ip, XFS_ILOCK_EXCL);
3975					continue;
3976				}
3977			}
3978			XFS_MOUNT_IUNLOCK(mp);
3979			if (xfs_finish_reclaim(ip, noblock,
3980					XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3981				delay(1);
3982			purged = 1;
3983			break;
3984		}
3985
3986		done = !purged;
3987	}
3988
3989	XFS_MOUNT_IUNLOCK(mp);
3990#endif
3991	return 0;
3992}
3993
3994/*
3995 * xfs_alloc_file_space()
3996 *      This routine allocates disk space for the given file.
3997 *
3998 *	If alloc_type == 0, this is an ALLOCSP-type request, which
3999 *	will change the file size.  In this case, no DMAPI event is
4000 *	generated by the call; a TRUNCATE event will be generated
4001 *	later by xfs_setattr.
4002 *
4003 *	If alloc_type != 0, this is a RESVSP-type request, and a
4004 *	DMAPI DM_EVENT_WRITE event will be generated if the lower
4005 *	block boundary byte address is less than the file's
4006 *	length.
4007 *
4008 * RETURNS:
4009 *       0 on success
4010 *      errno on error
4011 *
4012 */
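/*
 * Illustrative userspace entry point (a sketch, not part of this
 * file): an XFS_IOC_RESVSP64 ioctl arrives here with alloc_type != 0,
 * so the blocks are preallocated without changing the file size.
 * "fd" is assumed to be an open descriptor on an XFS regular file.
 *
 *	xfs_flock64_t bf;
 *
 *	memset(&bf, 0, sizeof(bf));
 *	bf.l_whence = 0;			// SEEK_SET
 *	bf.l_start = 0;				// starting byte offset
 *	bf.l_len = 16 * 1024 * 1024;		// reserve 16MB
 *	if (ioctl(fd, XFS_IOC_RESVSP64, &bf) < 0)
 *		perror("XFS_IOC_RESVSP64");
 */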
4013STATIC int
4014xfs_alloc_file_space(
4015	xfs_inode_t		*ip,
4016	xfs_off_t		offset,
4017	xfs_off_t		len,
4018	int			alloc_type,
4019	int			attr_flags)
4020{
4021	xfs_mount_t		*mp = ip->i_mount;
4022	xfs_off_t		count;
4023	xfs_filblks_t		allocated_fsb;
4024	xfs_filblks_t		allocatesize_fsb;
4025	xfs_extlen_t		extsz, temp;
4026	xfs_fileoff_t		startoffset_fsb;
4027	xfs_fsblock_t		firstfsb;
4028	int			nimaps;
4029	int			bmapi_flag;
4030	int			quota_flag;
4031	int			rt;
4032	xfs_trans_t		*tp;
4033	xfs_bmbt_irec_t		imaps[1], *imapp;
4034	xfs_bmap_free_t		free_list;
4035	uint			qblocks, resblks, resrtextents;
4036	int			committed;
4037	int			error;
4038
4039	vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4040
4041	if (XFS_FORCED_SHUTDOWN(mp))
4042		return XFS_ERROR(EIO);
4043
4044	rt = XFS_IS_REALTIME_INODE(ip);
4045	if (unlikely(rt)) {
4046		if (!(extsz = ip->i_d.di_extsize))
4047			extsz = mp->m_sb.sb_rextsize;
4048	} else {
4049		extsz = ip->i_d.di_extsize;
4050	}
4051
4052	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4053		return error;
4054
4055	if (len <= 0)
4056		return XFS_ERROR(EINVAL);
4057
4058	count = len;
4059	error = 0;
4060	imapp = &imaps[0];
4061	nimaps = 1;
4062	bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
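	/*
	 * Round the start offset down (XFS_B_TO_FSBT truncates) and the
	 * byte count up (XFS_B_TO_FSB rounds up) so that every block
	 * touched by [offset, offset + len) is covered by the request.
	 */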
4063	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
4064	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4065
4066	/*	Generate a DMAPI event if needed.	*/
4067	if (alloc_type != 0 && offset < ip->i_d.di_size &&
4068			(attr_flags & ATTR_DMI) == 0 &&
4069			DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4070		xfs_off_t           end_dmi_offset;
4071
4072		end_dmi_offset = offset+len;
4073		if (end_dmi_offset > ip->i_d.di_size)
4074			end_dmi_offset = ip->i_d.di_size;
4075		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4076			offset, end_dmi_offset - offset,
4077			0, NULL);
4078		if (error)
4079			return error;
4080	}
4081
4082	/*
4083	 * Allocate file space until done or until there is an error
4084	 */
4085retry:
4086	while (allocatesize_fsb && !error) {
4087		xfs_fileoff_t	s, e;
4088
4089		/*
4090		 * Determine space reservations for data/realtime.
4091		 */
4092		if (unlikely(extsz)) {
4093			s = startoffset_fsb;
4094			do_div(s, extsz);
4095			s *= extsz;
4096			e = startoffset_fsb + allocatesize_fsb;
4097			if ((temp = do_mod(startoffset_fsb, extsz)))
4098				e += temp;
4099			if ((temp = do_mod(e, extsz)))
4100				e += extsz - temp;
4101		} else {
4102			s = 0;
4103			e = allocatesize_fsb;
4104		}
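		/*
		 * Worked example (a sketch): with extsz = 4,
		 * startoffset_fsb = 10 and allocatesize_fsb = 5, s rounds
		 * down to 8; e starts at 15, grows by the start
		 * misalignment (2) to 17, then rounds up to 20.  The
		 * reservation below, sized from e - s = 12 blocks, is
		 * thus conservative for any extent-size-aligned result.
		 */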
4105
4106		if (unlikely(rt)) {
4107			resrtextents = qblocks = (uint)(e - s);
4108			resrtextents /= mp->m_sb.sb_rextsize;
4109			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4110			quota_flag = XFS_QMOPT_RES_RTBLKS;
4111		} else {
4112			resrtextents = 0;
4113			resblks = qblocks =
4114				XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
4115			quota_flag = XFS_QMOPT_RES_REGBLKS;
4116		}
4117
4118		/*
4119		 * Allocate and setup the transaction.
4120		 */
4121		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4122		error = xfs_trans_reserve(tp, resblks,
4123					  XFS_WRITE_LOG_RES(mp), resrtextents,
4124					  XFS_TRANS_PERM_LOG_RES,
4125					  XFS_WRITE_LOG_COUNT);
4126		/*
4127		 * Check for running out of space
4128		 */
4129		if (error) {
4130			/*
4131			 * Free the transaction structure.
4132			 */
4133			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4134			xfs_trans_cancel(tp, 0);
4135			break;
4136		}
4137		xfs_ilock(ip, XFS_ILOCK_EXCL);
4138		error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
4139						      qblocks, 0, quota_flag);
4140		if (error)
4141			goto error1;
4142
4143		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4144		xfs_trans_ihold(tp, ip);
4145
4146		/*
4147		 * Issue the xfs_bmapi() call to allocate the blocks
4148		 */
4149		XFS_BMAP_INIT(&free_list, &firstfsb);
4150		error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4151				  allocatesize_fsb, bmapi_flag,
4152				  &firstfsb, 0, imapp, &nimaps,
4153				  &free_list, NULL);
4154		if (error) {
4155			goto error0;
4156		}
4157
4158		/*
4159		 * Complete the transaction
4160		 */
4161		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
4162		if (error) {
4163			goto error0;
4164		}
4165
4166		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
4167		xfs_iunlock(ip, XFS_ILOCK_EXCL);
4168		if (error) {
4169			break;
4170		}
4171
4172		allocated_fsb = imapp->br_blockcount;
4173
4174		if (nimaps == 0) {
4175			error = XFS_ERROR(ENOSPC);
4176			break;
4177		}
4178
4179		startoffset_fsb += allocated_fsb;
4180		allocatesize_fsb -= allocated_fsb;
4181	}
4182dmapi_enospc_check:
4183	if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
4184	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {
4185
4186		error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4187				XFS_ITOV(ip), DM_RIGHT_NULL,
4188				XFS_ITOV(ip), DM_RIGHT_NULL,
4189				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4190		if (error == 0)
4191			goto retry;	/* Maybe DMAPI app. has made space */
4192		/* else fall through with the error from XFS_SEND_NAMESP */
4193	}
4194
4195	return error;
4196
4197error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
4198	xfs_bmap_cancel(&free_list);
4199	XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
4200
4201error1:	/* Just cancel transaction */
4202	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4203	xfs_iunlock(ip, XFS_ILOCK_EXCL);
4204	goto dmapi_enospc_check;
4205}
4206
4207/*
4208 * Zero file bytes between startoff and endoff inclusive.
4209 * The iolock is held exclusive and no blocks are buffered.
4210 */
4211STATIC int
4212xfs_zero_remaining_bytes(
4213	xfs_inode_t		*ip,
4214	xfs_off_t		startoff,
4215	xfs_off_t		endoff)
4216{
4217	xfs_bmbt_irec_t		imap;
4218	xfs_fileoff_t		offset_fsb;
4219	xfs_off_t		lastoffset;
4220	xfs_off_t		offset;
4221	xfs_buf_t		*bp;
4222	xfs_mount_t		*mp = ip->i_mount;
4223	int			nimap;
4224	int			error = 0;
4225
4226	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4227				ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4228				mp->m_rtdev_targp : mp->m_ddev_targp);
4229
4230	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4231		offset_fsb = XFS_B_TO_FSBT(mp, offset);
4232		nimap = 1;
4233		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
4234			NULL, 0, &imap, &nimap, NULL, NULL);
4235		if (error || nimap < 1)
4236			break;
4237		ASSERT(imap.br_blockcount >= 1);
4238		ASSERT(imap.br_startoff == offset_fsb);
4239		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4240		if (lastoffset > endoff)
4241			lastoffset = endoff;
4242		if (imap.br_startblock == HOLESTARTBLOCK)
4243			continue;
4244		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4245		if (imap.br_state == XFS_EXT_UNWRITTEN)
4246			continue;
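		/*
		 * Read-modify-write cycle: read the block in from disk,
		 * zero the portion of it inside [offset, lastoffset],
		 * then write it back out synchronously.
		 */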
4247		XFS_BUF_UNDONE(bp);
4248		XFS_BUF_UNWRITE(bp);
4249		XFS_BUF_READ(bp);
4250		XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4251		xfsbdstrat(mp, bp);
4252		if ((error = xfs_iowait(bp))) {
4253			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4254					  mp, bp, XFS_BUF_ADDR(bp));
4255			break;
4256		}
4257		memset(XFS_BUF_PTR(bp) +
4258			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4259		      0, lastoffset - offset + 1);
4260		XFS_BUF_UNDONE(bp);
4261		XFS_BUF_UNREAD(bp);
4262		XFS_BUF_WRITE(bp);
4263		xfsbdstrat(mp, bp);
4264		if ((error = xfs_iowait(bp))) {
4265			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4266					  mp, bp, XFS_BUF_ADDR(bp));
4267			break;
4268		}
4269	}
4270	xfs_buf_free(bp);
4271	return error;
4272}
4273
4274/*
4275 * xfs_free_file_space()
4276 *      This routine frees disk space for the given file.
4277 *
4278 *	This routine is only called by xfs_change_file_space
4279 *	for an UNRESVSP type call.
4280 *
4281 * RETURNS:
4282 *       0 on success
4283 *      errno on error
4284 *
4285 */
4286STATIC int
4287xfs_free_file_space(
4288	xfs_inode_t		*ip,
4289	xfs_off_t		offset,
4290	xfs_off_t		len,
4291	int			attr_flags)
4292{
4293	xfs_vnode_t		*vp;
4294	int			committed;
4295	int			done;
4296	xfs_off_t		end_dmi_offset;
4297	xfs_fileoff_t		endoffset_fsb;
4298	int			error;
4299	xfs_fsblock_t		firstfsb;
4300	xfs_bmap_free_t		free_list;
4301	xfs_off_t		ilen;
4302	xfs_bmbt_irec_t		imap;
4303	xfs_off_t		ioffset;
4304	xfs_extlen_t		mod=0;
4305	xfs_mount_t		*mp;
4306	int			nimap;
4307	uint			resblks;
4308	int			rounding;
4309	int			rt;
4310	xfs_fileoff_t		startoffset_fsb;
4311	xfs_trans_t		*tp;
4312	int			need_iolock = 1;
4313
4314	vp = XFS_ITOV(ip);
4315	mp = ip->i_mount;
4316
4317	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4318
4319	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4320		return error;
4321
4322	error = 0;
4323	if (len <= 0)	/* if nothing being freed */
4324		return error;
4325	rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
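	/*
	 * When freeing, the start rounds up and the end rounds down so
	 * that only blocks lying wholly inside [offset, offset + len)
	 * are unmapped; the partial blocks at either edge are zeroed
	 * in place by xfs_zero_remaining_bytes() below.
	 */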
4326	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
4327	end_dmi_offset = offset + len;
4328	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4329
4330	if (offset < ip->i_d.di_size &&
4331	    (attr_flags & ATTR_DMI) == 0 &&
4332	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4333		if (end_dmi_offset > ip->i_d.di_size)
4334			end_dmi_offset = ip->i_d.di_size;
4335		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4336				offset, end_dmi_offset - offset,
4337				AT_DELAY_FLAG(attr_flags), NULL);
4338		if (error)
4339			return error;
4340	}
4341
4342	ASSERT(attr_flags & ATTR_NOLOCK ? attr_flags & ATTR_DMI : 1);
4343	if (attr_flags & ATTR_NOLOCK)
4344		need_iolock = 0;
4345	if (need_iolock) {
4346		xfs_ilock(ip, XFS_IOLOCK_EXCL);
4347		vn_iowait(vp);	/* wait for the completion of any pending DIOs */
4348	}
4349
4350	rounding = MAX((__uint32_t)(1 << mp->m_sb.sb_blocklog),
4351			(__uint32_t)NBPP);
4352	ilen = len + (offset & (rounding - 1));
4353	ioffset = offset & ~(rounding - 1);
4354	if (ilen & (rounding - 1))
4355		ilen = (ilen + rounding) & ~(rounding - 1);
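	/*
	 * Worked example (a sketch): with rounding = 4096, offset = 5000
	 * and len = 3000, ioffset becomes 4096 and ilen becomes 3904,
	 * which then rounds up to 4096; the cache invalidation below can
	 * therefore start on the page containing the first freed byte.
	 */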
4356
4357	if (VN_CACHED(vp) != 0) {
4358		xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
4359				ctooff(offtoct(ioffset)), -1);
4360		XVOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(ioffset)),
4361				-1, FI_REMAPF_LOCKED);
4362	}
4363
4364	/*
4365	 * Need to zero the stuff we're not freeing, on disk.
4366	 * Need to zero, on disk, the parts of the range we're not freeing.
4367	 * If it's a realtime file and we can't use unwritten extents then we
4368	 * will take care of it for us.
4369	 */
4370	if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4371		nimap = 1;
4372		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
4373			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4374		if (error)
4375			goto out_unlock_iolock;
4376		ASSERT(nimap == 0 || nimap == 1);
4377		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4378			xfs_daddr_t	block;
4379
4380			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4381			block = imap.br_startblock;
4382			mod = do_div(block, mp->m_sb.sb_rextsize);
4383			if (mod)
4384				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4385		}
4386		nimap = 1;
4387		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
4388			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4389		if (error)
4390			goto out_unlock_iolock;
4391		ASSERT(nimap == 0 || nimap == 1);
4392		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4393			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4394			mod++;
4395			if (mod && (mod != mp->m_sb.sb_rextsize))
4396				endoffset_fsb -= mod;
4397		}
4398	}
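	/*
	 * Shape of the work done below (a sketch):
	 *
	 *   offset                                        offset+len-1
	 *     |<- zero ->|<------ xfs_bunmapi ------>|<- zero ->|
	 *         startoffset_fsb            endoffset_fsb
	 *
	 * If no whole blocks fall inside the range (endoffset_fsb <=
	 * startoffset_fsb), the whole stretch is zeroed as one piece.
	 */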
4399	if ((done = (endoffset_fsb <= startoffset_fsb)))
4400		/*
4401		 * One contiguous piece to clear
4402		 */
4403		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4404	else {
4405		/*
4406		 * Some full blocks, possibly two pieces to clear
4407		 */
4408		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4409			error = xfs_zero_remaining_bytes(ip, offset,
4410				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4411		if (!error &&
4412		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4413			error = xfs_zero_remaining_bytes(ip,
4414				XFS_FSB_TO_B(mp, endoffset_fsb),
4415				offset + len - 1);
4416	}
4417
4418	/*
4419	 * free file space until done or until there is an error
4420	 */
4421	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4422	while (!error && !done) {
4423
4424		/*
4425		 * allocate and setup the transaction
4426		 */
4427		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4428		error = xfs_trans_reserve(tp,
4429					  resblks,
4430					  XFS_WRITE_LOG_RES(mp),
4431					  0,
4432					  XFS_TRANS_PERM_LOG_RES,
4433					  XFS_WRITE_LOG_COUNT);
4434
4435		/*
4436		 * check for running out of space
4437		 */
4438		if (error) {
4439			/*
4440			 * Free the transaction structure.
4441			 */
4442			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4443			xfs_trans_cancel(tp, 0);
4444			break;
4445		}
4446		xfs_ilock(ip, XFS_ILOCK_EXCL);
4447		error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4448				ip->i_udquot, ip->i_gdquot, resblks, 0,
4449				XFS_QMOPT_RES_REGBLKS);
4450		if (error)
4451			goto error1;
4452
4453		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4454		xfs_trans_ihold(tp, ip);
4455
4456		/*
4457		 * issue the bunmapi() call to free the blocks
4458		 */
4459		XFS_BMAP_INIT(&free_list, &firstfsb);
4460		error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4461				  endoffset_fsb - startoffset_fsb,
4462				  0, 2, &firstfsb, &free_list, NULL, &done);
4463		if (error) {
4464			goto error0;
4465		}
4466
4467		/*
4468		 * complete the transaction
4469		 */
4470		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
4471		if (error) {
4472			goto error0;
4473		}
4474
4475		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
4476		xfs_iunlock(ip, XFS_ILOCK_EXCL);
4477	}
4478
4479 out_unlock_iolock:
4480	if (need_iolock)
4481		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4482	return error;
4483
4484 error0:
4485	xfs_bmap_cancel(&free_list);
4486 error1:
4487	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4488	xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
4489		    XFS_ILOCK_EXCL);
4490	return error;
4491}
4492
4493/*
4494 * xfs_change_file_space()
4495 *      This routine allocates or frees disk space for the given file.
4496 *      The user-specified parameters are checked for alignment and size
4497 *      limitations.
4498 *
4499 * RETURNS:
4500 *       0 on success
4501 *      errno on error
4502 *
4503 */
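/*
 * Illustrative userspace entry point (a sketch, not part of this
 * file): XFS_IOC_FREESP64 trims the file to l_start, much as
 * ftruncate() would, while XFS_IOC_ALLOCSP64 grows it with zeroed,
 * allocated blocks.  "fd" is assumed to be an open descriptor on an
 * XFS regular file.
 *
 *	xfs_flock64_t bf;
 *
 *	memset(&bf, 0, sizeof(bf));
 *	bf.l_whence = 0;		// SEEK_SET
 *	bf.l_start = 4096;		// new file size in bytes
 *	if (ioctl(fd, XFS_IOC_FREESP64, &bf) < 0)
 *		perror("XFS_IOC_FREESP64");
 */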
4504int
4505xfs_change_file_space(
4506	bhv_desc_t	*bdp,
4507	int		cmd,
4508	xfs_flock64_t	*bf,
4509	xfs_off_t	offset,
4510	cred_t		*credp,
4511	int		attr_flags)
4512{
4513	int		clrprealloc;
4514	int		error;
4515	xfs_fsize_t	fsize;
4516	xfs_inode_t	*ip;
4517	xfs_mount_t	*mp;
4518	int		setprealloc;
4519	xfs_off_t	startoffset;
4520	xfs_off_t	llen;
4521	xfs_trans_t	*tp;
4522	xfs_vattr_t	va;
4523	xfs_vnode_t	*vp;
4524
4525	vp = BHV_TO_VNODE(bdp);
4526	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4527
4528	ip = XFS_BHVTOI(bdp);
4529	mp = ip->i_mount;
4530
4531	/*
4532	 * must be a regular file and have write permission
4533	 */
4534	if (!VN_ISREG(vp))
4535		return XFS_ERROR(EINVAL);
4536
4537	xfs_ilock(ip, XFS_ILOCK_SHARED);
4538
4539	if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
4540		xfs_iunlock(ip, XFS_ILOCK_SHARED);
4541		return error;
4542	}
4543
4544	xfs_iunlock(ip, XFS_ILOCK_SHARED);
4545
4546	switch (bf->l_whence) {
4547	case 0: /*SEEK_SET*/
4548		break;
4549	case 1: /*SEEK_CUR*/
4550		bf->l_start += offset;
4551		break;
4552	case 2: /*SEEK_END*/
4553		bf->l_start += ip->i_d.di_size;
4554		break;
4555	default:
4556		return XFS_ERROR(EINVAL);
4557	}
4558
4559	llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4560
4561	if (   (bf->l_start < 0)
4562	    || (bf->l_start > XFS_MAXIOFFSET(mp))
4563	    || (bf->l_start + llen < 0)
4564	    || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4565		return XFS_ERROR(EINVAL);
4566
4567	bf->l_whence = 0;
4568
4569	startoffset = bf->l_start;
4570	fsize = ip->i_d.di_size;
4571
4572	/*
4573	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4574	 * file space.
4575	 * These calls do NOT zero the data space allocated to the file,
4576	 * nor do they change the file size.
4577	 *
4578	 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4579	 * space.
4580	 * These calls cause the new file data to be zeroed and the file
4581	 * size to be changed.
4582	 */
4583	setprealloc = clrprealloc = 0;
4584
4585	switch (cmd) {
4586	case XFS_IOC_RESVSP:
4587	case XFS_IOC_RESVSP64:
4588		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4589								1, attr_flags);
4590		if (error)
4591			return error;
4592		setprealloc = 1;
4593		break;
4594
4595	case XFS_IOC_UNRESVSP:
4596	case XFS_IOC_UNRESVSP64:
4597		if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4598								attr_flags)))
4599			return error;
4600		break;
4601
4602	case XFS_IOC_ALLOCSP:
4603	case XFS_IOC_ALLOCSP64:
4604	case XFS_IOC_FREESP:
4605	case XFS_IOC_FREESP64:
4606		if (startoffset > fsize) {
4607			error = xfs_alloc_file_space(ip, fsize,
4608					startoffset - fsize, 0, attr_flags);
4609			if (error)
4610				break;
4611		}
4612
4613		va.va_mask = XFS_AT_SIZE;
4614		va.va_size = startoffset;
4615
4616		error = xfs_setattr(bdp, &va, attr_flags, credp);
4617
4618		if (error)
4619			return error;
4620
4621		clrprealloc = 1;
4622		break;
4623
4624	default:
4625		ASSERT(0);
4626		return XFS_ERROR(EINVAL);
4627	}
4628
4629	/*
4630	 * update the inode timestamp, mode, and prealloc flag bits
4631	 */
4632	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4633
4634	if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4635				      0, 0, 0))) {
4636		/* ASSERT(0); */
4637		xfs_trans_cancel(tp, 0);
4638		return error;
4639	}
4640
4641	xfs_ilock(ip, XFS_ILOCK_EXCL);
4642
4643	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4644	xfs_trans_ihold(tp, ip);
4645
4646	if ((attr_flags & ATTR_DMI) == 0) {
4647		ip->i_d.di_mode &= ~S_ISUID;
4648
4649		/*
4650		 * Note that we don't have to worry about mandatory
4651		 * file locking being disabled here because we only
4652		 * clear the S_ISGID bit if the Group execute bit is
4653		 * on, but if it was on then mandatory locking wouldn't
4654		 * have been enabled.
4655		 */
4656		if (ip->i_d.di_mode & S_IXGRP)
4657			ip->i_d.di_mode &= ~S_ISGID;
4658
4659		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4660	}
4661	if (setprealloc)
4662		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4663	else if (clrprealloc)
4664		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4665
4666	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4667	xfs_trans_set_sync(tp);
4668
4669	error = xfs_trans_commit(tp, 0, NULL);
4670
4671	xfs_iunlock(ip, XFS_ILOCK_EXCL);
4672
4673	return error;
4674}
4675
4676
4677xfs_vnodeops_t xfs_vnodeops = {
4678	BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
4679	.vop_open		= xfs_open,
4680	.vop_read		= xfs_read,
4681#ifdef HAVE_SENDFILE
4682	.vop_sendfile		= xfs_sendfile,
4683#endif
4684	.vop_write		= xfs_write,
4685	.vop_ioctl		= xfs_ioctl,
4686	.vop_getattr		= xfs_getattr,
4687	.vop_setattr		= xfs_setattr,
4688	.vop_access		= xfs_access,
4689	.vop_lookup		= xfs_lookup,
4690	.vop_create		= xfs_create,
4691	.vop_remove		= xfs_remove,
4692	.vop_link		= xfs_link,
4693	.vop_rename		= xfs_rename,
4694	.vop_mkdir		= xfs_mkdir,
4695	.vop_rmdir		= xfs_rmdir,
4696	.vop_readdir		= xfs_readdir,
4697	.vop_symlink		= xfs_symlink,
4698	.vop_readlink		= xfs_readlink,
4699	.vop_fsync		= xfs_fsync,
4700	.vop_inactive		= xfs_inactive,
4701	.vop_fid2		= xfs_fid2,
4702	.vop_rwlock		= xfs_rwlock,
4703	.vop_rwunlock		= xfs_rwunlock,
4704	.vop_bmap		= xfs_bmap,
4705	.vop_reclaim		= xfs_reclaim,
4706	.vop_attr_get		= xfs_attr_get,
4707	.vop_attr_set		= xfs_attr_set,
4708	.vop_attr_remove	= xfs_attr_remove,
4709	.vop_attr_list		= xfs_attr_list,
4710	.vop_link_removed	= (xfs_vop_link_removed_t)fs_noval,
4711	.vop_vnode_change	= (xfs_vop_vnode_change_t)fs_noval,
4712	.vop_tosspages		= fs_tosspages,
4713	.vop_flushinval_pages	= fs_flushinval_pages,
4714	.vop_flush_pages	= fs_flush_pages,
4715	.vop_release		= xfs_release,
4716	.vop_iflush		= xfs_inode_flush,
4717};
4718