1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_types.h"
22#include "xfs_bit.h"
23#include "xfs_log.h"
24#include "xfs_inum.h"
25#include "xfs_trans.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_dir.h"
29#include "xfs_dir2.h"
30#include "xfs_dmapi.h"
31#include "xfs_mount.h"
32#include "xfs_da_btree.h"
33#include "xfs_bmap_btree.h"
34#include "xfs_alloc_btree.h"
35#include "xfs_ialloc_btree.h"
36#include "xfs_dir_sf.h"
37#include "xfs_dir2_sf.h"
38#include "xfs_attr_sf.h"
39#include "xfs_dinode.h"
40#include "xfs_inode.h"
41#include "xfs_inode_item.h"
42#include "xfs_dir_leaf.h"
43#include "xfs_itable.h"
44#include "xfs_btree.h"
45#include "xfs_ialloc.h"
46#include "xfs_alloc.h"
47#include "xfs_bmap.h"
48#include "xfs_attr.h"
49#include "xfs_rw.h"
50#include "xfs_error.h"
51#include "xfs_quota.h"
52#include "xfs_utils.h"
53#include "xfs_rtalloc.h"
54#include "xfs_refcache.h"
55#include "xfs_trans_space.h"
56#include "xfs_log_priv.h"
57#include "xfs_mac.h"
58
59#include "xfs_fs.h"
60
61/*
62 * The maximum pathlen is 1024 bytes. Since the minimum file system
63 * blocksize is 512 bytes, we can get a max of 2 extents back from
64 * bmapi.
65 */
66#define SYMLINK_MAPS 2
67
68/*
69 * For xfs, we check that the file isn't too big to be opened by this kernel.
70 * No other open action is required for regular files.  Devices are handled
71 * through the specfs file system, pipes through fifofs.  Device and
72 * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
73 * when a new vnode is first looked up or created.
74 */
75STATIC int
76xfs_open(
77	bhv_desc_t	*bdp,
78	cred_t		*credp)
79{
80	int		mode;
81	xfs_vnode_t	*vp;
82	xfs_inode_t	*ip;
83
84	vp = BHV_TO_VNODE(bdp);
85	ip = XFS_BHVTOI(bdp);
86
87	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
88		return XFS_ERROR(EIO);
89
90	/*
91	 * If it's a directory with any blocks, read-ahead block 0
92	 * as we're almost certain to have the next operation be a read there.
93	 */
94	if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
95		mode = xfs_ilock_map_shared(ip);
96		if (ip->i_d.di_nextents > 0)
97			(void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
98		xfs_iunlock(ip, mode);
99	}
100	return 0;
101}
102
103
/*
 * xfs_getattr
 *
 * Copy the attributes requested in vap->va_mask out of the in-core
 * inode into *vap.  Fields are filled in stages, with an early exit
 * (goto all_done) as soon as everything the caller asked for has been
 * supplied.  The inode lock is held shared for the duration unless
 * the caller passed ATTR_LAZY.
 */
STATIC int
xfs_getattr(
	bhv_desc_t	*bdp,
	xfs_vattr_t	*vap,
	int		flags,
	cred_t		*credp)
{
	xfs_inode_t	*ip;
	xfs_mount_t	*mp;
	xfs_vnode_t	*vp;

	vp  = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);
	mp = ip->i_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	if (!(flags & ATTR_LAZY))
		xfs_ilock(ip, XFS_ILOCK_SHARED);

	vap->va_size = ip->i_d.di_size;
	if (vap->va_mask == XFS_AT_SIZE)
		goto all_done;

	/* di_nblocks excludes delalloc blocks, so add those in here. */
	vap->va_nblocks =
		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
	vap->va_nodeid = ip->i_ino;
#if XFS_BIG_INUMS
	vap->va_nodeid += mp->m_inoadd;
#endif
	vap->va_nlink = ip->i_d.di_nlink;

	/*
	 * Quick exit for non-stat callers
	 */
	if ((vap->va_mask &
	    ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
	      XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
		goto all_done;

	/*
	 * Copy from in-core inode.
	 */
	vap->va_mode = ip->i_d.di_mode;
	vap->va_uid = ip->i_d.di_uid;
	vap->va_gid = ip->i_d.di_gid;
	vap->va_projid = ip->i_d.di_projid;

	/*
	 * Check vnode type block/char vs. everything else.
	 */
	switch (ip->i_d.di_mode & S_IFMT) {
	case S_IFBLK:
	case S_IFCHR:
		vap->va_rdev = ip->i_df.if_u2.if_rdev;
		vap->va_blocksize = BLKDEV_IOSIZE;
		break;
	default:
		vap->va_rdev = 0;

		if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
			vap->va_blocksize = xfs_preferred_iosize(mp);
		} else {

			/*
			 * If the file blocks are being allocated from a
			 * realtime partition, then return the inode's
			 * realtime extent size or the realtime volume's
			 * extent size.
			 */
			vap->va_blocksize = ip->i_d.di_extsize ?
				(ip->i_d.di_extsize << mp->m_sb.sb_blocklog) :
				(mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
		}
		break;
	}

	vn_atime_to_timespec(vp, &vap->va_atime);
	vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
	vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
	vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
	vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;

	/*
	 * Exit for stat callers.  See if any of the rest of the fields
	 * to be filled in are needed.
	 */
	if ((vap->va_mask &
	     (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
	      XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
		goto all_done;

	/*
	 * Convert di_flags to xflags.
	 */
	vap->va_xflags = xfs_ip2xflags(ip);

	/*
	 * Exit for inode revalidate.  See if any of the rest of
	 * the fields to be filled in are needed.
	 */
	if ((vap->va_mask &
	     (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
	      XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
		goto all_done;

	vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
	/*
	 * If the extent list is in core, count the records directly;
	 * otherwise fall back to the on-disk extent count.
	 */
	vap->va_nextents =
		(ip->i_df.if_flags & XFS_IFEXTENTS) ?
			ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
			ip->i_d.di_nextents;
	/* Attribute-fork extent count, if an attr fork exists at all. */
	if (ip->i_afp)
		vap->va_anextents =
			(ip->i_afp->if_flags & XFS_IFEXTENTS) ?
				ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
				 ip->i_d.di_anextents;
	else
		vap->va_anextents = 0;
	vap->va_gen = ip->i_d.di_gen;

 all_done:
	if (!(flags & ATTR_LAZY))
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return 0;
}
235
236
/*
 * xfs_setattr
 *
 * Apply the attribute changes named in vap->va_mask to the inode
 * behind bdp: size (truncate/grow), ownership, mode, timestamps,
 * extent size and inode flags.  Performs quota reservation/chown
 * accounting and DMAPI event delivery where applicable.  Pure
 * timestamp updates (XFS_AT_UPDTIMES) are handled without a
 * transaction; everything else runs inside one.  On error, cleanup
 * funnels through abort_return/error_return, which cancel the
 * transaction, release dquot references and drop any held locks.
 */
int
xfs_setattr(
	bhv_desc_t		*bdp,
	xfs_vattr_t		*vap,
	int			flags,
	cred_t			*credp)
{
	xfs_inode_t		*ip;
	xfs_trans_t		*tp;
	xfs_mount_t		*mp;
	int			mask;
	int			code;
	uint			lock_flags;
	uint			commit_flags=0;
	uid_t			uid=0, iuid=0;
	gid_t			gid=0, igid=0;
	int			timeflags = 0;
	xfs_vnode_t		*vp;
	xfs_prid_t		projid=0, iprojid=0;
	int			mandlock_before, mandlock_after;
	struct xfs_dquot	*udqp, *gdqp, *olddquot1, *olddquot2;
	int			file_owner;
	int			need_iolock = 1;

	vp = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
		return XFS_ERROR(EROFS);

	/*
	 * Cannot set certain attributes.
	 */
	mask = vap->va_mask;
	if (mask & XFS_AT_NOSET) {
		return XFS_ERROR(EINVAL);
	}

	ip = XFS_BHVTOI(bdp);
	mp = ip->i_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/*
	 * Timestamps do not need to be logged and hence do not
	 * need to be done within a transaction.
	 */
	if (mask & XFS_AT_UPDTIMES) {
		ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
		timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
			    ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
			    ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
		xfs_ichgtime(ip, timeflags);
		return 0;
	}

	olddquot1 = olddquot2 = NULL;
	udqp = gdqp = NULL;

	/*
	 * If disk quotas is on, we make sure that the dquots do exist on disk,
	 * before we start any other transactions. Trying to do this later
	 * is messy. We don't care to take a readlock to look at the ids
	 * in inode here, because we can't hold it across the trans_reserve.
	 * If the IDs do change before we take the ilock, we're covered
	 * because the i_*dquot fields will get updated anyway.
	 */
	if (XFS_IS_QUOTA_ON(mp) &&
	    (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
		uint	qflags = 0;

		if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
			uid = vap->va_uid;
			qflags |= XFS_QMOPT_UQUOTA;
		} else {
			uid = ip->i_d.di_uid;
		}
		if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
			gid = vap->va_gid;
			qflags |= XFS_QMOPT_GQUOTA;
		}  else {
			gid = ip->i_d.di_gid;
		}
		if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
			projid = vap->va_projid;
			qflags |= XFS_QMOPT_PQUOTA;
		}  else {
			projid = ip->i_d.di_projid;
		}
		/*
		 * We take a reference when we initialize udqp and gdqp,
		 * so it is important that we never blindly double trip on
		 * the same variable. See xfs_create() for an example.
		 */
		ASSERT(udqp == NULL);
		ASSERT(gdqp == NULL);
		code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
					 &udqp, &gdqp);
		if (code)
			return code;
	}

	/*
	 * For the other attributes, we acquire the inode lock and
	 * first do an error checking pass.
	 */
	tp = NULL;
	lock_flags = XFS_ILOCK_EXCL;
	ASSERT(flags & ATTR_NOLOCK ? flags & ATTR_DMI : 1);
	if (flags & ATTR_NOLOCK)
		need_iolock = 0;
	if (!(mask & XFS_AT_SIZE)) {
		if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
		    (mp->m_flags & XFS_MOUNT_WSYNC)) {
			tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
			commit_flags = 0;
			if ((code = xfs_trans_reserve(tp, 0,
						     XFS_ICHANGE_LOG_RES(mp), 0,
						     0, 0))) {
				lock_flags = 0;
				goto error_return;
			}
		}
	} else {
		/*
		 * Size changes get a DMAPI truncate event first (unless
		 * the caller is itself a DMI function) and take the
		 * iolock as well as the ilock.
		 */
		if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
		    !(flags & ATTR_DMI)) {
			int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
			code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
				vap->va_size, 0, dmflags, NULL);
			if (code) {
				lock_flags = 0;
				goto error_return;
			}
		}
		if (need_iolock)
			lock_flags |= XFS_IOLOCK_EXCL;
	}

	xfs_ilock(ip, lock_flags);

	/* boolean: are we the file owner? */
	/*
	 * NOTE(review): port artifact -- the Linux current_fsuid() path
	 * is compiled out; the cred-based comparison below is what runs.
	 */
#if 0
	file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
#else
	file_owner = (credp->cr_uid == ip->i_d.di_uid);
#endif

	/*
	 * Change various properties of a file.
	 * Only the owner or users with CAP_FOWNER
	 * capability may do these things.
	 */
	if (mask &
	    (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
	     XFS_AT_GID|XFS_AT_PROJID)) {
		/*
		 * CAP_FOWNER overrides the following restrictions:
		 *
		 * The user ID of the calling process must be equal
		 * to the file owner ID, except in cases where the
		 * CAP_FSETID capability is applicable.
		 */
		if (!file_owner && !capable(CAP_FOWNER)) {
			code = XFS_ERROR(EPERM);
			goto error_return;
		}

		/*
		 * CAP_FSETID overrides the following restrictions:
		 *
		 * The effective user ID of the calling process shall match
		 * the file owner when setting the set-user-ID and
		 * set-group-ID bits on that file.
		 *
		 * The effective group ID or one of the supplementary group
		 * IDs of the calling process shall match the group owner of
		 * the file when setting the set-group-ID bit on that file
		 */
		if (mask & XFS_AT_MODE) {
			mode_t m = 0;

			if ((vap->va_mode & S_ISUID) && !file_owner)
				m |= S_ISUID;
			if ((vap->va_mode & S_ISGID) &&
			    !groupmember((gid_t)ip->i_d.di_gid, credp))
				m |= S_ISGID;
#if 1
			/* Linux allows this, Irix doesn't. */
			if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
				m |= S_ISVTX;
#endif
			/* Strip the bits the caller may not set. */
			if (m && !capable(CAP_FSETID))
				vap->va_mode &= ~m;
		}
	}

	/*
	 * Change file ownership.  Must be the owner or privileged.
	 * If the system was configured with the "restricted_chown"
	 * option, the owner is not permitted to give away the file,
	 * and can change the group id only to a group of which he
	 * or she is a member.
	 */
	if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
		/*
		 * These IDs could have changed since we last looked at them.
		 * But, we're assured that if the ownership did change
		 * while we didn't have the inode locked, inode's dquot(s)
		 * would have changed also.
		 */
		iuid = ip->i_d.di_uid;
		iprojid = ip->i_d.di_projid;
		igid = ip->i_d.di_gid;
		gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
		uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;

		projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
			 iprojid;

		/*
		 * CAP_CHOWN overrides the following restrictions:
		 *
		 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
		 * shall override the restriction that a process cannot
		 * change the user ID of a file it owns and the restriction
		 * that the group ID supplied to the chown() function
		 * shall be equal to either the group ID or one of the
		 * supplementary group IDs of the calling process.
		 */
		if (restricted_chown &&
		    (iuid != uid || (igid != gid &&
				     !groupmember((gid_t)gid, credp))) &&
		    !capable(CAP_CHOWN)) {
			code = XFS_ERROR(EPERM);
			goto error_return;
		}
		/*
		 * Do a quota reservation only if uid/projid/gid is actually
		 * going to change.
		 */
		if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
		    (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
		    (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
			ASSERT(tp);
			code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
						capable(CAP_FOWNER) ?
						XFS_QMOPT_FORCE_RES : 0);
			if (code)	/* out of quota */
				goto error_return;
		}
	}

	/*
	 * Truncate file.  Must have write permission and not be a directory.
	 */
	if (mask & XFS_AT_SIZE) {
		/* Short circuit the truncate case for zero length files */
		if ((vap->va_size == 0) &&
		   (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			lock_flags &= ~XFS_ILOCK_EXCL;
			if (mask & XFS_AT_CTIME)
				xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
			code = 0;
			/* error_return here is the success path: code == 0 */
			goto error_return;
		}

		if (VN_ISDIR(vp)) {
			code = XFS_ERROR(EISDIR);
			goto error_return;
		} else if (!VN_ISREG(vp)) {
			code = XFS_ERROR(EINVAL);
			goto error_return;
		}
		/*
		 * Make sure that the dquots are attached to the inode.
		 */
		if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
			goto error_return;
	}

	/*
	 * Change file access or modified times.
	 */
	if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
		if (!file_owner) {
			if ((flags & ATTR_UTIME) &&
			    !capable(CAP_FOWNER)) {
				code = XFS_ERROR(EPERM);
				goto error_return;
			}
		}
	}

	/*
	 * Change extent size or realtime flag.
	 */
	if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
		/*
		 * Can't change extent size if any extents are allocated.
		 */
		if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
		     vap->va_extsize) ) {
			code = XFS_ERROR(EINVAL);	/* EFBIG? */
			goto error_return;
		}
		/*
		 * Can't change realtime flag if any extents are allocated.
		 */
		if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
		    (mask & XFS_AT_XFLAGS) &&
		    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
			code = XFS_ERROR(EINVAL);	/* EFBIG? */
			goto error_return;
		}

		/*
		 * Extent size must be a multiple of the appropriate block
		 * size, if set at all.
		 */
		if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
			xfs_extlen_t	size;

			if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
			    ((mask & XFS_AT_XFLAGS) &&
			    (vap->va_xflags & XFS_XFLAG_REALTIME))) {
				size = mp->m_sb.sb_rextsize <<
				       mp->m_sb.sb_blocklog;
			} else {
				size = mp->m_sb.sb_blocksize;
			}
			if (vap->va_extsize % size) {
				code = XFS_ERROR(EINVAL);
				goto error_return;
			}
		}
		/*
		 * If realtime flag is set then must have realtime data.
		 */
		if ((mask & XFS_AT_XFLAGS) &&
		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
			if ((mp->m_sb.sb_rblocks == 0) ||
			    (mp->m_sb.sb_rextsize == 0) ||
			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
				code = XFS_ERROR(EINVAL);
				goto error_return;
			}
		}

		/*
		 * Can't modify an immutable/append-only file unless
		 * we have appropriate permission.
		 */
		if ((mask & XFS_AT_XFLAGS) &&
		    (ip->i_d.di_flags &
				(XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
		     (vap->va_xflags &
				(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
		    !capable(CAP_LINUX_IMMUTABLE)) {
			code = XFS_ERROR(EPERM);
			goto error_return;
		}
	}

	/*
	 * Now we can make the changes.  Before we join the inode
	 * to the transaction, if XFS_AT_SIZE is set then take care of
	 * the part of the truncation that must be done without the
	 * inode lock.  This needs to be done before joining the inode
	 * to the transaction, because the inode cannot be unlocked
	 * once it is a part of the transaction.
	 */
	if (mask & XFS_AT_SIZE) {
		code = 0;
		if ((vap->va_size > ip->i_d.di_size) &&
		    (flags & ATTR_NOSIZETOK) == 0) {
			code = xfs_igrow_start(ip, vap->va_size, credp);
		}
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		vn_iowait(vp); /* wait for the completion of any pending DIOs */
		if (!code)
			code = xfs_itruncate_data(ip, vap->va_size);
		if (code) {
			ASSERT(tp == NULL);
			lock_flags &= ~XFS_ILOCK_EXCL;
			ASSERT(lock_flags == XFS_IOLOCK_EXCL);
			goto error_return;
		}
		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
		if ((code = xfs_trans_reserve(tp, 0,
					     XFS_ITRUNCATE_LOG_RES(mp), 0,
					     XFS_TRANS_PERM_LOG_RES,
					     XFS_ITRUNCATE_LOG_COUNT))) {
			xfs_trans_cancel(tp, 0);
			if (need_iolock)
				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return code;
		}
		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
		xfs_ilock(ip, XFS_ILOCK_EXCL);
	}

	if (tp) {
		xfs_trans_ijoin(tp, ip, lock_flags);
		xfs_trans_ihold(tp, ip);
	}

	/* determine whether mandatory locking mode changes */
	mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);

	/*
	 * Truncate file.  Must have write permission and not be a directory.
	 */
	if (mask & XFS_AT_SIZE) {
		if (vap->va_size > ip->i_d.di_size) {
			xfs_igrow_finish(tp, ip, vap->va_size,
			    !(flags & ATTR_DMI));
		} else if ((vap->va_size <= ip->i_d.di_size) ||
			   ((vap->va_size == 0) && ip->i_d.di_nextents)) {
			/*
			 * signal a sync transaction unless
			 * we're truncating an already unlinked
			 * file on a wsync filesystem
			 */
			code = xfs_itruncate_finish(&tp, ip,
					    (xfs_fsize_t)vap->va_size,
					    XFS_DATA_FORK,
					    ((ip->i_d.di_nlink != 0 ||
					      !(mp->m_flags & XFS_MOUNT_WSYNC))
					     ? 1 : 0));
			if (code) {
				goto abort_return;
			}
		}
		/*
		 * Have to do this even if the file's size doesn't change.
		 */
		timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
	}

	/*
	 * Change file access modes.
	 */
	if (mask & XFS_AT_MODE) {
		/* Keep the file-type bits, replace the permission bits. */
		ip->i_d.di_mode &= S_IFMT;
		ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;

		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
		timeflags |= XFS_ICHGTIME_CHG;
	}

	/*
	 * Change file ownership.  Must be the owner or privileged.
	 * If the system was configured with the "restricted_chown"
	 * option, the owner is not permitted to give away the file,
	 * and can change the group id only to a group of which he
	 * or she is a member.
	 */
	if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
		/*
		 * CAP_FSETID overrides the following restrictions:
		 *
		 * The set-user-ID and set-group-ID bits of a file will be
		 * cleared upon successful return from chown()
		 */
		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
		    !capable(CAP_FSETID)) {
			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
		}

		/*
		 * Change the ownerships and register quota modifications
		 * in the transaction.
		 */
		if (iuid != uid) {
			if (XFS_IS_UQUOTA_ON(mp)) {
				ASSERT(mask & XFS_AT_UID);
				ASSERT(udqp);
				olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
							&ip->i_udquot, udqp);
			}
			ip->i_d.di_uid = uid;
		}
		if (igid != gid) {
			if (XFS_IS_GQUOTA_ON(mp)) {
				ASSERT(!XFS_IS_PQUOTA_ON(mp));
				ASSERT(mask & XFS_AT_GID);
				ASSERT(gdqp);
				olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
							&ip->i_gdquot, gdqp);
			}
			ip->i_d.di_gid = gid;
		}
		/*
		 * Group and project quota share the gdqp slot; the
		 * asserts above/below enforce that only one is active.
		 */
		if (iprojid != projid) {
			if (XFS_IS_PQUOTA_ON(mp)) {
				ASSERT(!XFS_IS_GQUOTA_ON(mp));
				ASSERT(mask & XFS_AT_PROJID);
				ASSERT(gdqp);
				olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
							&ip->i_gdquot, gdqp);
			}
			ip->i_d.di_projid = projid;
			/*
			 * We may have to rev the inode as well as
			 * the superblock version number since projids didn't
			 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
			 */
			if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
				xfs_bump_ino_vers2(tp, ip);
		}

		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
		timeflags |= XFS_ICHGTIME_CHG;
	}


	/*
	 * Change file access or modified times.
	 */
	if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
		if (mask & XFS_AT_ATIME) {
			ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
			ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
			ip->i_update_core = 1;
			/*
			 * NOTE(review): clearing XFS_ICHGTIME_ACC is
			 * deliberately left disabled here -- TODO confirm
			 * against the upstream port this was taken from.
			 */
			//timeflags &= ~XFS_ICHGTIME_ACC;
		}
		if (mask & XFS_AT_MTIME) {
			ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
			ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
			timeflags &= ~XFS_ICHGTIME_MOD;
			timeflags |= XFS_ICHGTIME_CHG;
		}
		if (tp && (flags & ATTR_UTIME))
			xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
	}

	/*
	 * Change XFS-added attributes.
	 */
	if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
		if (mask & XFS_AT_EXTSIZE) {
			/*
			 * Converting bytes to fs blocks.
			 */
			ip->i_d.di_extsize = vap->va_extsize >>
				mp->m_sb.sb_blocklog;
		}
		if (mask & XFS_AT_XFLAGS) {
			uint	di_flags;

			/* can't set PREALLOC this way, just preserve it */
			di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
			if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
				di_flags |= XFS_DIFLAG_IMMUTABLE;
			if (vap->va_xflags & XFS_XFLAG_APPEND)
				di_flags |= XFS_DIFLAG_APPEND;
			if (vap->va_xflags & XFS_XFLAG_SYNC)
				di_flags |= XFS_DIFLAG_SYNC;
			if (vap->va_xflags & XFS_XFLAG_NOATIME)
				di_flags |= XFS_DIFLAG_NOATIME;
			if (vap->va_xflags & XFS_XFLAG_NODUMP)
				di_flags |= XFS_DIFLAG_NODUMP;
			if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
				di_flags |= XFS_DIFLAG_PROJINHERIT;
			/* Some flags only make sense for dirs, others for regular files. */
			if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
				if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_RTINHERIT;
				if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
					di_flags |= XFS_DIFLAG_NOSYMLINKS;
				if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
			} else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
				if (vap->va_xflags & XFS_XFLAG_REALTIME) {
					di_flags |= XFS_DIFLAG_REALTIME;
					ip->i_iocore.io_flags |= XFS_IOCORE_RT;
				} else {
					ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
				}
				if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
					di_flags |= XFS_DIFLAG_EXTSIZE;
			}
			ip->i_d.di_flags = di_flags;
		}
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		timeflags |= XFS_ICHGTIME_CHG;
	}

	/*
	 * Change file inode change time only if XFS_AT_CTIME set
	 * AND we have been called by a DMI function.
	 */

	if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
		ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
		ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
		ip->i_update_core = 1;
		timeflags &= ~XFS_ICHGTIME_CHG;
	}

	/*
	 * Send out timestamp changes that need to be set to the
	 * current time.  Not done when called by a DMI function.
	 */
	if (timeflags && !(flags & ATTR_DMI))
		xfs_ichgtime(ip, timeflags);

	XFS_STATS_INC(xs_ig_attrchg);

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 * This is slightly sub-optimal in that truncates require
	 * two sync transactions instead of one for wsync filesystems.
	 * One for the truncate and one for the timestamps since we
	 * don't want to change the timestamps unless we're sure the
	 * truncate worked.  Truncates are less than 1% of the laddis
	 * mix so this probably isn't worth the trouble to optimize.
	 */
	code = 0;
	if (tp) {
		if (mp->m_flags & XFS_MOUNT_WSYNC)
			xfs_trans_set_sync(tp);

		code = xfs_trans_commit(tp, commit_flags, NULL);
	}

	/*
	 * If the (regular) file's mandatory locking mode changed, then
	 * notify the vnode.  We do this under the inode lock to prevent
	 * racing calls to vop_vnode_change.
	 */
	mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
	if (mandlock_before != mandlock_after) {
		XVOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
				 mandlock_after);
	}

	xfs_iunlock(ip, lock_flags);

	/*
	 * Release any dquot(s) the inode had kept before chown.
	 */
	XFS_QM_DQRELE(mp, olddquot1);
	XFS_QM_DQRELE(mp, olddquot2);
	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);

	if (code) {
		return code;
	}

	/*
	 * Notify DMAPI of the attribute change (unless we were called
	 * by a DMI function ourselves).  The result is ignored.
	 */
	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
	    !(flags & ATTR_DMI)) {
		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
					NULL, DM_RIGHT_NULL, NULL, NULL,
					0, 0, AT_DELAY_FLAG(flags));
	}
	return 0;

 abort_return:
	commit_flags |= XFS_TRANS_ABORT;
	/* FALLTHROUGH */
 error_return:
	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);
	if (tp) {
		xfs_trans_cancel(tp, commit_flags);
	}
	if (lock_flags != 0) {
		xfs_iunlock(ip, lock_flags);
	}
	return code;
}
916
917
918/*
919 * xfs_access
920 * Null conversion from vnode mode bits to inode mode bits, as in efs.
921 */
922STATIC int
923xfs_access(
924	bhv_desc_t	*bdp,
925	accmode_t	accmode,
926	cred_t		*credp)
927{
928	xfs_inode_t	*ip;
929	int		error;
930
931	vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
932					       (inst_t *)__return_address);
933
934	ip = XFS_BHVTOI(bdp);
935	xfs_ilock(ip, XFS_ILOCK_SHARED);
936	error = xfs_iaccess(ip, accmode, credp);
937	xfs_iunlock(ip, XFS_ILOCK_SHARED);
938	return error;
939}
940
941
/*
 * xfs_readlink
 *
 * Copy the target path of a symbolic link into the caller's uio.
 * Handles both inline (shortform) links stored directly in the inode
 * literal area and links stored in data blocks -- at most SYMLINK_MAPS
 * extents, since the max pathlen is 1024 bytes and the minimum block
 * size is 512 (see the comment at the SYMLINK_MAPS definition).
 */
STATIC int
xfs_readlink(
	bhv_desc_t	*bdp,
	uio_t		*uiop,
	int		ioflags,
	cred_t		*credp)
{
	xfs_inode_t     *ip;
	int		count;
	xfs_off_t	offset;
	int		pathlen;
	xfs_vnode_t	*vp;
	int		error = 0;
	xfs_mount_t	*mp;
	int             nmaps;
	xfs_bmbt_irec_t mval[SYMLINK_MAPS];
	xfs_daddr_t	d;
	int		byte_cnt;
	int		n;
	xfs_buf_t	*bp;

	vp = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);
	mp = ip->i_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	xfs_ilock(ip, XFS_ILOCK_SHARED);

	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);

	offset = uiop->uio_offset;
	count = uiop->uio_resid;

	/* A negative offset is invalid; a zero-length read is a no-op. */
	if (offset < 0) {
		error = XFS_ERROR(EINVAL);
		goto error_return;
	}
	if (count <= 0) {
		error = 0;
		goto error_return;
	}

	/*
	 * See if the symlink is stored inline.
	 */
	pathlen = (int)ip->i_d.di_size;

	if (ip->i_df.if_flags & XFS_IFINLINE) {
		/*
		 * NOTE(review): pathlen is not clamped to count here;
		 * this relies on uio_read() honoring uiop->uio_resid --
		 * TODO confirm.
		 */
		error = uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
	}
	else {
		/*
		 * Symlink not inline.  Call bmap to get it in.
		 */
		nmaps = SYMLINK_MAPS;

		error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
				  0, NULL, 0, mval, &nmaps, NULL, NULL);

		if (error) {
			goto error_return;
		}

		/* Read each mapped extent and copy it out to the uio. */
		for (n = 0; n < nmaps; n++) {
			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
			bp = xfs_buf_read(mp->m_ddev_targp, d,
				      BTOBB(byte_cnt), 0);
			error = XFS_BUF_GETERROR(bp);
			if (error) {
				xfs_ioerror_alert("xfs_readlink",
					  ip->i_mount, bp, XFS_BUF_ADDR(bp));
				xfs_buf_relse(bp);
				goto error_return;
			}
			/* The last block may hold fewer than byte_cnt bytes. */
			if (pathlen < byte_cnt)
				byte_cnt = pathlen;
			pathlen -= byte_cnt;

			error = uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
			xfs_buf_relse (bp);
		}

	}

error_return:
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}
1039
1040
1041/*
1042 * xfs_fsync
1043 *
1044 * This is called to sync the inode and its data out to disk.
1045 * We need to hold the I/O lock while flushing the data, and
1046 * the inode lock while flushing the inode.  The inode lock CANNOT
1047 * be held while flushing the data, so acquire after we're done
1048 * with that.
1049 */
STATIC int
xfs_fsync(
	bhv_desc_t	*bdp,
	int		flag,
	cred_t		*credp,
	xfs_off_t	start,
	xfs_off_t	stop)
{
	xfs_inode_t	*ip;
	xfs_trans_t	*tp;
	int		error;
	/*
	 * log_flushed is set by the log code when the log force actually
	 * pushed data to the device (so the device cache is already
	 * covered).  changed starts at 1 (assume dirty) and is cleared
	 * only when we prove nothing needed flushing, letting us skip
	 * the explicit device cache flushes at the bottom.
	 */
	int		log_flushed = 0, changed = 1;

	vn_trace_entry(BHV_TO_VNODE(bdp),
			__FUNCTION__, (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);

	/* NOTE(review): stop of -1 is accepted by this assert; presumably
	 * it means "to end of file" -- confirm against callers. */
	ASSERT(start >= 0 && stop >= -1);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return XFS_ERROR(EIO);

	/*
	 * We always need to make sure that the required inode state
	 * is safe on disk.  The vnode might be clean but because
	 * of committed transactions that haven't hit the disk yet.
	 * Likewise, there could be unflushed non-transactional
	 * changes to the inode core that have to go to disk.
	 *
	 * The following code depends on one assumption:  that
	 * any transaction that changes an inode logs the core
	 * because it has to change some field in the inode core
	 * (typically nextents or nblocks).  That assumption
	 * implies that any transactions against an inode will
	 * catch any non-transactional updates.  If inode-altering
	 * transactions exist that violate this assumption, the
	 * code breaks.  Right now, it figures that if the involved
	 * update_* field is clear and the inode is unpinned, the
	 * inode is clean.  Either it's been flushed or it's been
	 * committed and the commit has hit the disk unpinning the inode.
	 * (Note that xfs_inode_item_format() called at commit clears
	 * the update_* fields.)
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);

	/* If we are flushing data then we care about update_size
	 * being set, otherwise we care about update_core
	 */
	if ((flag & FSYNC_DATA) ?
			(ip->i_update_size == 0) :
			(ip->i_update_core == 0)) {
		/*
		 * Timestamps/size haven't changed since last inode
		 * flush or inode transaction commit.  That means
		 * either nothing got written or a transaction
		 * committed which caught the updates.	If the
		 * latter happened and the transaction hasn't
		 * hit the disk yet, the inode will be still
		 * be pinned.  If it is, force the log.
		 */

		xfs_iunlock(ip, XFS_ILOCK_SHARED);

		if (xfs_ipincount(ip)) {
			/* Synchronous force only if the caller will wait. */
			_xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
				      XFS_LOG_FORCE |
				      ((flag & FSYNC_WAIT)
				       ? XFS_LOG_SYNC : 0),
				      &log_flushed);
		} else {
			/*
			 * If the inode is not pinned and nothing
			 * has changed we don't need to flush the
			 * cache.
			 */
			changed = 0;
		}
		error = 0;
	} else	{
		/*
		 * Kick off a transaction to log the inode
		 * core to get the updates.  Make it
		 * sync if FSYNC_WAIT is passed in (which
		 * is done by everybody but specfs).  The
		 * sync transaction will also force the log.
		 */
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
		if ((error = xfs_trans_reserve(tp, 0,
				XFS_FSYNC_TS_LOG_RES(ip->i_mount),
				0, 0, 0)))  {
			xfs_trans_cancel(tp, 0);
			return error;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);

		/*
		 * Note - it's possible that we might have pushed
		 * ourselves out of the way during trans_reserve
		 * which would flush the inode.	 But there's no
		 * guarantee that the inode buffer has actually
		 * gone out yet (it's delwri).	Plus the buffer
		 * could be pinned anyway if it's part of an
		 * inode in another recent transaction.	 So we
		 * play it safe and fire off the transaction anyway.
		 */
		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		if (flag & FSYNC_WAIT)
			xfs_trans_set_sync(tp);
		error = _xfs_trans_commit(tp, 0, NULL, &log_flushed);

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
		/*
		 * If the log write didn't issue an ordered tag we need
		 * to flush the disk cache for the data device now.
		 */
		if (!log_flushed)
			xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);

		/*
		 * If this inode is on the RT dev we need to flush that
		 * cache as well.
		 */
		if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
	}

	return error;
}
1185
1186/*
1187 * This is called by xfs_inactive to free any blocks beyond eof,
1188 * when the link count isn't zero.
1189 */
STATIC int
xfs_inactive_free_eofblocks(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip)
{
	xfs_trans_t	*tp;
	int		error;
	xfs_fileoff_t	end_fsb;
	xfs_fileoff_t	last_fsb;
	xfs_filblks_t	map_len;
	int		nimaps;
	xfs_bmbt_irec_t	imap;

	/*
	 * Figure out if there are any blocks beyond the end
	 * of the file.  If not, then there is nothing to do.
	 */
	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size));
	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
	map_len = last_fsb - end_fsb;
	if (map_len <= 0)
		return 0;

	/*
	 * Probe the post-EOF range with a single-extent bmapi lookup
	 * under the shared ilock.  A hole with no delayed blocks means
	 * nothing is allocated out there and we can return early.
	 */
	nimaps = 1;
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
			  NULL, 0, &imap, &nimaps, NULL, NULL);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (!error && (nimaps != 0) &&
	    (imap.br_startblock != HOLESTARTBLOCK ||
	     ip->i_delayed_blks)) {
		/*
		 * Attach the dquots to the inode up front.
		 */
		if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
			return error;

		/*
		 * There are blocks after the end of file.
		 * Free them up now by truncating the file to
		 * its current size.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);

		/*
		 * Do the xfs_itruncate_start() call before
		 * reserving any log space because
		 * itruncate_start will call into the buffer
		 * cache and we can't
		 * do that within a transaction.
		 */
		xfs_ilock(ip, XFS_IOLOCK_EXCL);
		xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
				    ip->i_d.di_size);

		error = xfs_trans_reserve(tp, 0,
					  XFS_ITRUNCATE_LOG_RES(mp),
					  0, XFS_TRANS_PERM_LOG_RES,
					  XFS_ITRUNCATE_LOG_COUNT);
		if (error) {
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return error;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip,
				XFS_IOLOCK_EXCL |
				XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);

		/*
		 * "Truncate" to the current size, so only the blocks
		 * beyond EOF are released; the visible file contents
		 * are untouched.
		 */
		error = xfs_itruncate_finish(&tp, ip,
					     ip->i_d.di_size,
					     XFS_DATA_FORK,
					     0);
		/*
		 * If we get an error at this point we
		 * simply don't bother truncating the file.
		 */
		if (error) {
			xfs_trans_cancel(tp,
					 (XFS_TRANS_RELEASE_LOG_RES |
					  XFS_TRANS_ABORT));
		} else {
			error = xfs_trans_commit(tp,
						XFS_TRANS_RELEASE_LOG_RES,
						NULL);
		}
		xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	}
	return error;
}
1284
1285/*
1286 * Free a symlink that has blocks associated with it.
1287 */
STATIC int
xfs_inactive_symlink_rmt(
	xfs_inode_t	*ip,
	xfs_trans_t	**tpp)
{
	xfs_buf_t	*bp;
	int		committed;	/* first transaction committed? */
	int		done;		/* bunmapi finished all extents? */
	int		error;
	xfs_fsblock_t	first_block;
	xfs_bmap_free_t	free_list;
	int		i;
	xfs_mount_t	*mp;
	xfs_bmbt_irec_t	mval[SYMLINK_MAPS];
	int		nmaps;
	xfs_trans_t	*ntp;
	int		size;
	xfs_trans_t	*tp;

	tp = *tpp;
	mp = ip->i_mount;
	ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
	/*
	 * We're freeing a symlink that has some
	 * blocks allocated to it.  Free the
	 * blocks here.  We know that we've got
	 * either 1 or 2 extents and that we can
	 * free them all in one bunmapi call.
	 */
	ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		xfs_trans_cancel(tp, 0);
		*tpp = NULL;
		return error;
	}
	/*
	 * Lock the inode, fix the size, and join it to the transaction.
	 * Hold it so in the normal path, we still have it locked for
	 * the second transaction.  In the error paths we need it
	 * held so the cancel won't rele it, see below.
	 */
	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	size = (int)ip->i_d.di_size;
	ip->i_d.di_size = 0;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	/*
	 * Find the block(s) so we can inval and unmap them.
	 */
	done = 0;
	XFS_BMAP_INIT(&free_list, &first_block);
	nmaps = sizeof(mval) / sizeof(mval[0]);
	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
			&free_list, NULL)))
		goto error0;
	/*
	 * Invalidate the block(s).
	 */
	for (i = 0; i < nmaps; i++) {
		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
			XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
		xfs_trans_binval(tp, bp);
	}
	/*
	 * Unmap the dead block(s) to the free_list.
	 */
	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
			&first_block, &free_list, NULL, &done)))
		goto error1;
	ASSERT(done);
	/*
	 * Commit the first transaction.  This logs the EFI and the inode.
	 */
	if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed)))
		goto error1;
	/*
	 * The transaction must have been committed, since there were
	 * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
	 * The new tp has the extent freeing and EFDs.
	 */
	ASSERT(committed);
	/*
	 * The first xact was committed, so add the inode to the new one.
	 * Mark it dirty so it will be logged and moved forward in the log as
	 * part of every commit.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	/*
	 * Get a new, empty transaction to return to our caller.
	 */
	ntp = xfs_trans_dup(tp);
	/*
	 * Commit the transaction containing extent freeing and EFDs.
	 * If we get an error on the commit here or on the reserve below,
	 * we need to unlock the inode since the new transaction doesn't
	 * have the inode attached.
	 */
	error = xfs_trans_commit(tp, 0, NULL);
	tp = ntp;
	if (error) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		goto error0;
	}
	/*
	 * Remove the memory for extent descriptions (just bookkeeping).
	 */
	if (ip->i_df.if_bytes)
		xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
	ASSERT(ip->i_df.if_bytes == 0);
	/*
	 * Put an itruncate log reservation in the new transaction
	 * for our caller.
	 */
	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		goto error0;
	}
	/*
	 * Return with the inode locked but not joined to the transaction.
	 */
	*tpp = tp;
	return 0;

 error1:
	xfs_bmap_cancel(&free_list);
 error0:
	/*
	 * Have to come here with the inode locked and either
	 * (held and in the transaction) or (not in the transaction).
	 * If the inode isn't held then cancel would iput it, but
	 * that's wrong since this is inactive and the vnode ref
	 * count is 0 already.
	 * Cancel won't do anything to the inode if held, but it still
	 * needs to be locked until the cancel is done, if it was
	 * joined to the transaction.
	 */
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	*tpp = NULL;
	return error;

}
1438
1439STATIC int
1440xfs_inactive_symlink_local(
1441	xfs_inode_t	*ip,
1442	xfs_trans_t	**tpp)
1443{
1444	int		error;
1445
1446	ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1447	/*
1448	 * We're freeing a symlink which fit into
1449	 * the inode.  Just free the memory used
1450	 * to hold the old symlink.
1451	 */
1452	error = xfs_trans_reserve(*tpp, 0,
1453				  XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1454				  0, XFS_TRANS_PERM_LOG_RES,
1455				  XFS_ITRUNCATE_LOG_COUNT);
1456
1457	if (error) {
1458		xfs_trans_cancel(*tpp, 0);
1459		*tpp = NULL;
1460		return error;
1461	}
1462	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1463
1464	/*
1465	 * Zero length symlinks _can_ exist.
1466	 */
1467	if (ip->i_df.if_bytes > 0) {
1468		xfs_idata_realloc(ip,
1469				  -(ip->i_df.if_bytes),
1470				  XFS_DATA_FORK);
1471		ASSERT(ip->i_df.if_bytes == 0);
1472	}
1473	return 0;
1474}
1475
1476/*
1477 *
1478 */
STATIC int
xfs_inactive_attrs(
	xfs_inode_t	*ip,
	xfs_trans_t	**tpp)
{
	xfs_trans_t	*tp;
	int		error;
	xfs_mount_t	*mp;

	/*
	 * Tear down the attribute fork during inactivation.  The caller's
	 * transaction is committed (it cannot be reused across
	 * xfs_attr_inactive), and a fresh transaction with an ifree
	 * reservation is handed back in *tpp.  On error *tpp is NULL and
	 * the inode is fully unlocked.
	 */
	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
	tp = *tpp;
	mp = ip->i_mount;
	ASSERT(ip->i_d.di_forkoff != 0);
	/* NOTE(review): commit return value is ignored here -- confirm
	 * whether a failed commit can be tolerated on this path. */
	xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	error = xfs_attr_inactive(ip);
	if (error) {
		*tpp = NULL;
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
		return error; /* goto out */
	}

	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
	error = xfs_trans_reserve(tp, 0,
				  XFS_IFREE_LOG_RES(mp),
				  0, XFS_TRANS_PERM_LOG_RES,
				  XFS_INACTIVE_LOG_COUNT);
	if (error) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		xfs_trans_cancel(tp, 0);
		*tpp = NULL;
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_idestroy_fork(ip, XFS_ATTR_FORK);

	ASSERT(ip->i_d.di_anextents == 0);

	*tpp = tp;
	return 0;
}
1525
1526STATIC int
1527xfs_release(
1528	bhv_desc_t	*bdp)
1529{
1530	xfs_inode_t	*ip;
1531	xfs_vnode_t	*vp;
1532	xfs_mount_t	*mp;
1533	int		error;
1534
1535	vp = BHV_TO_VNODE(bdp);
1536	ip = XFS_BHVTOI(bdp);
1537
1538	if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) {
1539		return 0;
1540	}
1541
1542	/* If this is a read-only mount, don't do this (would generate I/O) */
1543	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1544		return 0;
1545
1546#ifdef HAVE_REFCACHE
1547	/* If we are in the NFS reference cache then don't do this now */
1548	if (ip->i_refcache)
1549		return 0;
1550#endif
1551
1552	mp = ip->i_mount;
1553
1554	if (ip->i_d.di_nlink != 0) {
1555		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1556		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
1557		       ip->i_delayed_blks > 0)) &&
1558		     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1559		    (!(ip->i_d.di_flags &
1560				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1561			if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1562				return error;
1563
1564#ifdef RMC			/* Update linux inode block count after free above */
1565			vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1566				ip->i_d.di_nblocks + ip->i_delayed_blks);
1567#endif
1568		}
1569	}
1570
1571	return 0;
1572}
1573
1574/*
1575 * xfs_inactive
1576 *
1577 * This is called when the vnode reference count for the vnode
1578 * goes to zero.  If the file has been unlinked, then it must
1579 * now be truncated.  Also, we clear all of the read-ahead state
1580 * kept for the inode here since the file is now closed.
1581 */
STATIC int
xfs_inactive(
	bhv_desc_t	*bdp,
	cred_t		*credp)
{
	xfs_inode_t	*ip;
	xfs_vnode_t	*vp;

	xfs_bmap_free_t	free_list;
	xfs_fsblock_t	first_block;
	int		committed;
	xfs_trans_t	*tp;
	xfs_mount_t	*mp;
	int		error;
	int		truncate;

	vp = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
		ASSERT(ip->i_df.if_real_bytes == 0);
		ASSERT(ip->i_df.if_broot_bytes == 0);
		return VN_INACTIVE_CACHE;
	}

	/*
	 * Only do a truncate if it's a regular file with
	 * some actual space in it.  It's OK to look at the
	 * inode's fields without the lock because we're the
	 * only one with a reference to the inode.
	 */
	truncate = ((ip->i_d.di_nlink == 0) &&
            ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0) ||
             (ip->i_delayed_blks > 0)) &&
	    ((ip->i_d.di_mode & S_IFMT) == S_IFREG));

	mp = ip->i_mount;

	/* DMAPI destroy event for files being removed for good. */
	if (ip->i_d.di_nlink == 0 &&
	    DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
		(void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
	}

	error = 0;

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
		goto out;

	if (ip->i_d.di_nlink != 0) {
		/*
		 * Still linked: the only cleanup is freeing speculative
		 * post-EOF allocations on regular files (same policy as
		 * xfs_release, plus the delayed-blocks escape clause).
		 */
		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
                     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
                       ip->i_delayed_blks > 0)) &&
		      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
		     (!(ip->i_d.di_flags &
				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
		      (ip->i_delayed_blks != 0)))) {
			if ((error = xfs_inactive_free_eofblocks(mp, ip)))
				return VN_INACTIVE_CACHE;
#ifdef RMC
			/* Update linux inode block count after free above */
			vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
				ip->i_d.di_nblocks + ip->i_delayed_blks);
#endif
		}
		goto out;
	}

	ASSERT(ip->i_d.di_nlink == 0);

	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
		return VN_INACTIVE_CACHE;

	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
	if (truncate) {
		/*
		 * Do the xfs_itruncate_start() call before
		 * reserving any log space because itruncate_start
		 * will call into the buffer cache and we can't
		 * do that within a transaction.
		 */
		xfs_ilock(ip, XFS_IOLOCK_EXCL);

		xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);

		error = xfs_trans_reserve(tp, 0,
					  XFS_ITRUNCATE_LOG_RES(mp),
					  0, XFS_TRANS_PERM_LOG_RES,
					  XFS_ITRUNCATE_LOG_COUNT);
		if (error) {
			/* Don't call itruncate_cleanup */
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return VN_INACTIVE_CACHE;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);

		/*
		 * normally, we have to run xfs_itruncate_finish sync.
		 * But if filesystem is wsync and we're in the inactive
		 * path, then we know that nlink == 0, and that the
		 * xaction that made nlink == 0 is permanently committed
		 * since xfs_remove runs as a synchronous transaction.
		 */
		error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
				(!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));

		if (error) {
			xfs_trans_cancel(tp,
				XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
			xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
			return VN_INACTIVE_CACHE;
		}
	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {

		/*
		 * If we get an error while cleaning up a
		 * symlink we bail out.
		 */
		error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
			xfs_inactive_symlink_rmt(ip, &tp) :
			xfs_inactive_symlink_local(ip, &tp);

		if (error) {
			ASSERT(tp == NULL);
			return VN_INACTIVE_CACHE;
		}

		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);
	} else {
		/* No data to truncate; just reserve for the inode free. */
		error = xfs_trans_reserve(tp, 0,
					  XFS_IFREE_LOG_RES(mp),
					  0, XFS_TRANS_PERM_LOG_RES,
					  XFS_INACTIVE_LOG_COUNT);
		if (error) {
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			return VN_INACTIVE_CACHE;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);
	}

	/*
	 * If there are attributes associated with the file
	 * then blow them away now.  The code calls a routine
	 * that recursively deconstructs the attribute fork.
	 * We need to just commit the current transaction
	 * because we can't use it for xfs_attr_inactive().
	 */
	if (ip->i_d.di_anextents > 0) {
		error = xfs_inactive_attrs(ip, &tp);
		/*
		 * If we got an error, the transaction is already
		 * cancelled, and the inode is unlocked. Just get out.
		 */
		 if (error)
			 return VN_INACTIVE_CACHE;
	} else if (ip->i_afp) {
		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
	}

	/*
	 * Free the inode.
	 */
	XFS_BMAP_INIT(&free_list, &first_block);
	error = xfs_ifree(tp, ip, &free_list);
	if (error) {
		/*
		 * If we fail to free the inode, shut down.  The cancel
		 * might do that, we need to make sure.  Otherwise the
		 * inode might be lost for a long time or forever.
		 */
		if (!XFS_FORCED_SHUTDOWN(mp)) {
			cmn_err(CE_NOTE,
		"xfs_inactive:	xfs_ifree() returned an error = %d on %s",
				error, mp->m_fsname);
			xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
		}
		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
	} else {
		/*
		 * Credit the quota account(s). The inode is gone.
		 */
		XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

		/*
		 * Just ignore errors at this point.  There is
		 * nothing we can do except to try to keep going.
		 */
		(void) xfs_bmap_finish(&tp,  &free_list, first_block,
				       &committed);
		(void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
	}
	/*
	 * Release the dquots held by inode, if any.
	 */
	XFS_QM_DQDETACH(mp, ip);

	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);

 out:
	return VN_INACTIVE_CACHE;
}
1799
1800
1801/*
1802 * xfs_lookup
1803 */
1804STATIC int
1805xfs_lookup(
1806	bhv_desc_t		*dir_bdp,
1807	vname_t			*dentry,
1808	xfs_vnode_t		**vpp,
1809	int			flags,
1810	xfs_vnode_t		*rdir,
1811	cred_t			*credp)
1812{
1813	xfs_inode_t		*dp, *ip;
1814	xfs_ino_t		e_inum;
1815	int			error;
1816	uint			lock_mode;
1817	xfs_vnode_t		*dir_vp;
1818
1819	dir_vp = BHV_TO_VNODE(dir_bdp);
1820	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1821
1822	dp = XFS_BHVTOI(dir_bdp);
1823
1824	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1825		return XFS_ERROR(EIO);
1826
1827	lock_mode = xfs_ilock_map_shared(dp);
1828	error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1829	if (!error) {
1830		*vpp = XFS_ITOV(ip);
1831		ITRACE(ip);
1832	}
1833	xfs_iunlock_map_shared(dp, lock_mode);
1834	return error;
1835}
1836
1837
1838/*
1839 * xfs_create (create a new file).
1840 */
STATIC int
xfs_create(
	bhv_desc_t		*dir_bdp,
	vname_t			*dentry,
	xfs_vattr_t		*vap,
	xfs_vnode_t		**vpp,
	cred_t			*credp)
{
	char			*name = VNAME(dentry);
	xfs_vnode_t		*dir_vp;
	xfs_inode_t		*dp, *ip;
	xfs_vnode_t	        *vp=NULL;
	xfs_trans_t		*tp;
	xfs_mount_t	        *mp;
	xfs_dev_t		rdev;
	int                     error;
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	/* Tracks whether dp was joined to tp; if not, the error path
	 * must unlock dp itself (trans_cancel won't). */
	boolean_t		dp_joined_to_trans;
	/* Remember whether the DMAPI pre-create event fired, so the
	 * matching POSTCREATE event is sent even on error. */
	int			dm_event_sent = 0;
	uint			cancel_flags;
	int			committed;
	xfs_prid_t		prid;
	struct xfs_dquot	*udqp, *gdqp;
	uint			resblks;
	int			dm_di_mode;
	int			namelen;

	ASSERT(!*vpp);
	dir_vp = BHV_TO_VNODE(dir_bdp);
	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);

	dp = XFS_BHVTOI(dir_bdp);
	mp = dp->i_mount;

	dm_di_mode = vap->va_mode;
	namelen = VNAMELEN(dentry);

	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
				dir_vp, DM_RIGHT_NULL, NULL,
				DM_RIGHT_NULL, name, NULL,
				dm_di_mode, 0, 0);

		if (error)
			return error;
		dm_event_sent = 1;
	}

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/* Return through std_return after this point. */

	udqp = gdqp = NULL;

	/* Project id: inherit from dir, take from vattr, or default. */
	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
		prid = dp->i_d.di_projid;
	else if (vap->va_mask & XFS_AT_PROJID)
		prid = (xfs_prid_t)vap->va_projid;
	else
		prid = (xfs_prid_t)dfltprid;

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = XFS_QM_DQVOPALLOC(mp, dp,
			current_fsuid(credp), current_fsgid(credp), prid,
			XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
	if (error)
		goto std_return;

	ip = NULL;
	dp_joined_to_trans = B_FALSE;

	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
	resblks = XFS_CREATE_SPACE_RES(mp, namelen);
	/*
	 * Initially assume that the file does not exist and
	 * reserve the resources for that case.  If that is not
	 * the case we'll drop the one we have and get a more
	 * appropriate transaction later.
	 */
	error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
	if (error == ENOSPC) {
		/* Retry with no block reservation; canenter check below
		 * will verify the directory has room. */
		resblks = 0;
		error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
				XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
	}
	if (error) {
		cancel_flags = 0;
		/* dp is not locked yet; NULL it so error_return skips
		 * the unlock. */
		dp = NULL;
		goto error_return;
	}

	xfs_ilock(dp, XFS_ILOCK_EXCL);

	XFS_BMAP_INIT(&free_list, &first_block);

	ASSERT(ip == NULL);

	/*
	 * Reserve disk quota and the inode.
	 */
	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
	if (error)
		goto error_return;

	if (resblks == 0 &&
	    (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
		goto error_return;
	rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
	error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
			rdev, credp, prid, resblks > 0,
			&ip, &committed);
	if (error) {
		if (error == ENOSPC)
			goto error_return;
		goto abort_return;
	}
	ITRACE(ip);

	/*
	 * At this point, we've gotten a newly allocated inode.
	 * It is locked (and joined to the transaction).
	 */

	ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));

	/*
	 * Now we join the directory inode to the transaction.
	 * We do not do it earlier because xfs_dir_ialloc
	 * might commit the previous transaction (and release
	 * all the locks).
	 */

	VN_HOLD(dir_vp);
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	dp_joined_to_trans = B_TRUE;

	error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
		&first_block, &free_list,
		resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
	if (error) {
		ASSERT(error != ENOSPC);
		goto abort_return;
	}
	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

	/*
	 * If this is a synchronous mount, make sure that the
	 * create transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
		xfs_trans_set_sync(tp);
	}

	dp->i_gen++;

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);

	/*
	 * xfs_trans_commit normally decrements the vnode ref count
	 * when it unlocks the inode. Since we want to return the
	 * vnode to the caller, we bump the vnode ref count now.
	 */
	IHOLD(ip);
	vp = XFS_ITOV(ip);

	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
	if (error) {
		xfs_bmap_cancel(&free_list);
		goto abort_rele;
	}

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
	if (error) {
		IRELE(ip);
		tp = NULL;
		goto error_return;
	}

	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);

	/*
	 * Propagate the fact that the vnode changed after the
	 * xfs_inode locks have been released.
	 */
	XVOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);

	*vpp = vp;

	/* Fallthrough to std_return with error = 0  */

std_return:
	if ( (*vpp || (error != 0 && dm_event_sent != 0)) &&
			DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
							DM_EVENT_POSTCREATE)) {
		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
			dir_vp, DM_RIGHT_NULL,
			*vpp ? vp:NULL,
			DM_RIGHT_NULL, name, NULL,
			dm_di_mode, error, 0);
	}
	return error;

 abort_return:
	cancel_flags |= XFS_TRANS_ABORT;
	/* FALLTHROUGH */

 error_return:
	if (tp != NULL)
		xfs_trans_cancel(tp, cancel_flags);

	if (!dp_joined_to_trans && (dp != NULL))
		xfs_iunlock(dp, XFS_ILOCK_EXCL);
	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);

	goto std_return;

 abort_rele:
	/*
	 * Wait until after the current transaction is aborted to
	 * release the inode.  This prevents recursive transactions
	 * and deadlocks from xfs_inactive.
	 */
	cancel_flags |= XFS_TRANS_ABORT;
	xfs_trans_cancel(tp, cancel_flags);
	IRELE(ip);

	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);

	goto std_return;
}
2087
#ifdef DEBUG
/*
 * Some counters to see if (and how often) we are hitting some deadlock
 * prevention code paths.
 */

int xfs_rm_locks;	/* entries into xfs_lock_dir_and_entry() */
int xfs_rm_lock_delays;	/* times we slept between retry attempts */
int xfs_rm_attempts;	/* failed trylock attempts on the entry inode */
#endif
2098
2099/*
2100 * The following routine will lock the inodes associated with the
2101 * directory and the named entry in the directory. The locks are
2102 * acquired in increasing inode number.
2103 *
2104 * If the entry is "..", then only the directory is locked. The
2105 * vnode ref count will still include that from the .. entry in
2106 * this case.
2107 *
2108 * There is a deadlock we need to worry about. If the locked directory is
2109 * in the AIL, it might be blocking up the log. The next inode we lock
2110 * could be already locked by another thread waiting for log space (e.g
2111 * a permanent log reservation with a long running transaction (see
2112 * xfs_itruncate_finish)). To solve this, we must check if the directory
2113 * is in the ail and use lock_nowait. If we can't lock, we need to
2114 * drop the inode lock on the directory and try again. xfs_iunlock will
2115 * potentially push the tail if we were holding up the log.
2116 */
STATIC int
xfs_lock_dir_and_entry(
	xfs_inode_t	*dp,
	vname_t		*dentry,
	xfs_inode_t	*ip)	/* inode of entry 'name' */
{
	int		attempts;
	xfs_ino_t	e_inum;
	xfs_inode_t	*ips[2];
	xfs_log_item_t	*lp;

#ifdef DEBUG
	xfs_rm_locks++;
#endif
	attempts = 0;

again:
	xfs_ilock(dp, XFS_ILOCK_EXCL);

	e_inum = ip->i_ino;

	ITRACE(ip);

	/*
	 * We want to lock in increasing inum. Since we've already
	 * acquired the lock on the directory, we may need to release
	 * it if the inum of the entry turns out to be less.
	 */
	if (e_inum > dp->i_ino) {
		/*
		 * We are already in the right order, so just
		 * lock on the inode of the entry.
		 * We need to use nowait if dp is in the AIL.
		 */

		lp = (xfs_log_item_t *)dp->i_itemp;
		if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
			if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
				attempts++;
#ifdef DEBUG
				xfs_rm_attempts++;
#endif

				/*
				 * Unlock dp and try again.
				 * xfs_iunlock will try to push the tail
				 * if the inode is in the AIL.
				 */

				xfs_iunlock(dp, XFS_ILOCK_EXCL);

				/* Back off briefly every fifth retry. */
				if ((attempts % 5) == 0) {
					delay(1); /* Don't just spin the CPU */
#ifdef DEBUG
					xfs_rm_lock_delays++;
#endif
				}
				goto again;
			}
		} else {
			xfs_ilock(ip, XFS_ILOCK_EXCL);
		}
	} else if (e_inum < dp->i_ino) {
		/* Wrong order: drop dp and take both in inum order. */
		xfs_iunlock(dp, XFS_ILOCK_EXCL);

		ips[0] = ip;
		ips[1] = dp;
		xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
	}
	/* else	 e_inum == dp->i_ino */
	/*     This can happen if we're asked to lock /x/..
	 *     the entry is "..", which is also the parent directory.
	 */

	/* Always succeeds; retries loop until both locks are held. */
	return 0;
}
2193
#ifdef DEBUG
/* Retry statistics maintained by xfs_lock_inodes() (debug builds only). */
int xfs_locked_n;		/* calls that locked all inodes without retrying */
int xfs_small_retries;		/* calls that needed fewer than 5 retries */
int xfs_middle_retries;		/* calls that needed fewer than 100 retries */
int xfs_lots_retries;		/* calls that needed 100 or more retries */
int xfs_lock_delays;		/* times we slept (delay) between retry rounds */
#endif
2201
2202/*
2203 * The following routine will lock n inodes in exclusive mode.
2204 * We assume the caller calls us with the inodes in i_ino order.
2205 *
2206 * We need to detect deadlock where an inode that we lock
2207 * is in the AIL and we start waiting for another inode that is locked
2208 * by a thread in a long running transaction (such as truncate). This can
2209 * result in deadlock since the long running trans might need to wait
2210 * for the inode we just locked in order to push the tail and free space
2211 * in the log.
2212 */
void
xfs_lock_inodes(
	xfs_inode_t	**ips,		/* inodes, sorted by caller in i_ino order */
	int		inodes,		/* count of entries in ips; must be >= 2 */
	int		first_locked,	/* nonzero: ips[0] is already held */
	uint		lock_mode)	/* mode passed to xfs_ilock/xfs_ilock_nowait */
{
	int		attempts = 0, i, j, try_lock;
	xfs_log_item_t	*lp;

	ASSERT(ips && (inodes >= 2)); /* we need at least two */

	if (first_locked) {
		/* ips[0] is held by the caller; treat it as AIL-suspect. */
		try_lock = 1;
		i = 1;
	} else {
		try_lock = 0;
		i = 0;
	}

again:
	for (; i < inodes; i++) {
		ASSERT(ips[i]);

		if (i && (ips[i] == ips[i-1]))	/* Already locked */
			continue;

		/*
		 * If try_lock is not set yet, make sure all locked inodes
		 * are not in the AIL.
		 * If any are, set try_lock to be used later.
		 */

		if (!try_lock) {
			for (j = (i - 1); j >= 0 && !try_lock; j--) {
				lp = (xfs_log_item_t *)ips[j]->i_itemp;
				if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
					try_lock++;
				}
			}
		}

		/*
		 * If any of the previous locks we have locked is in the AIL,
		 * we must TRY to get the second and subsequent locks. If
		 * we can't get any, we must release all we have
		 * and try again.
		 */

		if (try_lock) {
			/* try_lock must be 0 if i is 0. */
			/*
			 * try_lock means we have an inode locked
			 * that is in the AIL.
			 */
			ASSERT(i != 0);
			if (!xfs_ilock_nowait(ips[i], lock_mode)) {
				attempts++;

				/*
				 * Unlock all previous guys and try again.
				 * xfs_iunlock will try to push the tail
				 * if the inode is in the AIL.
				 */

				for(j = i - 1; j >= 0; j--) {

					/*
					 * Check to see if we've already
					 * unlocked this one.
					 * Not the first one going back,
					 * and the inode ptr is the same.
					 */
					if ((j != (i - 1)) && ips[j] ==
								ips[j+1])
						continue;

					xfs_iunlock(ips[j], lock_mode);
				}

				if ((attempts % 5) == 0) {
					delay(1); /* Don't just spin the CPU */
#ifdef DEBUG
					xfs_lock_delays++;
#endif
				}
				/* Restart from the first inode. */
				i = 0;
				try_lock = 0;
				goto again;
			}
		} else {
			xfs_ilock(ips[i], lock_mode);
		}
	}

#ifdef DEBUG
	if (attempts) {
		if (attempts < 5) xfs_small_retries++;
		else if (attempts < 100) xfs_middle_retries++;
		else xfs_lots_retries++;
	} else {
		xfs_locked_n++;
	}
#endif
}
2318
#ifdef	DEBUG
/*
 * Record the source line of the most recent error return taken in the
 * remove/rmdir paths, to aid post-mortem debugging.  No-op in non-DEBUG
 * builds.
 */
#define	REMOVE_DEBUG_TRACE(x)	{remove_which_error_return = (x);}
int remove_which_error_return = 0;
#else /* ! DEBUG */
#define	REMOVE_DEBUG_TRACE(x)
#endif	/* ! DEBUG */

/* xfs_remove is exported (not STATIC like its siblings); declare it here. */
extern int xfs_remove(bhv_desc_t *, bhv_desc_t *, vname_t *, cred_t *);
2327/*
2328 * xfs_remove
2329 *
2330 */
2331int
2332xfs_remove(
2333	bhv_desc_t		*dir_bdp,
2334	bhv_desc_t		*vp_bdp,
2335	vname_t			*dentry,
2336	cred_t			*credp)
2337{
2338	xfs_vnode_t		*dir_vp;
2339	xfs_vnode_t		*xvp;
2340	char			*name = VNAME(dentry);
2341	xfs_inode_t             *dp, *ip;
2342	xfs_trans_t             *tp = NULL;
2343	xfs_mount_t		*mp;
2344	int                     error = 0;
2345	xfs_bmap_free_t         free_list;
2346	xfs_fsblock_t           first_block;
2347	int			cancel_flags;
2348	int			committed;
2349	int			dm_di_mode = 0;
2350	int			link_zero;
2351	uint			resblks;
2352	int			namelen;
2353
2354	dir_vp = BHV_TO_VNODE(dir_bdp);
2355	xvp = BHV_TO_VNODE(vp_bdp);
2356
2357	printf("xfs_remove: dvp %p vp %p\n",dir_vp,xvp);
2358	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2359
2360	dp = XFS_BHVTOI(dir_bdp);
2361	mp = dp->i_mount;
2362
2363	if (XFS_FORCED_SHUTDOWN(mp))
2364		return XFS_ERROR(EIO);
2365
2366	namelen = VNAMELEN(dentry);
2367
2368	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2369		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2370					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2371					name, NULL, 0, 0, 0);
2372		if (error)
2373			return error;
2374	}
2375
2376	/* From this point on, return through std_return */
2377	ip = NULL;
2378
2379	/*
2380	 * We need to get a reference to ip before we get our log
2381	 * reservation. The reason for this is that we cannot call
2382	 * xfs_iget for an inode for which we do not have a reference
2383	 * once we've acquired a log reservation. This is because the
2384	 * inode we are trying to get might be in xfs_inactive going
2385	 * for a log reservation. Since we'll have to wait for the
2386	 * inactive code to complete before returning from xfs_iget,
2387	 * we need to make sure that we don't have log space reserved
2388	 * when we call xfs_iget.  Instead we get an unlocked reference
2389	 * to the inode before getting our log reservation.
2390	 */
2391#ifdef RMC
2392	error = xfs_get_dir_entry(dentry, &ip);
2393#endif
2394	/* FreeBSD has already done the lookup */
2395	ip = xvp->v_inode;
2396	VN_HOLD(xvp);
2397
2398	if (error) {
2399		REMOVE_DEBUG_TRACE(__LINE__);
2400		goto std_return;
2401	}
2402
2403	dm_di_mode = ip->i_d.di_mode;
2404
2405	vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2406
2407	ITRACE(ip);
2408
2409	error = XFS_QM_DQATTACH(mp, dp, 0);
2410	if (!error && dp != ip)
2411		error = XFS_QM_DQATTACH(mp, ip, 0);
2412	if (error) {
2413		REMOVE_DEBUG_TRACE(__LINE__);
2414		IRELE(ip);
2415		goto std_return;
2416	}
2417
2418	tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2419	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2420	/*
2421	 * We try to get the real space reservation first,
2422	 * allowing for directory btree deletion(s) implying
2423	 * possible bmap insert(s).  If we can't get the space
2424	 * reservation then we use 0 instead, and avoid the bmap
2425	 * btree insert(s) in the directory code by, if the bmap
2426	 * insert tries to happen, instead trimming the LAST
2427	 * block from the directory.
2428	 */
2429	resblks = XFS_REMOVE_SPACE_RES(mp);
2430	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2431			XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2432	if (error == ENOSPC) {
2433		resblks = 0;
2434		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2435				XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2436	}
2437	if (error) {
2438		ASSERT(error != ENOSPC);
2439		REMOVE_DEBUG_TRACE(__LINE__);
2440		xfs_trans_cancel(tp, 0);
2441		IRELE(ip);
2442		return error;
2443	}
2444
2445	error = xfs_lock_dir_and_entry(dp, dentry, ip);
2446	if (error) {
2447		REMOVE_DEBUG_TRACE(__LINE__);
2448		xfs_trans_cancel(tp, cancel_flags);
2449		IRELE(ip);
2450		goto std_return;
2451	}
2452
2453	/*
2454	 * At this point, we've gotten both the directory and the entry
2455	 * inodes locked.
2456	 */
2457	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2458	if (dp != ip) {
2459		/*
2460		 * Increment vnode ref count only in this case since
2461		 * there's an extra vnode reference in the case where
2462		 * dp == ip.
2463		 */
2464		IHOLD(dp);
2465		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2466	}
2467
2468	/*
2469	 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2470	 */
2471	XFS_BMAP_INIT(&free_list, &first_block);
2472	error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
2473		&first_block, &free_list, 0);
2474	if (error) {
2475		ASSERT(error != ENOENT);
2476		REMOVE_DEBUG_TRACE(__LINE__);
2477		goto error1;
2478	}
2479	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2480
2481	dp->i_gen++;
2482	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2483
2484	error = xfs_droplink(tp, ip);
2485	if (error) {
2486		REMOVE_DEBUG_TRACE(__LINE__);
2487		goto error1;
2488	}
2489
2490	/* Determine if this is the last link while
2491	 * we are in the transaction.
2492	 */
2493	link_zero = (ip)->i_d.di_nlink==0;
2494
2495	/*
2496	 * Take an extra ref on the inode so that it doesn't
2497	 * go to xfs_inactive() from within the commit.
2498	 */
2499	IHOLD(ip);
2500
2501	/*
2502	 * If this is a synchronous mount, make sure that the
2503	 * remove transaction goes to disk before returning to
2504	 * the user.
2505	 */
2506	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2507		xfs_trans_set_sync(tp);
2508	}
2509
2510	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2511	if (error) {
2512		REMOVE_DEBUG_TRACE(__LINE__);
2513		goto error_rele;
2514	}
2515
2516	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2517	if (error) {
2518		IRELE(ip);
2519		goto std_return;
2520	}
2521
2522	/*
2523	 * Before we drop our extra reference to the inode, purge it
2524	 * from the refcache if it is there.  By waiting until afterwards
2525	 * to do the IRELE, we ensure that we won't go inactive in the
2526	 * xfs_refcache_purge_ip routine (although that would be OK).
2527	 */
2528	xfs_refcache_purge_ip(ip);
2529
2530	vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2531
2532	/*
2533	 * Let interposed file systems know about removed links.
2534	 */
2535	XVOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
2536
2537	IRELE(ip);
2538
2539/*	Fall through to std_return with error = 0 */
2540 std_return:
2541	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2542						DM_EVENT_POSTREMOVE)) {
2543		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2544				dir_vp, DM_RIGHT_NULL,
2545				NULL, DM_RIGHT_NULL,
2546				name, NULL, dm_di_mode, error, 0);
2547	}
2548	return error;
2549
2550 error1:
2551	xfs_bmap_cancel(&free_list);
2552	cancel_flags |= XFS_TRANS_ABORT;
2553	xfs_trans_cancel(tp, cancel_flags);
2554	goto std_return;
2555
2556 error_rele:
2557	/*
2558	 * In this case make sure to not release the inode until after
2559	 * the current transaction is aborted.  Releasing it beforehand
2560	 * can cause us to go to xfs_inactive and start a recursive
2561	 * transaction which can easily deadlock with the current one.
2562	 */
2563	xfs_bmap_cancel(&free_list);
2564	cancel_flags |= XFS_TRANS_ABORT;
2565	xfs_trans_cancel(tp, cancel_flags);
2566
2567	/*
2568	 * Before we drop our extra reference to the inode, purge it
2569	 * from the refcache if it is there.  By waiting until afterwards
2570	 * to do the IRELE, we ensure that we won't go inactive in the
2571	 * xfs_refcache_purge_ip routine (although that would be OK).
2572	 */
2573	xfs_refcache_purge_ip(ip);
2574
2575	IRELE(ip);
2576
2577	goto std_return;
2578}
2579
2580
2581/*
2582 * xfs_link
2583 *
2584 */
STATIC int
xfs_link(
	bhv_desc_t		*target_dir_bdp,	/* directory to link into */
	xfs_vnode_t		*src_vp,		/* existing file being linked */
	vname_t			*dentry,		/* name for the new link */
	cred_t			*credp)
{
	xfs_inode_t		*tdp, *sip;
	xfs_trans_t		*tp;
	xfs_mount_t		*mp;
	xfs_inode_t		*ips[2];
	int			error;
	xfs_bmap_free_t         free_list;
	xfs_fsblock_t           first_block;
	int			cancel_flags;
	int			committed;
	xfs_vnode_t		*target_dir_vp;
	int			resblks;
	char			*target_name = VNAME(dentry);
	int			target_namelen;

	target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
	vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
	vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);

	target_namelen = VNAMELEN(dentry);
	/* Hard links to directories are not permitted. */
	if (VN_ISDIR(src_vp))
		return XFS_ERROR(EPERM);

	sip = xfs_vtoi(src_vp);
	tdp = XFS_BHVTOI(target_dir_bdp);
	mp = tdp->i_mount;
	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/* Give DMAPI a chance to veto the link before we start. */
	if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
		error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
					target_dir_vp, DM_RIGHT_NULL,
					src_vp, DM_RIGHT_NULL,
					target_name, NULL, 0, 0, 0);
		if (error)
			return error;
	}

	/* Return through std_return after this point. */

	error = XFS_QM_DQATTACH(mp, sip, 0);
	if (!error && sip != tdp)
		error = XFS_QM_DQATTACH(mp, tdp, 0);
	if (error)
		goto std_return;

	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
	/* Fall back to a zero-block reservation if the fs is short on space. */
	resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
	error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
	if (error == ENOSPC) {
		resblks = 0;
		error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
				XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
	}
	if (error) {
		/* Reservation failed: nothing to release on cancel. */
		cancel_flags = 0;
		goto error_return;
	}

	/* Lock both inodes in ascending-inum order to avoid deadlock. */
	if (sip->i_ino < tdp->i_ino) {
		ips[0] = sip;
		ips[1] = tdp;
	} else {
		ips[0] = tdp;
		ips[1] = sip;
	}

	xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);

	/*
	 * Increment vnode ref counts since xfs_trans_commit &
	 * xfs_trans_cancel will both unlock the inodes and
	 * decrement the associated ref counts.
	 */
	VN_HOLD(src_vp);
	VN_HOLD(target_dir_vp);
	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);

	/*
	 * If the source has too many links, we can't make any more to it.
	 */
	if (sip->i_d.di_nlink >= XFS_MAXLINK) {
		error = XFS_ERROR(EMLINK);
		goto error_return;
	}

	/*
	 * If we are using project inheritance, we only allow hard link
	 * creation in our tree when the project IDs are the same; else
	 * the tree quota mechanism could be circumvented.
	 */
	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
		     (tdp->i_d.di_projid != sip->i_d.di_projid))) {
		error = XFS_ERROR(EPERM);
		goto error_return;
	}

	/* With no block reservation, verify the entry will fit first. */
	if (resblks == 0 &&
	    (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
			target_namelen)))
		goto error_return;

	XFS_BMAP_INIT(&free_list, &first_block);

	error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
				   sip->i_ino, &first_block, &free_list,
				   resblks);
	if (error)
		goto abort_return;
	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	/* Bump the directory's in-core generation so readers notice. */
	tdp->i_gen++;
	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);

	error = xfs_bumplink(tp, sip);
	if (error) {
		goto abort_return;
	}

	/*
	 * If this is a synchronous mount, make sure that the
	 * link transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
		xfs_trans_set_sync(tp);
	}

	error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
	if (error) {
		xfs_bmap_cancel(&free_list);
		goto abort_return;
	}

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
	if (error) {
		goto std_return;
	}

	/* Fall through to std_return with error = 0. */
std_return:
	if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
						DM_EVENT_POSTLINK)) {
		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
				target_dir_vp, DM_RIGHT_NULL,
				src_vp, DM_RIGHT_NULL,
				target_name, NULL, 0, error, 0);
	}
	return error;

 abort_return:
	cancel_flags |= XFS_TRANS_ABORT;
	/* FALLTHROUGH */

 error_return:
	xfs_trans_cancel(tp, cancel_flags);
	goto std_return;
}
2751/*
2752 * xfs_mkdir
2753 *
2754 */
STATIC int
xfs_mkdir(
	bhv_desc_t		*dir_bdp,	/* parent directory */
	vname_t			*dentry,	/* name of directory to create */
	xfs_vattr_t		*vap,		/* attributes (mode, projid) */
	xfs_vnode_t		**vpp,		/* out: vnode of the new dir */
	cred_t			*credp)
{
	char			*dir_name = VNAME(dentry);
	xfs_inode_t             *dp;
	xfs_inode_t		*cdp;	/* inode of created dir */
	xfs_vnode_t		*cvp;	/* vnode of created dir */
	xfs_trans_t		*tp;
	xfs_mount_t		*mp;
	int			cancel_flags;
	int			error;
	int			committed;
	xfs_bmap_free_t         free_list;
	xfs_fsblock_t           first_block;
	xfs_vnode_t		*dir_vp;
	boolean_t		dp_joined_to_trans;	/* dp handed to trans? */
	boolean_t		created = B_FALSE;
	int			dm_event_sent = 0;
	xfs_prid_t		prid;
	struct xfs_dquot	*udqp, *gdqp;
	uint			resblks;
	int			dm_di_mode;
	int			dir_namelen;

	dir_vp = BHV_TO_VNODE(dir_bdp);
	dp = XFS_BHVTOI(dir_bdp);
	mp = dp->i_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	dir_namelen = VNAMELEN(dentry);

	tp = NULL;
	dp_joined_to_trans = B_FALSE;
	dm_di_mode = vap->va_mode;

	/* Give DMAPI a chance to veto the create before we start. */
	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
					dir_vp, DM_RIGHT_NULL, NULL,
					DM_RIGHT_NULL, dir_name, NULL,
					dm_di_mode, 0, 0);
		if (error)
			return error;
		dm_event_sent = 1;
	}

	/* Return through std_return after this point. */

	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);

	mp = dp->i_mount;
	udqp = gdqp = NULL;

	/* Pick the project id: inherited, caller-supplied, or default. */
	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
		prid = dp->i_d.di_projid;
	else if (vap->va_mask & XFS_AT_PROJID)
		prid = (xfs_prid_t)vap->va_projid;
	else
		prid = (xfs_prid_t)dfltprid;

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = XFS_QM_DQVOPALLOC(mp, dp,
			current_fsuid(credp), current_fsgid(credp), prid,
			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
	if (error)
		goto std_return;

	tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
	/* Fall back to a zero-block reservation if the fs is short on space. */
	resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
	error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
				  XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
	if (error == ENOSPC) {
		resblks = 0;
		error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
					  XFS_TRANS_PERM_LOG_RES,
					  XFS_MKDIR_LOG_COUNT);
	}
	if (error) {
		cancel_flags = 0;
		/* dp is not locked yet; clear it so error_return won't unlock. */
		dp = NULL;
		goto error_return;
	}

	xfs_ilock(dp, XFS_ILOCK_EXCL);

	/*
	 * Check for directory link count overflow.
	 */
	if (dp->i_d.di_nlink >= XFS_MAXLINK) {
		error = XFS_ERROR(EMLINK);
		goto error_return;
	}

	/*
	 * Reserve disk quota and the inode.
	 */
	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
	if (error)
		goto error_return;

	/* With no block reservation, verify the entry will fit first. */
	if (resblks == 0 &&
	    (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
		goto error_return;
	/*
	 * create the directory inode.
	 */
	error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
			0, credp, prid, resblks > 0,
		&cdp, NULL);
	if (error) {
		if (error == ENOSPC)
			goto error_return;
		goto abort_return;
	}
	ITRACE(cdp);

	/*
	 * Now we add the directory inode to the transaction.
	 * We waited until now since xfs_dir_ialloc might start
	 * a new transaction.  Had we joined the transaction
	 * earlier, the locks might have gotten released.
	 */
	VN_HOLD(dir_vp);
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	dp_joined_to_trans = B_TRUE;

	XFS_BMAP_INIT(&free_list, &first_block);

	error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
			cdp->i_ino, &first_block, &free_list,
			resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
	if (error) {
		ASSERT(error != ENOSPC);
		goto error1;
	}
	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	/*
	 * Bump the in memory version number of the parent directory
	 * so that other processes accessing it will recognize that
	 * the directory has changed.
	 */
	dp->i_gen++;

	/* Initialize "." and ".." in the new directory. */
	error = XFS_DIR_INIT(mp, tp, cdp, dp);
	if (error) {
		goto error2;
	}

	cdp->i_gen = 1;
	/* The new child's ".." adds a link to the parent. */
	error = xfs_bumplink(tp, dp);
	if (error) {
		goto error2;
	}

	cvp = XFS_ITOV(cdp);

	created = B_TRUE;

	*vpp = cvp;
	IHOLD(cdp);

	/*
	 * Attach the dquots to the new inode and modify the icount incore.
	 */
	XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);

	/*
	 * If this is a synchronous mount, make sure that the
	 * mkdir transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
		xfs_trans_set_sync(tp);
	}

	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
	if (error) {
		IRELE(cdp);
		goto error2;
	}

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);
	if (error) {
		IRELE(cdp);
	}

	/* Fall through to std_return with error = 0 or errno from
	 * xfs_trans_commit. */

std_return:
	if ( (created || (error != 0 && dm_event_sent != 0)) &&
			DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
						DM_EVENT_POSTCREATE)) {
		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
					dir_vp, DM_RIGHT_NULL,
					created ? XFS_ITOV(cdp):NULL,
					DM_RIGHT_NULL,
					dir_name, NULL,
					dm_di_mode, error, 0);
	}
	return error;

 error2:
 error1:
	xfs_bmap_cancel(&free_list);
 abort_return:
	cancel_flags |= XFS_TRANS_ABORT;
 error_return:
	xfs_trans_cancel(tp, cancel_flags);
	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);

	/* Only unlock dp if we locked it and trans_cancel won't do it. */
	if (!dp_joined_to_trans && (dp != NULL)) {
		xfs_iunlock(dp, XFS_ILOCK_EXCL);
	}

	goto std_return;
}
2985
2986
2987/*
2988 * xfs_rmdir
2989 *
2990 */
STATIC int
xfs_rmdir(
	bhv_desc_t		*dir_bdp,	/* parent directory */
	vname_t			*dentry,	/* name of directory to remove */
	cred_t			*credp)
{
	char			*name = VNAME(dentry);
	xfs_inode_t             *dp;
	xfs_inode_t             *cdp;   /* child directory */
	xfs_trans_t             *tp;
	xfs_mount_t		*mp;
	int                     error;
	xfs_bmap_free_t         free_list;
	xfs_fsblock_t           first_block;
	int			cancel_flags;
	int			committed;
	xfs_vnode_t		*dir_vp;
	int			dm_di_mode = 0;
	int			last_cdp_link;	/* child's link count hit zero */
	int			namelen;
	uint			resblks;

	dir_vp = BHV_TO_VNODE(dir_bdp);
	dp = XFS_BHVTOI(dir_bdp);
	mp = dp->i_mount;

	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);

	if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
		return XFS_ERROR(EIO);
	namelen = VNAMELEN(dentry);

	/* Give DMAPI a chance to veto the remove before we start. */
	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
					dir_vp, DM_RIGHT_NULL,
					NULL, DM_RIGHT_NULL,
					name, NULL, 0, 0, 0);
		if (error)
			return XFS_ERROR(error);
	}

	/* Return through std_return after this point. */

	cdp = NULL;

	/*
	 * We need to get a reference to cdp before we get our log
	 * reservation.  The reason for this is that we cannot call
	 * xfs_iget for an inode for which we do not have a reference
	 * once we've acquired a log reservation.  This is because the
	 * inode we are trying to get might be in xfs_inactive going
	 * for a log reservation.  Since we'll have to wait for the
	 * inactive code to complete before returning from xfs_iget,
	 * we need to make sure that we don't have log space reserved
	 * when we call xfs_iget.  Instead we get an unlocked reference
	 * to the inode before getting our log reservation.
	 */
	error = xfs_get_dir_entry(dentry, &cdp);
	if (error) {
		REMOVE_DEBUG_TRACE(__LINE__);
		goto std_return;
	}
	mp = dp->i_mount;
	dm_di_mode = cdp->i_d.di_mode;

	/*
	 * Get the dquots for the inodes.
	 */
	error = XFS_QM_DQATTACH(mp, dp, 0);
	if (!error && dp != cdp)
		error = XFS_QM_DQATTACH(mp, cdp, 0);
	if (error) {
		IRELE(cdp);
		REMOVE_DEBUG_TRACE(__LINE__);
		goto std_return;
	}

	tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
	/*
	 * We try to get the real space reservation first,
	 * allowing for directory btree deletion(s) implying
	 * possible bmap insert(s).  If we can't get the space
	 * reservation then we use 0 instead, and avoid the bmap
	 * btree insert(s) in the directory code by, if the bmap
	 * insert tries to happen, instead trimming the LAST
	 * block from the directory.
	 */
	resblks = XFS_REMOVE_SPACE_RES(mp);
	/*
	 * NOTE(review): this uses XFS_DEFAULT_LOG_COUNT where xfs_remove
	 * uses XFS_REMOVE_LOG_COUNT for the same reservation -- confirm
	 * the difference is intentional.
	 */
	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
	if (error == ENOSPC) {
		resblks = 0;
		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
				XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
	}
	if (error) {
		ASSERT(error != ENOSPC);
		cancel_flags = 0;
		IRELE(cdp);
		goto error_return;
	}
	XFS_BMAP_INIT(&free_list, &first_block);

	/*
	 * Now lock the child directory inode and the parent directory
	 * inode in the proper order.  This will take care of validating
	 * that the directory entry for the child directory inode has
	 * not changed while we were obtaining a log reservation.
	 */
	error = xfs_lock_dir_and_entry(dp, dentry, cdp);
	if (error) {
		xfs_trans_cancel(tp, cancel_flags);
		IRELE(cdp);
		goto std_return;
	}

	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	if (dp != cdp) {
		/*
		 * Only increment the parent directory vnode count if
		 * we didn't bump it in looking up cdp.  The only time
		 * we don't bump it is when we're looking up ".".
		 */
		VN_HOLD(dir_vp);
	}

	ITRACE(cdp);
	xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);

	/* An empty directory has exactly 2 links: "." and the parent's. */
	ASSERT(cdp->i_d.di_nlink >= 2);
	if (cdp->i_d.di_nlink != 2) {
		error = XFS_ERROR(ENOTEMPTY);
		goto error_return;
	}
	if (!XFS_DIR_ISEMPTY(mp, cdp)) {
		error = XFS_ERROR(ENOTEMPTY);
		goto error_return;
	}

	error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
		&first_block, &free_list, resblks);
	if (error) {
		goto error1;
	}

	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	/*
	 * Bump the in memory generation count on the parent
	 * directory so that other can know that it has changed.
	 */
	dp->i_gen++;

	/*
	 * Drop the link from cdp's "..".
	 */
	error = xfs_droplink(tp, dp);
	if (error) {
		goto error1;
	}

	/*
	 * Drop the link from dp to cdp.
	 */
	error = xfs_droplink(tp, cdp);
	if (error) {
		goto error1;
	}

	/*
	 * Drop the "." link from cdp to self.
	 */
	error = xfs_droplink(tp, cdp);
	if (error) {
		goto error1;
	}

	/* Determine these before committing transaction */
	last_cdp_link = (cdp)->i_d.di_nlink==0;

	/*
	 * Take an extra ref on the child vnode so that it
	 * does not go to xfs_inactive() from within the commit.
	 */
	IHOLD(cdp);

	/*
	 * If this is a synchronous mount, make sure that the
	 * rmdir transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
		xfs_trans_set_sync(tp);
	}

	error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
	if (error) {
		xfs_bmap_cancel(&free_list);
		xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
				 XFS_TRANS_ABORT));
		IRELE(cdp);
		goto std_return;
	}

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
	if (error) {
		IRELE(cdp);
		goto std_return;
	}


	/*
	 * Let interposed file systems know about removed links.
	 */
	XVOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);

	IRELE(cdp);

	/* Fall through to std_return with error = 0 or the errno
	 * from xfs_trans_commit. */
 std_return:
	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
					dir_vp, DM_RIGHT_NULL,
					NULL, DM_RIGHT_NULL,
					name, NULL, dm_di_mode,
					error, 0);
	}
	return error;

 error1:
	xfs_bmap_cancel(&free_list);
	cancel_flags |= XFS_TRANS_ABORT;
	/* FALLTHROUGH */

 error_return:
	xfs_trans_cancel(tp, cancel_flags);
	goto std_return;
}
3231
3232
3233/*
3234 * xfs_readdir
3235 *
3236 * Read dp's entries starting at uiop->uio_offset and translate them into
3237 * bufsize bytes worth of struct dirents starting at bufbase.
3238 */
3239STATIC int
3240xfs_readdir(
3241	bhv_desc_t	*dir_bdp,
3242	uio_t		*uiop,
3243	cred_t		*credp,
3244	int		*eofp)
3245{
3246	xfs_inode_t	*dp;
3247	xfs_trans_t	*tp = NULL;
3248	int		error = 0;
3249	uint		lock_mode;
3250
3251	vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3252					       (inst_t *)__return_address);
3253	dp = XFS_BHVTOI(dir_bdp);
3254
3255	if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
3256		return XFS_ERROR(EIO);
3257	}
3258
3259	lock_mode = xfs_ilock_map_shared(dp);
3260	error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
3261	xfs_iunlock_map_shared(dp, lock_mode);
3262	return error;
3263}
3264
3265
3266/*
3267 * xfs_symlink
3268 *
3269 */
3270STATIC int
3271xfs_symlink(
3272	bhv_desc_t		*dir_bdp,
3273	vname_t			*dentry,
3274	xfs_vattr_t		*vap,
3275	char			*target_path,
3276	xfs_vnode_t		**vpp,
3277	cred_t			*credp)
3278{
3279	xfs_trans_t		*tp;
3280	xfs_mount_t		*mp;
3281	xfs_inode_t		*dp;
3282	xfs_inode_t		*ip;
3283	int			error;
3284	int			pathlen;
3285	xfs_bmap_free_t		free_list;
3286	xfs_fsblock_t		first_block;
3287	boolean_t		dp_joined_to_trans;
3288	xfs_vnode_t		*dir_vp;
3289	uint			cancel_flags;
3290	int			committed;
3291	xfs_fileoff_t		first_fsb;
3292	xfs_filblks_t		fs_blocks;
3293	int			nmaps;
3294	xfs_bmbt_irec_t		mval[SYMLINK_MAPS];
3295	xfs_daddr_t		d;
3296	char			*cur_chunk;
3297	int			byte_cnt;
3298	int			n;
3299	xfs_buf_t		*bp;
3300	xfs_prid_t		prid;
3301	struct xfs_dquot	*udqp, *gdqp;
3302	uint			resblks;
3303	char			*link_name = VNAME(dentry);
3304	int			link_namelen;
3305	struct	thread 		*current = curthread;
3306
3307	*vpp = NULL;
3308	dir_vp = BHV_TO_VNODE(dir_bdp);
3309	dp = XFS_BHVTOI(dir_bdp);
3310	dp_joined_to_trans = B_FALSE;
3311	error = 0;
3312	ip = NULL;
3313	tp = NULL;
3314
3315	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3316
3317	mp = dp->i_mount;
3318
3319	if (XFS_FORCED_SHUTDOWN(mp))
3320		return XFS_ERROR(EIO);
3321
3322	link_namelen = VNAMELEN(dentry);
3323
3324	/*
3325	 * Check component lengths of the target path name.
3326	 */
3327	pathlen = strlen(target_path);
3328	if (pathlen >= MAXPATHLEN)      /* total string too long */
3329		return XFS_ERROR(ENAMETOOLONG);
3330	if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3331		int len, total;
3332		char *path;
3333
3334		for(total = 0, path = target_path; total < pathlen;) {
3335			/*
3336			 * Skip any slashes.
3337			 */
3338			while(*path == '/') {
3339				total++;
3340				path++;
3341			}
3342
3343			/*
3344			 * Count up to the next slash or end of path.
3345			 * Error out if the component is bigger than MAXNAMELEN.
3346			 */
3347			for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3348				if (++len >= MAXNAMELEN) {
3349					error = ENAMETOOLONG;
3350					return error;
3351				}
3352			}
3353		}
3354	}
3355
3356	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
3357		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3358					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3359					link_name, target_path, 0, 0, 0);
3360		if (error)
3361			return error;
3362	}
3363
3364	/* Return through std_return after this point. */
3365
3366	udqp = gdqp = NULL;
3367
3368#ifdef XXXKAN
3369	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3370		prid = dp->i_d.di_projid;
3371	else if (vap->va_mask & XFS_AT_PROJID)
3372		prid = (xfs_prid_t)vap->va_projid;
3373	else
3374#endif
3375		prid = (xfs_prid_t)dfltprid;
3376
3377	/*
3378	 * Make sure that we have allocated dquot(s) on disk.
3379	 */
3380	error = XFS_QM_DQVOPALLOC(mp, dp,
3381				  current->td_ucred->cr_uid,
3382				  current->td_ucred->cr_groups[0],
3383				  prid,
3384				  XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3385	if (error)
3386		goto std_return;
3387
3388	tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3389	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3390	/*
3391	 * The symlink will fit into the inode data fork?
3392	 * There can't be any attributes so we get the whole variable part.
3393	 */
3394	if (pathlen <= XFS_LITINO(mp))
3395		fs_blocks = 0;
3396	else
3397		fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3398	resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3399	error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3400			XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3401	if (error == ENOSPC && fs_blocks == 0) {
3402		resblks = 0;
3403		error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3404				XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3405	}
3406	if (error) {
3407		cancel_flags = 0;
3408		dp = NULL;
3409		goto error_return;
3410	}
3411
3412	xfs_ilock(dp, XFS_ILOCK_EXCL);
3413
3414	/*
3415	 * Check whether the directory allows new symlinks or not.
3416	 */
3417	if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3418		error = XFS_ERROR(EPERM);
3419		goto error_return;
3420	}
3421
3422	/*
3423	 * Reserve disk quota : blocks and inode.
3424	 */
3425	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3426	if (error)
3427		goto error_return;
3428
3429	/*
3430	 * Check for ability to enter directory entry, if no space reserved.
3431	 */
3432	if (resblks == 0 &&
3433	    (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
3434		goto error_return;
3435	/*
3436	 * Initialize the bmap freelist prior to calling either
3437	 * bmapi or the directory create code.
3438	 */
3439	XFS_BMAP_INIT(&free_list, &first_block);
3440
3441	/*
3442	 * Allocate an inode for the symlink.
3443	 */
3444	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3445			       1, 0, credp, prid, resblks > 0, &ip, NULL);
3446	if (error) {
3447		if (error == ENOSPC)
3448			goto error_return;
3449		goto error1;
3450	}
3451	ITRACE(ip);
3452
3453	VN_HOLD(dir_vp);
3454	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3455	dp_joined_to_trans = B_TRUE;
3456
3457	/*
3458	 * Also attach the dquot(s) to it, if applicable.
3459	 */
3460	XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3461
3462	if (resblks)
3463		resblks -= XFS_IALLOC_SPACE_RES(mp);
3464	/*
3465	 * If the symlink will fit into the inode, write it inline.
3466	 */
3467	if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3468		xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3469		memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3470		ip->i_d.di_size = pathlen;
3471
3472		/*
3473		 * The inode was initially created in extent format.
3474		 */
3475		ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3476		ip->i_df.if_flags |= XFS_IFINLINE;
3477
3478		ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3479		xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3480
3481	} else {
3482		first_fsb = 0;
3483		nmaps = SYMLINK_MAPS;
3484
3485		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3486				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3487				  &first_block, resblks, mval, &nmaps,
3488				  &free_list, NULL);
3489		if (error) {
3490			goto error1;
3491		}
3492
3493		if (resblks)
3494			resblks -= fs_blocks;
3495		ip->i_d.di_size = pathlen;
3496		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3497
3498		cur_chunk = target_path;
3499		for (n = 0; n < nmaps; n++) {
3500			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3501			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3502			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3503					       BTOBB(byte_cnt), 0);
3504			ASSERT(bp && !XFS_BUF_GETERROR(bp));
3505			if (pathlen < byte_cnt) {
3506				byte_cnt = pathlen;
3507			}
3508			pathlen -= byte_cnt;
3509
3510			memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3511			cur_chunk += byte_cnt;
3512
3513			xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3514		}
3515	}
3516
3517	/*
3518	 * Create the directory entry for the symlink.
3519	 */
3520	error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
3521			ip->i_ino, &first_block, &free_list, resblks);
3522	if (error) {
3523		goto error1;
3524	}
3525	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3526	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3527
3528	/*
3529	 * Bump the in memory version number of the parent directory
3530	 * so that other processes accessing it will recognize that
3531	 * the directory has changed.
3532	 */
3533	dp->i_gen++;
3534
3535	/*
3536	 * If this is a synchronous mount, make sure that the
3537	 * symlink transaction goes to disk before returning to
3538	 * the user.
3539	 */
3540	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3541		xfs_trans_set_sync(tp);
3542	}
3543
3544	/*
3545	 * xfs_trans_commit normally decrements the vnode ref count
3546	 * when it unlocks the inode. Since we want to return the
3547	 * vnode to the caller, we bump the vnode ref count now.
3548	 */
3549	IHOLD(ip);
3550
3551	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
3552	if (error) {
3553		goto error2;
3554	}
3555	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3556	XFS_QM_DQRELE(mp, udqp);
3557	XFS_QM_DQRELE(mp, gdqp);
3558
3559	/* Fall through to std_return with error = 0 or errno from
3560	 * xfs_trans_commit	*/
3561std_return:
3562	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3563			     DM_EVENT_POSTSYMLINK)) {
3564		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3565					dir_vp, DM_RIGHT_NULL,
3566					error ? NULL : XFS_ITOV(ip),
3567					DM_RIGHT_NULL, link_name, target_path,
3568					0, error, 0);
3569	}
3570
3571	if (!error) {
3572		xfs_vnode_t *vp;
3573
3574		ASSERT(ip);
3575		vp = XFS_ITOV(ip);
3576		*vpp = vp;
3577	}
3578	return error;
3579
3580 error2:
3581	IRELE(ip);
3582 error1:
3583	xfs_bmap_cancel(&free_list);
3584	cancel_flags |= XFS_TRANS_ABORT;
3585 error_return:
3586	xfs_trans_cancel(tp, cancel_flags);
3587	XFS_QM_DQRELE(mp, udqp);
3588	XFS_QM_DQRELE(mp, gdqp);
3589
3590	if (!dp_joined_to_trans && (dp != NULL)) {
3591		xfs_iunlock(dp, XFS_ILOCK_EXCL);
3592	}
3593
3594	goto std_return;
3595}
3596
3597
3598/*
3599 * xfs_fid2
3600 *
3601 * A fid routine that takes a pointer to a previously allocated
3602 * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3603 */
3604STATIC int
3605xfs_fid2(
3606	bhv_desc_t	*bdp,
3607	fid_t		*fidp)
3608{
3609	xfs_inode_t	*ip;
3610	xfs_fid2_t	*xfid;
3611
3612	vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3613				       (inst_t *)__return_address);
3614	ASSERT(sizeof(xfs_fid_t) >= sizeof(xfs_fid2_t));
3615
3616	xfid = (xfs_fid2_t *)fidp;
3617	ip = XFS_BHVTOI(bdp);
3618	xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3619	xfid->fid_pad = 0;
3620	/*
3621	 * use memcpy because the inode is a long long and there's no
3622	 * assurance that xfid->fid_ino is properly aligned.
3623	 */
3624	memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3625	xfid->fid_gen = ip->i_d.di_gen;
3626
3627	return 0;
3628}
3629
3630
3631/*
3632 * xfs_rwlock
3633 */
3634int
3635xfs_rwlock(
3636	bhv_desc_t	*bdp,
3637	vrwlock_t	locktype)
3638{
3639	xfs_inode_t	*ip;
3640	xfs_vnode_t	*vp;
3641
3642	vp = BHV_TO_VNODE(bdp);
3643	if (VN_ISDIR(vp))
3644		return 1;
3645	ip = XFS_BHVTOI(bdp);
3646	if (locktype == VRWLOCK_WRITE) {
3647		xfs_ilock(ip, XFS_IOLOCK_EXCL);
3648	} else if (locktype == VRWLOCK_TRY_READ) {
3649		return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3650	} else if (locktype == VRWLOCK_TRY_WRITE) {
3651		return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3652	} else {
3653		ASSERT((locktype == VRWLOCK_READ) ||
3654		       (locktype == VRWLOCK_WRITE_DIRECT));
3655		xfs_ilock(ip, XFS_IOLOCK_SHARED);
3656	}
3657
3658	return 1;
3659}
3660
3661
3662/*
3663 * xfs_rwunlock
3664 */
3665void
3666xfs_rwunlock(
3667	bhv_desc_t	*bdp,
3668	vrwlock_t	locktype)
3669{
3670	xfs_inode_t     *ip;
3671	xfs_vnode_t	*vp;
3672
3673	vp = BHV_TO_VNODE(bdp);
3674	if (VN_ISDIR(vp))
3675		return;
3676	ip = XFS_BHVTOI(bdp);
3677	if (locktype == VRWLOCK_WRITE) {
3678		/*
3679		 * In the write case, we may have added a new entry to
3680		 * the reference cache.  This might store a pointer to
3681		 * an inode to be released in this inode.  If it is there,
3682		 * clear the pointer and release the inode after unlocking
3683		 * this one.
3684		 */
3685		xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3686	} else {
3687		ASSERT((locktype == VRWLOCK_READ) ||
3688		       (locktype == VRWLOCK_WRITE_DIRECT));
3689		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3690	}
3691	return;
3692}
3693
/*
 * xfs_inode_flush
 *
 * Push a dirty inode toward disk.  FLUSH_LOG forces the log up to the
 * inode's last-logged LSN; FLUSH_INODE writes the inode itself, either
 * synchronously (FLUSH_SYNC) or with non-blocking trylocks.  Returns
 * EAGAIN rather than blocking when the inode is pinned or contended;
 * such inodes are picked up later by xfs_sync.
 */
STATIC int
xfs_inode_flush(
	bhv_desc_t	*bdp,
	int		flags)
{
	xfs_inode_t	*ip;
	xfs_mount_t	*mp;
	xfs_inode_log_item_t *iip;
	int		error = 0;

	ip = XFS_BHVTOI(bdp);
	mp = ip->i_mount;
	iip = ip->i_itemp;

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/*
	 * Bypass inodes which have already been cleaned by
	 * the inode flush clustering code inside xfs_iflush
	 */
	if ((ip->i_update_core == 0) &&
	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
		return 0;

	if (flags & FLUSH_LOG) {
		if (iip && iip->ili_last_lsn) {
			xlog_t		*log = mp->m_log;
			xfs_lsn_t	sync_lsn;
			int		s, log_flags = XFS_LOG_FORCE;

			/* Snapshot the LSN the log has last synced to disk. */
			s = GRANT_LOCK(log);
			sync_lsn = log->l_last_sync_lsn;
			GRANT_UNLOCK(log, s);

			/* Already on stable storage; nothing to force. */
			if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
				return 0;

			if (flags & FLUSH_SYNC)
				log_flags |= XFS_LOG_SYNC;
			return xfs_log_force(mp, iip->ili_last_lsn, log_flags);
		}
	}

	/*
	 * We make this non-blocking if the inode is contended,
	 * return EAGAIN to indicate to the caller that they
	 * did not succeed. This prevents the flush path from
	 * blocking on inodes inside another operation right
	 * now, they get caught later by xfs_sync.
	 */
	if (flags & FLUSH_INODE) {
		int	flush_flags;

		/* A pinned inode cannot be flushed until the log is forced. */
		if (xfs_ipincount(ip))
			return EAGAIN;

		if (flags & FLUSH_SYNC) {
			xfs_ilock(ip, XFS_ILOCK_SHARED);
			xfs_iflock(ip);
		} else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
			/* Re-check the pin count after getting the ilock. */
			if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
				xfs_iunlock(ip, XFS_ILOCK_SHARED);
				return EAGAIN;
			}
		} else {
			return EAGAIN;
		}

		if (flags & FLUSH_SYNC)
			flush_flags = XFS_IFLUSH_SYNC;
		else
			flush_flags = XFS_IFLUSH_ASYNC;

		/* xfs_iflush drops the flush lock; we drop the ilock. */
		error = xfs_iflush(ip, flush_flags);
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
	}

	return error;
}
3774
3775
3776int
3777xfs_set_dmattrs (
3778	bhv_desc_t	*bdp,
3779	u_int		evmask,
3780	u_int16_t	state,
3781	cred_t		*credp)
3782{
3783	xfs_inode_t     *ip;
3784	xfs_trans_t	*tp;
3785	xfs_mount_t	*mp;
3786	int		error;
3787
3788	if (!capable(CAP_SYS_ADMIN))
3789		return XFS_ERROR(EPERM);
3790
3791	ip = XFS_BHVTOI(bdp);
3792	mp = ip->i_mount;
3793
3794	if (XFS_FORCED_SHUTDOWN(mp))
3795		return XFS_ERROR(EIO);
3796
3797	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3798	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3799	if (error) {
3800		xfs_trans_cancel(tp, 0);
3801		return error;
3802	}
3803	xfs_ilock(ip, XFS_ILOCK_EXCL);
3804	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3805
3806	ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3807	ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3808
3809	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3810	IHOLD(ip);
3811	error = xfs_trans_commit(tp, 0, NULL);
3812
3813	return error;
3814}
3815
3816
3817/*
3818 * xfs_reclaim
3819 */
3820STATIC int
3821xfs_reclaim(
3822	bhv_desc_t	*bdp)
3823{
3824	xfs_inode_t	*ip;
3825	xfs_vnode_t	*vp;
3826
3827	vp = BHV_TO_VNODE(bdp);
3828	ip = XFS_BHVTOI(bdp);
3829
3830	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3831
3832	ASSERT(!VN_MAPPED(vp));
3833
3834	/* bad inode, get out here ASAP */
3835	if (VN_BAD(vp)) {
3836		xfs_ireclaim(ip);
3837		return 0;
3838	}
3839
3840	vn_iowait(vp);
3841
3842	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3843
3844	/*
3845	 * Make sure the atime in the XFS inode is correct before freeing the
3846	 * Linux inode.
3847	 */
3848	xfs_synchronize_atime(ip);
3849
3850	vnode_destroy_vobject(vp->v_vnode);
3851
3852	/* If we have nothing to flush with this inode then complete the
3853	 * teardown now, otherwise break the link between the xfs inode
3854	 * and the linux inode and clean up the xfs inode later. This
3855	 * avoids flushing the inode to disk during the delete operation
3856	 * itself.
3857	 */
3858	if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3859		xfs_ilock(ip, XFS_ILOCK_EXCL);
3860		xfs_iflock(ip);
3861		return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3862	} else {
3863		xfs_mount_t	*mp = ip->i_mount;
3864
3865		/* Protect sync from us */
3866		XFS_MOUNT_ILOCK(mp);
3867		vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3868		TAILQ_INSERT_TAIL(&mp->m_del_inodes, ip, i_reclaim);
3869		ip->i_flags |= XFS_IRECLAIMABLE;
3870		XFS_MOUNT_IUNLOCK(mp);
3871	}
3872	return 0;
3873}
3874
/*
 * xfs_finish_reclaim
 *
 * Final stage of inode reclaim: flush the inode to disk if it is
 * dirty, then free the incore inode.  'locked' means the caller
 * already holds ILOCK_EXCL and the flush lock.  Returns 1 when the
 * inode is already being reclaimed by someone else (caller must back
 * off), 0 once the inode has been reclaimed.
 */
int
xfs_finish_reclaim(
	xfs_inode_t	*ip,
	int		locked,
	int		sync_mode)
{
	xfs_ihash_t	*ih = ip->i_hash;
	xfs_vnode_t	*vp = XFS_ITOV_NULL(ip);
	int		error;

	/* A bad vnode has nothing worth flushing; just reclaim it. */
	if (vp && VN_BAD(vp))
		goto reclaim;

	/* The hash lock here protects a thread in xfs_iget_core from
	 * racing with us on linking the inode back with a vnode.
	 * Once we have the XFS_IRECLAIM flag set it will not touch
	 * us.
	 */
	write_lock(&ih->ih_lock);
	if ((ip->i_flags & XFS_IRECLAIM) ||
	    (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) {
		/* Someone else owns the reclaim; drop any locks we hold. */
		write_unlock(&ih->ih_lock);
		if (locked) {
			xfs_ifunlock(ip);
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}
		return 1;
	}
	ip->i_flags |= XFS_IRECLAIM;
	write_unlock(&ih->ih_lock);

	/*
	 * If the inode is still dirty, then flush it out.  If the inode
	 * is not in the AIL, then it will be OK to flush it delwri as
	 * long as xfs_iflush() does not keep any references to the inode.
	 * We leave that decision up to xfs_iflush() since it has the
	 * knowledge of whether it's OK to simply do a delwri flush of
	 * the inode or whether we need to wait until the inode is
	 * pulled from the AIL.
	 * We get the flush lock regardless, though, just to make sure
	 * we don't free it while it is being flushed.
	 */
	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		if (!locked) {
			xfs_ilock(ip, XFS_ILOCK_EXCL);
			xfs_iflock(ip);
		}

		if (ip->i_update_core ||
		    ((ip->i_itemp != NULL) &&
		     (ip->i_itemp->ili_format.ilf_fields != 0))) {
			error = xfs_iflush(ip, sync_mode);
			/*
			 * If we hit an error, typically because of filesystem
			 * shutdown, we don't need to let vn_reclaim to know
			 * because we're gonna reclaim the inode anyway.
			 */
			if (error) {
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
				goto reclaim;
			}
			xfs_iflock(ip); /* synchronize with xfs_iflush_done */
		}

		ASSERT(ip->i_update_core == 0);
		ASSERT(ip->i_itemp == NULL ||
		       ip->i_itemp->ili_format.ilf_fields == 0);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	} else if (locked) {
		/*
		 * We are not interested in doing an iflush if we're
		 * in the process of shutting down the filesystem forcibly.
		 * So, just reclaim the inode.
		 */
		xfs_ifunlock(ip);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

 reclaim:
	xfs_ireclaim(ip);
	return 0;
}
3957
3958int
3959xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3960{
3961#ifdef RMC
3962	int		purged;
3963	xfs_inode_t	*ip, *n;
3964	int		done = 0;
3965
3966	while (!done) {
3967		purged = 0;
3968		XFS_MOUNT_ILOCK(mp);
3969		TAILQ_FOREACH_SAFE(curr, &mp->m_del_inodes, i_reclaim, next) {
3970			ip = curr;
3971			if (noblock) {
3972				if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3973					continue;
3974				if (xfs_ipincount(ip) ||
3975				    !xfs_iflock_nowait(ip)) {
3976					xfs_iunlock(ip, XFS_ILOCK_EXCL);
3977					continue;
3978				}
3979			}
3980			XFS_MOUNT_IUNLOCK(mp);
3981			if (xfs_finish_reclaim(ip, noblock,
3982					XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3983				delay(1);
3984			purged = 1;
3985			break;
3986		}
3987
3988		done = !purged;
3989	}
3990
3991	XFS_MOUNT_IUNLOCK(mp);
3992#endif
3993	return 0;
3994}
3995
3996/*
3997 * xfs_alloc_file_space()
3998 *      This routine allocates disk space for the given file.
3999 *
4000 *	If alloc_type == 0, this request is for an ALLOCSP type
4001 *	request which will change the file size.  In this case, no
4002 *	DMAPI event will be generated by the call.  A TRUNCATE event
4003 *	will be generated later by xfs_setattr.
4004 *
4005 *	If alloc_type != 0, this request is for a RESVSP type
4006 *	request, and a DMAPI DM_EVENT_WRITE will be generated if the
4007 *	lower block boundary byte address is less than the file's
4008 *	length.
4009 *
4010 * RETURNS:
4011 *       0 on success
4012 *      errno on error
4013 *
4014 */
4015STATIC int
4016xfs_alloc_file_space(
4017	xfs_inode_t		*ip,
4018	xfs_off_t		offset,
4019	xfs_off_t		len,
4020	int			alloc_type,
4021	int			attr_flags)
4022{
4023	xfs_mount_t		*mp = ip->i_mount;
4024	xfs_off_t		count;
4025	xfs_filblks_t		allocated_fsb;
4026	xfs_filblks_t		allocatesize_fsb;
4027	xfs_extlen_t		extsz, temp;
4028	xfs_fileoff_t		startoffset_fsb;
4029	xfs_fsblock_t		firstfsb;
4030	int			nimaps;
4031	int			bmapi_flag;
4032	int			quota_flag;
4033	int			rt;
4034	xfs_trans_t		*tp;
4035	xfs_bmbt_irec_t		imaps[1], *imapp;
4036	xfs_bmap_free_t		free_list;
4037	uint			qblocks, resblks, resrtextents;
4038	int			committed;
4039	int			error;
4040
4041	vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4042
4043	if (XFS_FORCED_SHUTDOWN(mp))
4044		return XFS_ERROR(EIO);
4045
4046	rt = XFS_IS_REALTIME_INODE(ip);
4047	if (unlikely(rt)) {
4048		if (!(extsz = ip->i_d.di_extsize))
4049			extsz = mp->m_sb.sb_rextsize;
4050	} else {
4051		extsz = ip->i_d.di_extsize;
4052	}
4053
4054	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4055		return error;
4056
4057	if (len <= 0)
4058		return XFS_ERROR(EINVAL);
4059
4060	count = len;
4061	error = 0;
4062	imapp = &imaps[0];
4063	nimaps = 1;
4064	bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
4065	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
4066	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4067
4068	/*	Generate a DMAPI event if needed.	*/
4069	if (alloc_type != 0 && offset < ip->i_d.di_size &&
4070			(attr_flags&ATTR_DMI) == 0  &&
4071			DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4072		xfs_off_t           end_dmi_offset;
4073
4074		end_dmi_offset = offset+len;
4075		if (end_dmi_offset > ip->i_d.di_size)
4076			end_dmi_offset = ip->i_d.di_size;
4077		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4078			offset, end_dmi_offset - offset,
4079			0, NULL);
4080		if (error)
4081			return error;
4082	}
4083
4084	/*
4085	 * Allocate file space until done or until there is an error
4086	 */
4087retry:
4088	while (allocatesize_fsb && !error) {
4089		xfs_fileoff_t	s, e;
4090
4091		/*
4092		 * Determine space reservations for data/realtime.
4093		 */
4094		if (unlikely(extsz)) {
4095			s = startoffset_fsb;
4096			do_div(s, extsz);
4097			s *= extsz;
4098			e = startoffset_fsb + allocatesize_fsb;
4099			if ((temp = do_mod(startoffset_fsb, extsz)))
4100				e += temp;
4101			if ((temp = do_mod(e, extsz)))
4102				e += extsz - temp;
4103		} else {
4104			s = 0;
4105			e = allocatesize_fsb;
4106		}
4107
4108		if (unlikely(rt)) {
4109			resrtextents = qblocks = (uint)(e - s);
4110			resrtextents /= mp->m_sb.sb_rextsize;
4111			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4112			quota_flag = XFS_QMOPT_RES_RTBLKS;
4113		} else {
4114			resrtextents = 0;
4115			resblks = qblocks = \
4116				XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
4117			quota_flag = XFS_QMOPT_RES_REGBLKS;
4118		}
4119
4120		/*
4121		 * Allocate and setup the transaction.
4122		 */
4123		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4124		error = xfs_trans_reserve(tp, resblks,
4125					  XFS_WRITE_LOG_RES(mp), resrtextents,
4126					  XFS_TRANS_PERM_LOG_RES,
4127					  XFS_WRITE_LOG_COUNT);
4128		/*
4129		 * Check for running out of space
4130		 */
4131		if (error) {
4132			/*
4133			 * Free the transaction structure.
4134			 */
4135			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4136			xfs_trans_cancel(tp, 0);
4137			break;
4138		}
4139		xfs_ilock(ip, XFS_ILOCK_EXCL);
4140		error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
4141						      qblocks, 0, quota_flag);
4142		if (error)
4143			goto error1;
4144
4145		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4146		xfs_trans_ihold(tp, ip);
4147
4148		/*
4149		 * Issue the xfs_bmapi() call to allocate the blocks
4150		 */
4151		XFS_BMAP_INIT(&free_list, &firstfsb);
4152		error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4153				  allocatesize_fsb, bmapi_flag,
4154				  &firstfsb, 0, imapp, &nimaps,
4155				  &free_list, NULL);
4156		if (error) {
4157			goto error0;
4158		}
4159
4160		/*
4161		 * Complete the transaction
4162		 */
4163		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
4164		if (error) {
4165			goto error0;
4166		}
4167
4168		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
4169		xfs_iunlock(ip, XFS_ILOCK_EXCL);
4170		if (error) {
4171			break;
4172		}
4173
4174		allocated_fsb = imapp->br_blockcount;
4175
4176		if (nimaps == 0) {
4177			error = XFS_ERROR(ENOSPC);
4178			break;
4179		}
4180
4181		startoffset_fsb += allocated_fsb;
4182		allocatesize_fsb -= allocated_fsb;
4183	}
4184dmapi_enospc_check:
4185	if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 &&
4186	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {
4187
4188		error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4189				XFS_ITOV(ip), DM_RIGHT_NULL,
4190				XFS_ITOV(ip), DM_RIGHT_NULL,
4191				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4192		if (error == 0)
4193			goto retry;	/* Maybe DMAPI app. has made space */
4194		/* else fall through with error from XFS_SEND_DATA */
4195	}
4196
4197	return error;
4198
4199error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
4200	xfs_bmap_cancel(&free_list);
4201	XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
4202
4203error1:	/* Just cancel transaction */
4204	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4205	xfs_iunlock(ip, XFS_ILOCK_EXCL);
4206	goto dmapi_enospc_check;
4207}
4208
4209/*
4210 * Zero file bytes between startoff and endoff inclusive.
4211 * The iolock is held exclusive and no blocks are buffered.
4212 */
4213STATIC int
4214xfs_zero_remaining_bytes(
4215	xfs_inode_t		*ip,
4216	xfs_off_t		startoff,
4217	xfs_off_t		endoff)
4218{
4219	xfs_bmbt_irec_t		imap;
4220	xfs_fileoff_t		offset_fsb;
4221	xfs_off_t		lastoffset;
4222	xfs_off_t		offset;
4223	xfs_buf_t		*bp;
4224	xfs_mount_t		*mp = ip->i_mount;
4225	int			nimap;
4226	int			error = 0;
4227
4228	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4229				ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4230				mp->m_rtdev_targp : mp->m_ddev_targp);
4231
4232	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4233		offset_fsb = XFS_B_TO_FSBT(mp, offset);
4234		nimap = 1;
4235		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
4236			NULL, 0, &imap, &nimap, NULL, NULL);
4237		if (error || nimap < 1)
4238			break;
4239		ASSERT(imap.br_blockcount >= 1);
4240		ASSERT(imap.br_startoff == offset_fsb);
4241		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4242		if (lastoffset > endoff)
4243			lastoffset = endoff;
4244		if (imap.br_startblock == HOLESTARTBLOCK)
4245			continue;
4246		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4247		if (imap.br_state == XFS_EXT_UNWRITTEN)
4248			continue;
4249		XFS_BUF_UNDONE(bp);
4250		XFS_BUF_UNWRITE(bp);
4251		XFS_BUF_READ(bp);
4252		XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4253		xfsbdstrat(mp, bp);
4254		if ((error = xfs_iowait(bp))) {
4255			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4256					  mp, bp, XFS_BUF_ADDR(bp));
4257			break;
4258		}
4259		memset(XFS_BUF_PTR(bp) +
4260			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4261		      0, lastoffset - offset + 1);
4262		XFS_BUF_UNDONE(bp);
4263		XFS_BUF_UNREAD(bp);
4264		XFS_BUF_WRITE(bp);
4265		xfsbdstrat(mp, bp);
4266		if ((error = xfs_iowait(bp))) {
4267			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4268					  mp, bp, XFS_BUF_ADDR(bp));
4269			break;
4270		}
4271	}
4272	xfs_buf_free(bp);
4273	return error;
4274}
4275
4276/*
4277 * xfs_free_file_space()
4278 *      This routine frees disk space for the given file.
4279 *
4280 *	This routine is only called by xfs_change_file_space
4281 *	for an UNRESVSP type call.
4282 *
4283 * RETURNS:
4284 *       0 on success
4285 *      errno on error
4286 *
4287 */
4288STATIC int
4289xfs_free_file_space(
4290	xfs_inode_t		*ip,
4291	xfs_off_t		offset,
4292	xfs_off_t		len,
4293	int			attr_flags)
4294{
4295	xfs_vnode_t		*vp;
4296	int			committed;
4297	int			done;
4298	xfs_off_t		end_dmi_offset;
4299	xfs_fileoff_t		endoffset_fsb;
4300	int			error;
4301	xfs_fsblock_t		firstfsb;
4302	xfs_bmap_free_t		free_list;
4303	xfs_off_t		ilen;
4304	xfs_bmbt_irec_t		imap;
4305	xfs_off_t		ioffset;
4306	xfs_extlen_t		mod=0;
4307	xfs_mount_t		*mp;
4308	int			nimap;
4309	uint			resblks;
4310	int			rounding;
4311	int			rt;
4312	xfs_fileoff_t		startoffset_fsb;
4313	xfs_trans_t		*tp;
4314	int			need_iolock = 1;
4315
4316	vp = XFS_ITOV(ip);
4317	mp = ip->i_mount;
4318
4319	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4320
4321	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4322		return error;
4323
4324	error = 0;
4325	if (len <= 0)	/* if nothing being freed */
4326		return error;
4327	rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
4328	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
4329	end_dmi_offset = offset + len;
4330	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4331
4332	if (offset < ip->i_d.di_size &&
4333	    (attr_flags & ATTR_DMI) == 0 &&
4334	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4335		if (end_dmi_offset > ip->i_d.di_size)
4336			end_dmi_offset = ip->i_d.di_size;
4337		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4338				offset, end_dmi_offset - offset,
4339				AT_DELAY_FLAG(attr_flags), NULL);
4340		if (error)
4341			return error;
4342	}
4343
4344	ASSERT(attr_flags & ATTR_NOLOCK ? attr_flags & ATTR_DMI : 1);
4345	if (attr_flags & ATTR_NOLOCK)
4346		need_iolock = 0;
4347	if (need_iolock) {
4348		xfs_ilock(ip, XFS_IOLOCK_EXCL);
4349		vn_iowait(vp);	/* wait for the completion of any pending DIOs */
4350	}
4351
4352	rounding = MAX((__uint8_t)(1 << mp->m_sb.sb_blocklog),
4353			(__uint8_t)NBPP);
4354	ilen = len + (offset & (rounding - 1));
4355	ioffset = offset & ~(rounding - 1);
4356	if (ilen & (rounding - 1))
4357		ilen = (ilen + rounding) & ~(rounding - 1);
4358
4359	if (VN_CACHED(vp) != 0) {
4360		xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
4361				ctooff(offtoct(ioffset)), -1);
4362		XVOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(ioffset)),
4363				-1, FI_REMAPF_LOCKED);
4364	}
4365
4366	/*
4367	 * Need to zero the stuff we're not freeing, on disk.
4368	 * If its a realtime file & can't use unwritten extents then we
4369	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4370	 * will take care of it for us.
4371	 */
4372	if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4373		nimap = 1;
4374		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
4375			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4376		if (error)
4377			goto out_unlock_iolock;
4378		ASSERT(nimap == 0 || nimap == 1);
4379		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4380			xfs_daddr_t	block;
4381
4382			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4383			block = imap.br_startblock;
4384			mod = do_div(block, mp->m_sb.sb_rextsize);
4385			if (mod)
4386				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4387		}
4388		nimap = 1;
4389		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
4390			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4391		if (error)
4392			goto out_unlock_iolock;
4393		ASSERT(nimap == 0 || nimap == 1);
4394		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4395			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4396			mod++;
4397			if (mod && (mod != mp->m_sb.sb_rextsize))
4398				endoffset_fsb -= mod;
4399		}
4400	}
4401	if ((done = (endoffset_fsb <= startoffset_fsb)))
4402		/*
4403		 * One contiguous piece to clear
4404		 */
4405		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4406	else {
4407		/*
4408		 * Some full blocks, possibly two pieces to clear
4409		 */
4410		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4411			error = xfs_zero_remaining_bytes(ip, offset,
4412				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4413		if (!error &&
4414		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4415			error = xfs_zero_remaining_bytes(ip,
4416				XFS_FSB_TO_B(mp, endoffset_fsb),
4417				offset + len - 1);
4418	}
4419
4420	/*
4421	 * free file space until done or until there is an error
4422	 */
4423	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4424	while (!error && !done) {
4425
4426		/*
4427		 * allocate and setup the transaction
4428		 */
4429		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4430		error = xfs_trans_reserve(tp,
4431					  resblks,
4432					  XFS_WRITE_LOG_RES(mp),
4433					  0,
4434					  XFS_TRANS_PERM_LOG_RES,
4435					  XFS_WRITE_LOG_COUNT);
4436
4437		/*
4438		 * check for running out of space
4439		 */
4440		if (error) {
4441			/*
4442			 * Free the transaction structure.
4443			 */
4444			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4445			xfs_trans_cancel(tp, 0);
4446			break;
4447		}
4448		xfs_ilock(ip, XFS_ILOCK_EXCL);
4449		error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4450				ip->i_udquot, ip->i_gdquot, resblks, 0,
4451				XFS_QMOPT_RES_REGBLKS);
4452		if (error)
4453			goto error1;
4454
4455		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4456		xfs_trans_ihold(tp, ip);
4457
4458		/*
4459		 * issue the bunmapi() call to free the blocks
4460		 */
4461		XFS_BMAP_INIT(&free_list, &firstfsb);
4462		error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4463				  endoffset_fsb - startoffset_fsb,
4464				  0, 2, &firstfsb, &free_list, NULL, &done);
4465		if (error) {
4466			goto error0;
4467		}
4468
4469		/*
4470		 * complete the transaction
4471		 */
4472		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
4473		if (error) {
4474			goto error0;
4475		}
4476
4477		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
4478		xfs_iunlock(ip, XFS_ILOCK_EXCL);
4479	}
4480
4481 out_unlock_iolock:
4482	if (need_iolock)
4483		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4484	return error;
4485
4486 error0:
4487	xfs_bmap_cancel(&free_list);
4488 error1:
4489	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4490	xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
4491		    XFS_ILOCK_EXCL);
4492	return error;
4493}
4494
4495/*
4496 * xfs_change_file_space()
4497 *      This routine allocates or frees disk space for the given file.
4498 *      The user specified parameters are checked for alignment and size
4499 *      limitations.
4500 *
4501 * RETURNS:
4502 *       0 on success
4503 *      errno on error
4504 *
4505 */
4506int
4507xfs_change_file_space(
4508	bhv_desc_t	*bdp,
4509	u_long		cmd,
4510	xfs_flock64_t	*bf,
4511	xfs_off_t	offset,
4512	cred_t		*credp,
4513	int		attr_flags)
4514{
4515	int		clrprealloc;
4516	int		error;
4517	xfs_fsize_t	fsize;
4518	xfs_inode_t	*ip;
4519	xfs_mount_t	*mp;
4520	int		setprealloc;
4521	xfs_off_t	startoffset;
4522	xfs_off_t	llen;
4523	xfs_trans_t	*tp;
4524	xfs_vattr_t	va;
4525	xfs_vnode_t	*vp;
4526
4527	vp = BHV_TO_VNODE(bdp);
4528	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4529
4530	ip = XFS_BHVTOI(bdp);
4531	mp = ip->i_mount;
4532
4533	/*
4534	 * must be a regular file and have write permission
4535	 */
4536	if (!VN_ISREG(vp))
4537		return XFS_ERROR(EINVAL);
4538
4539	xfs_ilock(ip, XFS_ILOCK_SHARED);
4540
4541	if ((error = xfs_iaccess(ip, VWRITE, credp))) {
4542		xfs_iunlock(ip, XFS_ILOCK_SHARED);
4543		return error;
4544	}
4545
4546	xfs_iunlock(ip, XFS_ILOCK_SHARED);
4547
4548	switch (bf->l_whence) {
4549	case 0: /*SEEK_SET*/
4550		break;
4551	case 1: /*SEEK_CUR*/
4552		bf->l_start += offset;
4553		break;
4554	case 2: /*SEEK_END*/
4555		bf->l_start += ip->i_d.di_size;
4556		break;
4557	default:
4558		return XFS_ERROR(EINVAL);
4559	}
4560
4561	llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4562
4563	if (   (bf->l_start < 0)
4564	    || (bf->l_start > XFS_MAXIOFFSET(mp))
4565	    || (bf->l_start + llen < 0)
4566	    || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4567		return XFS_ERROR(EINVAL);
4568
4569	bf->l_whence = 0;
4570
4571	startoffset = bf->l_start;
4572	fsize = ip->i_d.di_size;
4573
4574	/*
4575	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4576	 * file space.
4577	 * These calls do NOT zero the data space allocated to the file,
4578	 * nor do they change the file size.
4579	 *
4580	 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4581	 * space.
4582	 * These calls cause the new file data to be zeroed and the file
4583	 * size to be changed.
4584	 */
4585	setprealloc = clrprealloc = 0;
4586
4587	switch (cmd) {
4588	case XFS_IOC_RESVSP:
4589	case XFS_IOC_RESVSP64:
4590		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4591								1, attr_flags);
4592		if (error)
4593			return error;
4594		setprealloc = 1;
4595		break;
4596
4597	case XFS_IOC_UNRESVSP:
4598	case XFS_IOC_UNRESVSP64:
4599		if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4600								attr_flags)))
4601			return error;
4602		break;
4603
4604	case XFS_IOC_ALLOCSP:
4605	case XFS_IOC_ALLOCSP64:
4606	case XFS_IOC_FREESP:
4607	case XFS_IOC_FREESP64:
4608		if (startoffset > fsize) {
4609			error = xfs_alloc_file_space(ip, fsize,
4610					startoffset - fsize, 0, attr_flags);
4611			if (error)
4612				break;
4613		}
4614
4615		va.va_mask = XFS_AT_SIZE;
4616		va.va_size = startoffset;
4617
4618		error = xfs_setattr(bdp, &va, attr_flags, credp);
4619
4620		if (error)
4621			return error;
4622
4623		clrprealloc = 1;
4624		break;
4625
4626	default:
4627		ASSERT(0);
4628		return XFS_ERROR(EINVAL);
4629	}
4630
4631	/*
4632	 * update the inode timestamp, mode, and prealloc flag bits
4633	 */
4634	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4635
4636	if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4637				      0, 0, 0))) {
4638		/* ASSERT(0); */
4639		xfs_trans_cancel(tp, 0);
4640		return error;
4641	}
4642
4643	xfs_ilock(ip, XFS_ILOCK_EXCL);
4644
4645	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4646	xfs_trans_ihold(tp, ip);
4647
4648	if ((attr_flags & ATTR_DMI) == 0) {
4649		ip->i_d.di_mode &= ~S_ISUID;
4650
4651		/*
4652		 * Note that we don't have to worry about mandatory
4653		 * file locking being disabled here because we only
4654		 * clear the S_ISGID bit if the Group execute bit is
4655		 * on, but if it was on then mandatory locking wouldn't
4656		 * have been enabled.
4657		 */
4658		if (ip->i_d.di_mode & S_IXGRP)
4659			ip->i_d.di_mode &= ~S_ISGID;
4660
4661		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4662	}
4663	if (setprealloc)
4664		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4665	else if (clrprealloc)
4666		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4667
4668	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4669	xfs_trans_set_sync(tp);
4670
4671	error = xfs_trans_commit(tp, 0, NULL);
4672
4673	xfs_iunlock(ip, XFS_ILOCK_EXCL);
4674
4675	return error;
4676}
4677
4678
/*
 * The XFS vnode operations vector: maps generic vnode operations onto
 * their XFS implementations.  Installed as a behavior at position
 * VNODE_POSITION_XFS in the vnode behavior chain.  Operations XFS does
 * not implement specially are filled with generic fs_* helpers or
 * fs_noval no-ops.
 */
xfs_vnodeops_t xfs_vnodeops = {
	BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
	.vop_open		= xfs_open,
	.vop_read		= xfs_read,
#ifdef HAVE_SENDFILE
	.vop_sendfile		= xfs_sendfile,
#endif
	.vop_write		= xfs_write,
	.vop_ioctl		= xfs_ioctl,
	.vop_getattr		= xfs_getattr,
	.vop_setattr		= xfs_setattr,
	.vop_access		= xfs_access,
	.vop_lookup		= xfs_lookup,
	.vop_create		= xfs_create,
	.vop_remove		= xfs_remove,
	.vop_link		= xfs_link,
	.vop_rename		= xfs_rename,
	.vop_mkdir		= xfs_mkdir,
	.vop_rmdir		= xfs_rmdir,
	.vop_readdir		= xfs_readdir,
	.vop_symlink		= xfs_symlink,
	.vop_readlink		= xfs_readlink,
	.vop_fsync		= xfs_fsync,
	.vop_inactive		= xfs_inactive,
	.vop_fid2		= xfs_fid2,
	.vop_rwlock		= xfs_rwlock,
	.vop_rwunlock		= xfs_rwunlock,
	.vop_bmap		= xfs_bmap,
	.vop_reclaim		= xfs_reclaim,
	/* extended attribute operations */
	.vop_attr_get		= xfs_attr_get,
	.vop_attr_set		= xfs_attr_set,
	.vop_attr_remove	= xfs_attr_remove,
	.vop_attr_list		= xfs_attr_list,
	/* no-op and generic page-cache helpers */
	.vop_link_removed	= (xfs_vop_link_removed_t)fs_noval,
	.vop_vnode_change	= (xfs_vop_vnode_change_t)fs_noval,
	.vop_tosspages		= fs_tosspages,
	.vop_flushinval_pages	= fs_flushinval_pages,
	.vop_flush_pages	= fs_flush_pages,
	.vop_release		= xfs_release,
	.vop_iflush		= xfs_inode_flush,
};
4720