1/*-
2 *  modified for EXT2FS support in Lites 1.1
3 *
4 *  Aug 1995, Godmar Back (gback@cs.utah.edu)
5 *  University of Utah, Department of Computer Science
6 */
7/*-
8 * SPDX-License-Identifier: BSD-3-Clause
9 *
10 * Copyright (c) 1982, 1986, 1989, 1993
11 *	The Regents of the University of California.  All rights reserved.
12 * (c) UNIX System Laboratories, Inc.
13 * All or some portions of this file are derived from material licensed
14 * to the University of California by American Telephone and Telegraph
15 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
16 * the permission of UNIX System Laboratories, Inc.
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 * 1. Redistributions of source code must retain the above copyright
22 *    notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 *    notice, this list of conditions and the following disclaimer in the
25 *    documentation and/or other materials provided with the distribution.
26 * 3. Neither the name of the University nor the names of its contributors
27 *    may be used to endorse or promote products derived from this software
28 *    without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * SUCH DAMAGE.
41 *
42 *	@(#)ufs_vnops.c	8.7 (Berkeley) 2/3/94
43 *	@(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95
44 * $FreeBSD$
45 */
46
47#include "opt_suiddir.h"
48
49#include <sys/param.h>
50#include <sys/systm.h>
51#include <sys/kernel.h>
52#include <sys/fcntl.h>
53#include <sys/filio.h>
54#include <sys/limits.h>
55#include <sys/sdt.h>
56#include <sys/stat.h>
57#include <sys/bio.h>
58#include <sys/buf.h>
59#include <sys/endian.h>
60#include <sys/priv.h>
61#include <sys/rwlock.h>
62#include <sys/mount.h>
63#include <sys/unistd.h>
64#include <sys/time.h>
65#include <sys/vnode.h>
66#include <sys/namei.h>
67#include <sys/lockf.h>
68#include <sys/event.h>
69#include <sys/conf.h>
70#include <sys/file.h>
71#include <sys/extattr.h>
72#include <sys/vmmeter.h>
73
74#include <vm/vm.h>
75#include <vm/vm_param.h>
76#include <vm/vm_extern.h>
77#include <vm/vm_object.h>
78#include <vm/vm_page.h>
79#include <vm/vm_pager.h>
80#include <vm/vnode_pager.h>
81
82#include "opt_directio.h"
83
84#include <ufs/ufs/dir.h>
85
86#include <fs/ext2fs/fs.h>
87#include <fs/ext2fs/inode.h>
88#include <fs/ext2fs/ext2_acl.h>
89#include <fs/ext2fs/ext2fs.h>
90#include <fs/ext2fs/ext2_extern.h>
91#include <fs/ext2fs/ext2_dinode.h>
92#include <fs/ext2fs/ext2_dir.h>
93#include <fs/ext2fs/ext2_mount.h>
94#include <fs/ext2fs/ext2_extattr.h>
95#include <fs/ext2fs/ext2_extents.h>
96
/* DTrace provider for ext2fs; only declared here, so it is defined elsewhere. */
SDT_PROVIDER_DECLARE(ext2fs);
/*
 * ext2fs trace probe:
 * arg0: verbosity. Higher numbers give more verbose messages
 * arg1: Textual message
 */
SDT_PROBE_DEFINE2(ext2fs, , vnops, trace, "int", "char*");

/* File-local helpers. */
static int ext2_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *);
static void ext2_itimes_locked(struct vnode *);

/*
 * Vnode operation implementations private to this file.  They are
 * installed in the ext2_vnodeops and ext2_fifoops vectors below.
 */
static vop_access_t	ext2_access;
static int ext2_chmod(struct vnode *, int, struct ucred *, struct thread *);
static int ext2_chown(struct vnode *, uid_t, gid_t, struct ucred *,
    struct thread *);
static vop_close_t	ext2_close;
static vop_create_t	ext2_create;
static vop_fsync_t	ext2_fsync;
static vop_getattr_t	ext2_getattr;
static vop_ioctl_t	ext2_ioctl;
static vop_link_t	ext2_link;
static vop_mkdir_t	ext2_mkdir;
static vop_mknod_t	ext2_mknod;
static vop_open_t	ext2_open;
static vop_pathconf_t	ext2_pathconf;
static vop_print_t	ext2_print;
static vop_read_t	ext2_read;
static vop_readlink_t	ext2_readlink;
static vop_remove_t	ext2_remove;
static vop_rename_t	ext2_rename;
static vop_rmdir_t	ext2_rmdir;
static vop_setattr_t	ext2_setattr;
static vop_strategy_t	ext2_strategy;
static vop_symlink_t	ext2_symlink;
static vop_write_t	ext2_write;
/* Extended attribute operations. */
static vop_deleteextattr_t	ext2_deleteextattr;
static vop_getextattr_t	ext2_getextattr;
static vop_listextattr_t	ext2_listextattr;
static vop_setextattr_t	ext2_setextattr;
static vop_vptofh_t	ext2_vptofh;
/* Fifo-specific overrides used by ext2_fifoops. */
static vop_close_t	ext2fifo_close;
static vop_kqfilter_t	ext2fifo_kqfilter;
/*
 * Global vfs data structures for ext2.
 *
 * Operation vector for ext2 vnodes.  Entries without the static
 * prototypes above (ext2_bmap, ext2_lookup, ext2_inactive, ext2_readdir,
 * ext2_reallocblks, ext2_reclaim, and the ACL routines) are not defined
 * in the visible part of this file.  Operations not listed here
 * presumably fall back to default_vnodeops via .vop_default.
 */
struct vop_vector ext2_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_access =		ext2_access,
	.vop_bmap =		ext2_bmap,
	/* Name lookups go through vfs_cache_lookup; see .vop_lookup below. */
	.vop_cachedlookup =	ext2_lookup,
	.vop_close =		ext2_close,
	.vop_create =		ext2_create,
	.vop_fsync =		ext2_fsync,
	.vop_getpages =		vnode_pager_local_getpages,
	.vop_getpages_async =	vnode_pager_local_getpages_async,
	.vop_getattr =		ext2_getattr,
	.vop_inactive =		ext2_inactive,
	.vop_ioctl =		ext2_ioctl,
	.vop_link =		ext2_link,
	.vop_lookup =		vfs_cache_lookup,
	.vop_mkdir =		ext2_mkdir,
	.vop_mknod =		ext2_mknod,
	.vop_open =		ext2_open,
	.vop_pathconf =		ext2_pathconf,
	.vop_poll =		vop_stdpoll,
	.vop_print =		ext2_print,
	.vop_read =		ext2_read,
	.vop_readdir =		ext2_readdir,
	.vop_readlink =		ext2_readlink,
	.vop_reallocblks =	ext2_reallocblks,
	.vop_reclaim =		ext2_reclaim,
	.vop_remove =		ext2_remove,
	.vop_rename =		ext2_rename,
	.vop_rmdir =		ext2_rmdir,
	.vop_setattr =		ext2_setattr,
	.vop_strategy =		ext2_strategy,
	.vop_symlink =		ext2_symlink,
	.vop_write =		ext2_write,
	.vop_deleteextattr =	ext2_deleteextattr,
	.vop_getextattr =	ext2_getextattr,
	.vop_listextattr =	ext2_listextattr,
	.vop_setextattr =	ext2_setextattr,
	/* ACL operations are only wired in when the kernel has UFS_ACL. */
#ifdef UFS_ACL
	.vop_getacl =		ext2_getacl,
	.vop_setacl =		ext2_setacl,
	.vop_aclcheck =		ext2_aclcheck,
#endif /* UFS_ACL */
	.vop_vptofh =		ext2_vptofh,
};
185
/*
 * Operation vector for fifos that live on an ext2 file system.  The
 * attribute and lifecycle operations are the ext2 ones; anything not
 * listed presumably falls back to fifo_specops via .vop_default.
 */
struct vop_vector ext2_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_access =		ext2_access,
	.vop_close =		ext2fifo_close,
	.vop_fsync =		ext2_fsync,
	.vop_getattr =		ext2_getattr,
	.vop_inactive =		ext2_inactive,
	.vop_kqfilter =		ext2fifo_kqfilter,
	.vop_pathconf =		ext2_pathconf,
	.vop_print =		ext2_print,
	/* NOTE(review): read/write are wired to VOP_PANIC, so they are presumably never expected to reach this vector. */
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		ext2_reclaim,
	.vop_setattr =		ext2_setattr,
	.vop_write =		VOP_PANIC,
	.vop_vptofh =		ext2_vptofh,
};
202
/*
 * A virgin directory (no blushing please).
 * Note that the type and namlen fields are reversed relative to ext2.
 * Also, we don't use `struct odirtemplate', since it would just cause
 * endianness problems.
 *
 * Each template holds two entries, "." and "..".  The first has a
 * record length of 12 and the second covers the remaining
 * DIRBLKSIZ - 12 bytes.  Both inode numbers are 0 here.
 */
/* Template carrying the EXT2_FT_DIR file type in each entry. */
static struct dirtemplate mastertemplate = {
	0, 12, 1, EXT2_FT_DIR, ".",
	0, DIRBLKSIZ - 12, 2, EXT2_FT_DIR, ".."
};
/* Same layout, but with EXT2_FT_UNKNOWN as the file type. */
static struct dirtemplate omastertemplate = {
	0, 12, 1, EXT2_FT_UNKNOWN, ".",
	0, DIRBLKSIZ - 12, 2, EXT2_FT_UNKNOWN, ".."
};
217
218static void
219ext2_itimes_locked(struct vnode *vp)
220{
221	struct inode *ip;
222	struct timespec ts;
223
224	ASSERT_VI_LOCKED(vp, __func__);
225
226	ip = VTOI(vp);
227	if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
228		return;
229	if ((vp->v_type == VBLK || vp->v_type == VCHR))
230		ip->i_flag |= IN_LAZYMOD;
231	else
232		ip->i_flag |= IN_MODIFIED;
233	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
234		vfs_timestamp(&ts);
235		if (ip->i_flag & IN_ACCESS) {
236			ip->i_atime = ts.tv_sec;
237			ip->i_atimensec = ts.tv_nsec;
238		}
239		if (ip->i_flag & IN_UPDATE) {
240			ip->i_mtime = ts.tv_sec;
241			ip->i_mtimensec = ts.tv_nsec;
242			ip->i_modrev++;
243		}
244		if (ip->i_flag & IN_CHANGE) {
245			ip->i_ctime = ts.tv_sec;
246			ip->i_ctimensec = ts.tv_nsec;
247		}
248	}
249	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
250}
251
/*
 * Locking wrapper around ext2_itimes_locked(): takes the vnode
 * interlock, applies pending timestamp updates, and drops it again.
 */
void
ext2_itimes(struct vnode *vp)
{
	VI_LOCK(vp);
	ext2_itimes_locked(vp);
	VI_UNLOCK(vp);
}
260
261/*
262 * Create a regular file
263 */
264static int
265ext2_create(struct vop_create_args *ap)
266{
267	int error;
268
269	error =
270	    ext2_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
271	    ap->a_dvp, ap->a_vpp, ap->a_cnp);
272	if (error != 0)
273		return (error);
274	if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0)
275		cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp);
276	return (0);
277}
278
279static int
280ext2_open(struct vop_open_args *ap)
281{
282
283	if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR)
284		return (EOPNOTSUPP);
285
286	/*
287	 * Files marked append-only must be opened for appending.
288	 */
289	if ((VTOI(ap->a_vp)->i_flags & APPEND) &&
290	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
291		return (EPERM);
292
293	vnode_create_vobject(ap->a_vp, VTOI(ap->a_vp)->i_size, ap->a_td);
294
295	return (0);
296}
297
298/*
299 * Close called.
300 *
301 * Update the times on the inode.
302 */
303static int
304ext2_close(struct vop_close_args *ap)
305{
306	struct vnode *vp = ap->a_vp;
307
308	VI_LOCK(vp);
309	if (vp->v_usecount > 1)
310		ext2_itimes_locked(vp);
311	VI_UNLOCK(vp);
312	return (0);
313}
314
315static int
316ext2_access(struct vop_access_args *ap)
317{
318	struct vnode *vp = ap->a_vp;
319	struct inode *ip = VTOI(vp);
320	accmode_t accmode = ap->a_accmode;
321	int error;
322
323	if (vp->v_type == VBLK || vp->v_type == VCHR)
324		return (EOPNOTSUPP);
325
326	/*
327	 * Disallow write attempts on read-only file systems;
328	 * unless the file is a socket, fifo, or a block or
329	 * character device resident on the file system.
330	 */
331	if (accmode & VWRITE) {
332		switch (vp->v_type) {
333		case VDIR:
334		case VLNK:
335		case VREG:
336			if (vp->v_mount->mnt_flag & MNT_RDONLY)
337				return (EROFS);
338			break;
339		default:
340			break;
341		}
342	}
343
344	/* If immutable bit set, nobody gets to write it. */
345	if ((accmode & VWRITE) && (ip->i_flags & (SF_IMMUTABLE | SF_SNAPSHOT)))
346		return (EPERM);
347
348	error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
349	    ap->a_accmode, ap->a_cred, NULL);
350	return (error);
351}
352
353static int
354ext2_getattr(struct vop_getattr_args *ap)
355{
356	struct vnode *vp = ap->a_vp;
357	struct inode *ip = VTOI(vp);
358	struct vattr *vap = ap->a_vap;
359
360	ext2_itimes(vp);
361	/*
362	 * Copy from inode table
363	 */
364	vap->va_fsid = dev2udev(ip->i_devvp->v_rdev);
365	vap->va_fileid = ip->i_number;
366	vap->va_mode = ip->i_mode & ~IFMT;
367	vap->va_nlink = ip->i_nlink;
368	vap->va_uid = ip->i_uid;
369	vap->va_gid = ip->i_gid;
370	vap->va_rdev = ip->i_rdev;
371	vap->va_size = ip->i_size;
372	vap->va_atime.tv_sec = ip->i_atime;
373	vap->va_atime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_atimensec : 0;
374	vap->va_mtime.tv_sec = ip->i_mtime;
375	vap->va_mtime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_mtimensec : 0;
376	vap->va_ctime.tv_sec = ip->i_ctime;
377	vap->va_ctime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_ctimensec : 0;
378	if E2DI_HAS_XTIME(ip) {
379		vap->va_birthtime.tv_sec = ip->i_birthtime;
380		vap->va_birthtime.tv_nsec = ip->i_birthnsec;
381	}
382	vap->va_flags = ip->i_flags;
383	vap->va_gen = ip->i_gen;
384	vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
385	vap->va_bytes = dbtob((u_quad_t)ip->i_blocks);
386	vap->va_type = IFTOVT(ip->i_mode);
387	vap->va_filerev = ip->i_modrev;
388	return (0);
389}
390
391/*
392 * Set attribute vnode op. called from several syscalls
393 */
394static int
395ext2_setattr(struct vop_setattr_args *ap)
396{
397	struct vattr *vap = ap->a_vap;
398	struct vnode *vp = ap->a_vp;
399	struct inode *ip = VTOI(vp);
400	struct ucred *cred = ap->a_cred;
401	struct thread *td = curthread;
402	int error;
403
404	/*
405	 * Check for unsettable attributes.
406	 */
407	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
408	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
409	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
410	    ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
411		return (EINVAL);
412	}
413	if (vap->va_flags != VNOVAL) {
414		/* Disallow flags not supported by ext2fs. */
415		if (vap->va_flags & ~(SF_APPEND | SF_IMMUTABLE | UF_NODUMP))
416			return (EOPNOTSUPP);
417
418		if (vp->v_mount->mnt_flag & MNT_RDONLY)
419			return (EROFS);
420		/*
421		 * Callers may only modify the file flags on objects they
422		 * have VADMIN rights for.
423		 */
424		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
425			return (error);
426		/*
427		 * Unprivileged processes and privileged processes in
428		 * jail() are not permitted to unset system flags, or
429		 * modify flags if any system flags are set.
430		 * Privileged non-jail processes may not modify system flags
431		 * if securelevel > 0 and any existing system flags are set.
432		 */
433		if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) {
434			if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) {
435				error = securelevel_gt(cred, 0);
436				if (error)
437					return (error);
438			}
439		} else {
440			if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND) ||
441			    ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE))
442				return (EPERM);
443		}
444		ip->i_flags = vap->va_flags;
445		ip->i_flag |= IN_CHANGE;
446		if (ip->i_flags & (IMMUTABLE | APPEND))
447			return (0);
448	}
449	if (ip->i_flags & (IMMUTABLE | APPEND))
450		return (EPERM);
451	/*
452	 * Go through the fields and update iff not VNOVAL.
453	 */
454	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
455		if (vp->v_mount->mnt_flag & MNT_RDONLY)
456			return (EROFS);
457		if ((error = ext2_chown(vp, vap->va_uid, vap->va_gid, cred,
458		    td)) != 0)
459			return (error);
460	}
461	if (vap->va_size != VNOVAL) {
462		/*
463		 * Disallow write attempts on read-only file systems;
464		 * unless the file is a socket, fifo, or a block or
465		 * character device resident on the file system.
466		 */
467		switch (vp->v_type) {
468		case VDIR:
469			return (EISDIR);
470		case VLNK:
471		case VREG:
472			if (vp->v_mount->mnt_flag & MNT_RDONLY)
473				return (EROFS);
474			break;
475		default:
476			break;
477		}
478		if ((error = ext2_truncate(vp, vap->va_size, 0, cred, td)) != 0)
479			return (error);
480	}
481	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
482		if (vp->v_mount->mnt_flag & MNT_RDONLY)
483			return (EROFS);
484		/*
485		 * From utimes(2):
486		 * If times is NULL, ... The caller must be the owner of
487		 * the file, have permission to write the file, or be the
488		 * super-user.
489		 * If times is non-NULL, ... The caller must be the owner of
490		 * the file or be the super-user.
491		 */
492		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)) &&
493		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
494		    (error = VOP_ACCESS(vp, VWRITE, cred, td))))
495			return (error);
496		ip->i_flag |= IN_CHANGE | IN_MODIFIED;
497		if (vap->va_atime.tv_sec != VNOVAL) {
498			ip->i_flag &= ~IN_ACCESS;
499			ip->i_atime = vap->va_atime.tv_sec;
500			ip->i_atimensec = vap->va_atime.tv_nsec;
501		}
502		if (vap->va_mtime.tv_sec != VNOVAL) {
503			ip->i_flag &= ~IN_UPDATE;
504			ip->i_mtime = vap->va_mtime.tv_sec;
505			ip->i_mtimensec = vap->va_mtime.tv_nsec;
506		}
507		ip->i_birthtime = vap->va_birthtime.tv_sec;
508		ip->i_birthnsec = vap->va_birthtime.tv_nsec;
509		error = ext2_update(vp, 0);
510		if (error)
511			return (error);
512	}
513	error = 0;
514	if (vap->va_mode != (mode_t)VNOVAL) {
515		if (vp->v_mount->mnt_flag & MNT_RDONLY)
516			return (EROFS);
517		error = ext2_chmod(vp, (int)vap->va_mode, cred, td);
518	}
519	return (error);
520}
521
522/*
523 * Change the mode on a file.
524 * Inode must be locked before calling.
525 */
526static int
527ext2_chmod(struct vnode *vp, int mode, struct ucred *cred, struct thread *td)
528{
529	struct inode *ip = VTOI(vp);
530	int error;
531
532	/*
533	 * To modify the permissions on a file, must possess VADMIN
534	 * for that file.
535	 */
536	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
537		return (error);
538	/*
539	 * Privileged processes may set the sticky bit on non-directories,
540	 * as well as set the setgid bit on a file with a group that the
541	 * process is not a member of.
542	 */
543	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
544		error = priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0);
545		if (error)
546			return (EFTYPE);
547	}
548	if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) {
549		error = priv_check_cred(cred, PRIV_VFS_SETGID, 0);
550		if (error)
551			return (error);
552	}
553	ip->i_mode &= ~ALLPERMS;
554	ip->i_mode |= (mode & ALLPERMS);
555	ip->i_flag |= IN_CHANGE;
556	return (0);
557}
558
559/*
560 * Perform chown operation on inode ip;
561 * inode must be locked prior to call.
562 */
563static int
564ext2_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
565    struct thread *td)
566{
567	struct inode *ip = VTOI(vp);
568	uid_t ouid;
569	gid_t ogid;
570	int error = 0;
571
572	if (uid == (uid_t)VNOVAL)
573		uid = ip->i_uid;
574	if (gid == (gid_t)VNOVAL)
575		gid = ip->i_gid;
576	/*
577	 * To modify the ownership of a file, must possess VADMIN
578	 * for that file.
579	 */
580	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
581		return (error);
582	/*
583	 * To change the owner of a file, or change the group of a file
584	 * to a group of which we are not a member, the caller must
585	 * have privilege.
586	 */
587	if (uid != ip->i_uid || (gid != ip->i_gid &&
588	    !groupmember(gid, cred))) {
589		error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0);
590		if (error)
591			return (error);
592	}
593	ogid = ip->i_gid;
594	ouid = ip->i_uid;
595	ip->i_gid = gid;
596	ip->i_uid = uid;
597	ip->i_flag |= IN_CHANGE;
598	if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) {
599		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0) != 0)
600			ip->i_mode &= ~(ISUID | ISGID);
601	}
602	return (0);
603}
604
605/*
606 * Synch an open file.
607 */
608/* ARGSUSED */
609static int
610ext2_fsync(struct vop_fsync_args *ap)
611{
612	/*
613	 * Flush all dirty buffers associated with a vnode.
614	 */
615
616	vop_stdfsync(ap);
617
618	return (ext2_update(ap->a_vp, ap->a_waitfor == MNT_WAIT));
619}
620
621/*
622 * Mknod vnode call
623 */
624/* ARGSUSED */
625static int
626ext2_mknod(struct vop_mknod_args *ap)
627{
628	struct vattr *vap = ap->a_vap;
629	struct vnode **vpp = ap->a_vpp;
630	struct inode *ip;
631	ino_t ino;
632	int error;
633
634	error = ext2_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
635	    ap->a_dvp, vpp, ap->a_cnp);
636	if (error)
637		return (error);
638	ip = VTOI(*vpp);
639	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
640	if (vap->va_rdev != VNOVAL) {
641		/*
642		 * Want to be able to use this to make badblock
643		 * inodes, so don't truncate the dev number.
644		 */
645		if (!(ip->i_flag & IN_E4EXTENTS))
646			ip->i_rdev = vap->va_rdev;
647	}
648	/*
649	 * Remove inode, then reload it through VFS_VGET so it is
650	 * checked to see if it is an alias of an existing entry in
651	 * the inode cache.	 XXX I don't believe this is necessary now.
652	 */
653	(*vpp)->v_type = VNON;
654	ino = ip->i_number;	/* Save this before vgone() invalidates ip. */
655	vgone(*vpp);
656	vput(*vpp);
657	error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp);
658	if (error) {
659		*vpp = NULL;
660		return (error);
661	}
662	return (0);
663}
664
665static int
666ext2_remove(struct vop_remove_args *ap)
667{
668	struct inode *ip;
669	struct vnode *vp = ap->a_vp;
670	struct vnode *dvp = ap->a_dvp;
671	int error;
672
673	ip = VTOI(vp);
674	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
675	    (VTOI(dvp)->i_flags & APPEND)) {
676		error = EPERM;
677		goto out;
678	}
679	error = ext2_dirremove(dvp, ap->a_cnp);
680	if (error == 0) {
681		ip->i_nlink--;
682		ip->i_flag |= IN_CHANGE;
683	}
684out:
685	return (error);
686}
687
688/*
689 * link vnode call
690 */
691static int
692ext2_link(struct vop_link_args *ap)
693{
694	struct vnode *vp = ap->a_vp;
695	struct vnode *tdvp = ap->a_tdvp;
696	struct componentname *cnp = ap->a_cnp;
697	struct inode *ip;
698	int error;
699
700#ifdef INVARIANTS
701	if ((cnp->cn_flags & HASBUF) == 0)
702		panic("ext2_link: no name");
703#endif
704	ip = VTOI(vp);
705	if ((nlink_t)ip->i_nlink >= EXT4_LINK_MAX) {
706		error = EMLINK;
707		goto out;
708	}
709	if (ip->i_flags & (IMMUTABLE | APPEND)) {
710		error = EPERM;
711		goto out;
712	}
713	ip->i_nlink++;
714	ip->i_flag |= IN_CHANGE;
715	error = ext2_update(vp, !DOINGASYNC(vp));
716	if (!error)
717		error = ext2_direnter(ip, tdvp, cnp);
718	if (error) {
719		ip->i_nlink--;
720		ip->i_flag |= IN_CHANGE;
721	}
722out:
723	return (error);
724}
725
726static int
727ext2_inc_nlink(struct inode *ip)
728{
729
730	ip->i_nlink++;
731
732	if (S_ISDIR(ip->i_mode) &&
733	    EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK) &&
734	    ip->i_nlink > 1) {
735		if (ip->i_nlink >= EXT4_LINK_MAX || ip->i_nlink == 2)
736			ip->i_nlink = 1;
737	} else if (ip->i_nlink > EXT4_LINK_MAX) {
738		ip->i_nlink--;
739		return (EMLINK);
740	}
741
742	return (0);
743}
744
745static void
746ext2_dec_nlink(struct inode *ip)
747{
748
749	if (!S_ISDIR(ip->i_mode) || ip->i_nlink > 2)
750		ip->i_nlink--;
751}
752
/*
 * Rename system call.
 *	rename("foo", "bar");
 * is essentially
 *	unlink("bar");
 *	link("foo", "bar");
 *	unlink("foo");
 * but ``atomically''.  Can't do full commit without saving state in the
 * inode on disk which isn't feasible at this time.  Best we can do is
 * always guarantee the target exists.
 *
 * Basic algorithm is:
 *
 * 1) Bump link count on source while we're linking it to the
 *    target.  This also ensures the inode won't be deleted out
 *    from underneath us while we work (it may be truncated by
 *    a concurrent `trunc' or `open' for creation).
 * 2) Link source to destination.  If destination already exists,
 *    delete it first.
 * 3) Unlink source reference to inode if still around. If a
 *    directory was moved and the parent of the destination
 *    is different from the source, patch the ".." entry in the
 *    directory.
 *
 * Arguments (from struct vop_rename_args):
 *	a_fdvp/a_fvp/a_fcnp	source directory, source vnode and name
 *	a_tdvp/a_tvp/a_tcnp	target directory, target vnode (NULL when
 *				the target does not exist) and name
 *
 * Every exit path releases fdvp, fvp, tdvp and (if present) tvp.
 * Returns 0 on success or an errno value (EXDEV, EPERM, EMLINK,
 * EINVAL, ENOTEMPTY, ENOTDIR, EISDIR, or an error from a helper).
 */
static int
ext2_rename(struct vop_rename_args *ap)
{
	struct vnode *tvp = ap->a_tvp;
	struct vnode *tdvp = ap->a_tdvp;
	struct vnode *fvp = ap->a_fvp;
	struct vnode *fdvp = ap->a_fdvp;
	struct componentname *tcnp = ap->a_tcnp;
	struct componentname *fcnp = ap->a_fcnp;
	struct inode *ip, *xp, *dp;
	struct dirtemplate *dirbuf;
	int doingdirectory = 0, oldparent = 0, newparent = 0;
	int error = 0;
	u_char namlen;

#ifdef INVARIANTS
	if ((tcnp->cn_flags & HASBUF) == 0 ||
	    (fcnp->cn_flags & HASBUF) == 0)
		panic("ext2_rename: no name");
#endif
	/*
	 * Check for cross-device rename.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		/*
		 * Common early-exit path: drop all four vnodes with
		 * nothing modified yet.  tdvp is only vrele'd when it is
		 * the same vnode as tvp, since the vput(tvp) below then
		 * releases its lock.
		 */
abortit:
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		if (tvp)
			vput(tvp);
		vrele(fdvp);
		vrele(fvp);
		return (error);
	}

	/* The existing target must be removable and tdvp must not be append-only. */
	if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
	    (VTOI(tdvp)->i_flags & APPEND))) {
		error = EPERM;
		goto abortit;
	}

	/*
	 * Renaming a file to itself has no effect.  The upper layers should
	 * not call us in that case.  Temporarily just warn if they do.
	 */
	if (fvp == tvp) {
		SDT_PROBE2(ext2fs, , vnops, trace, 1,
		    "rename: fvp == tvp (can't happen)");
		error = 0;
		goto abortit;
	}

	/* fvp is locked here for the source checks and the link count bump. */
	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
		goto abortit;
	dp = VTOI(fdvp);
	ip = VTOI(fvp);
	if (ip->i_nlink >= EXT4_LINK_MAX &&
	    !EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) {
		VOP_UNLOCK(fvp, 0);
		error = EMLINK;
		goto abortit;
	}
	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
	    || (dp->i_flags & APPEND)) {
		VOP_UNLOCK(fvp, 0);
		error = EPERM;
		goto abortit;
	}
	if ((ip->i_mode & IFMT) == IFDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */
		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
		    dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT ||
		    (ip->i_flag & IN_RENAME)) {
			VOP_UNLOCK(fvp, 0);
			error = EINVAL;
			goto abortit;
		}
		/* Mark the directory so a second rename of it is refused above. */
		ip->i_flag |= IN_RENAME;
		oldparent = dp->i_number;
		doingdirectory++;
	}
	/* The source directory is re-acquired in step 3 via relookup(). */
	vrele(fdvp);

	/*
	 * When the target exists, both the directory
	 * and target vnodes are returned locked.
	 */
	dp = VTOI(tdvp);
	xp = NULL;
	if (tvp)
		xp = VTOI(tvp);

	/*
	 * 1) Bump link count while we're moving stuff
	 *    around.  If we crash somewhere before
	 *    completing our work, the link count
	 *    may be wrong, but correctable.
	 *
	 * NOTE(review): the return value of ext2_inc_nlink() is not
	 * checked here, although it can return EMLINK -- confirm the
	 * earlier EXT4_LINK_MAX check covers every case.
	 */
	ext2_inc_nlink(ip);
	ip->i_flag |= IN_CHANGE;
	if ((error = ext2_update(fvp, !DOINGASYNC(fvp))) != 0) {
		VOP_UNLOCK(fvp, 0);
		goto bad;
	}

	/*
	 * If ".." must be changed (ie the directory gets a new
	 * parent) then the source directory must not be in the
	 * directory hierarchy above the target, as this would
	 * orphan everything below the source directory. Also
	 * the user must have write permission in the source so
	 * as to be able to change "..". We must repeat the call
	 * to namei, as the parent directory is unlocked by the
	 * call to checkpath().
	 */
	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
	VOP_UNLOCK(fvp, 0);
	if (oldparent != dp->i_number)
		newparent = dp->i_number;
	if (doingdirectory && newparent) {
		if (error)	/* write access check above */
			goto bad;
		if (xp != NULL)
			vput(tvp);
		/*
		 * Failures from here jump to "out" rather than "bad", so
		 * tdvp/tvp are not released again on those paths.
		 */
		error = ext2_checkpath(ip, dp, tcnp->cn_cred);
		if (error)
			goto out;
		VREF(tdvp);
		error = relookup(tdvp, &tvp, tcnp);
		if (error)
			goto out;
		vrele(tdvp);
		/* Refresh dp/xp: the target may have changed meanwhile. */
		dp = VTOI(tdvp);
		xp = NULL;
		if (tvp)
			xp = VTOI(tvp);
	}
	/*
	 * 2) If target doesn't exist, link the target
	 *    to the source and unlink the source.
	 *    Otherwise, rewrite the target directory
	 *    entry to reference the source inode and
	 *    expunge the original entry's existence.
	 */
	if (xp == NULL) {
		if (dp->i_devvp != ip->i_devvp)
			panic("ext2_rename: EXDEV");
		/*
		 * Account for ".." in new directory.
		 * When source and destination have the same
		 * parent we don't fool with the link count.
		 */
		if (doingdirectory && newparent) {
			error = ext2_inc_nlink(dp);
			if (error)
				goto bad;

			dp->i_flag |= IN_CHANGE;
			error = ext2_update(tdvp, !DOINGASYNC(tdvp));
			if (error)
				goto bad;
		}
		error = ext2_direnter(ip, tdvp, tcnp);
		if (error) {
			/* Undo the ".." accounting done just above. */
			if (doingdirectory && newparent) {
				ext2_dec_nlink(dp);
				dp->i_flag |= IN_CHANGE;
				(void)ext2_update(tdvp, 1);
			}
			goto bad;
		}
		vput(tdvp);
	} else {
		if (xp->i_devvp != dp->i_devvp || xp->i_devvp != ip->i_devvp)
			panic("ext2_rename: EXDEV");
		/*
		 * Short circuit rename(foo, foo).
		 */
		if (xp->i_number == ip->i_number)
			panic("ext2_rename: same file");
		/*
		 * If the parent directory is "sticky", then the user must
		 * own the parent directory, or the destination of the rename,
		 * otherwise the destination may not be changed (except by
		 * root). This implements append-only directories.
		 */
		if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 &&
		    tcnp->cn_cred->cr_uid != dp->i_uid &&
		    xp->i_uid != tcnp->cn_cred->cr_uid) {
			error = EPERM;
			goto bad;
		}
		/*
		 * Target must be empty if a directory and have no links
		 * to it. Also, ensure source and target are compatible
		 * (both directories, or both not directories).
		 */
		if ((xp->i_mode & IFMT) == IFDIR) {
			if (!ext2_dirempty(xp, dp->i_number, tcnp->cn_cred)) {
				error = ENOTEMPTY;
				goto bad;
			}
			if (!doingdirectory) {
				error = ENOTDIR;
				goto bad;
			}
			cache_purge(tdvp);
		} else if (doingdirectory) {
			error = EISDIR;
			goto bad;
		}
		error = ext2_dirrewrite(dp, ip, tcnp);
		if (error)
			goto bad;
		/*
		 * If the target directory is in the same
		 * directory as the source directory,
		 * decrement the link count on the parent
		 * of the target directory.
		 */
		if (doingdirectory && !newparent) {
			ext2_dec_nlink(dp);
			dp->i_flag |= IN_CHANGE;
		}
		vput(tdvp);
		/*
		 * Adjust the link count of the target to
		 * reflect the dirrewrite above.  If this is
		 * a directory it is empty and there are
		 * no links to it, so we can squash the inode and
		 * any space associated with it.  We disallowed
		 * renaming over top of a directory with links to
		 * it above, as the remaining link would point to
		 * a directory without "." or ".." entries.
		 */
		ext2_dec_nlink(xp);
		if (doingdirectory) {
			if (xp->i_nlink > 2)
				panic("ext2_rename: linked directory");
			/*
			 * NOTE(review): this error value is overwritten
			 * by the relookup() call in step 3 before it is
			 * examined, so a truncate failure goes unreported.
			 */
			error = ext2_truncate(tvp, (off_t)0, IO_SYNC,
			    tcnp->cn_cred, tcnp->cn_thread);
			xp->i_nlink = 0;
		}
		xp->i_flag |= IN_CHANGE;
		vput(tvp);
		xp = NULL;
	}

	/*
	 * 3) Unlink the source.
	 */
	fcnp->cn_flags &= ~MODMASK;
	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
	VREF(fdvp);
	error = relookup(fdvp, &fvp, fcnp);
	if (error == 0)
		vrele(fdvp);
	if (fvp != NULL) {
		/* From here on xp/dp describe the re-looked-up source. */
		xp = VTOI(fvp);
		dp = VTOI(fdvp);
	} else {
		/*
		 * From name has disappeared.  IN_RENAME is not sufficient
		 * to protect against directory races due to timing windows,
		 * so we can't panic here.
		 */
		vrele(ap->a_fvp);
		return (0);
	}
	/*
	 * Ensure that the directory entry still exists and has not
	 * changed while the new name has been entered. If the source is
	 * a file then the entry may have been unlinked or renamed. In
	 * either case there is no further work to be done. If the source
	 * is a directory then it cannot have been rmdir'ed; its link
	 * count of three would cause a rmdir to fail with ENOTEMPTY.
	 * The IN_RENAME flag ensures that it cannot be moved by another
	 * rename.
	 */
	if (xp != ip) {
		/*
		 * From name resolves to a different inode.  IN_RENAME is
		 * not sufficient protection against timing window races
		 * so we can't panic here.
		 */
	} else {
		/*
		 * If the source is a directory with a
		 * new parent, the link count of the old
		 * parent directory must be decremented
		 * and ".." set to point to the new parent.
		 */
		if (doingdirectory && newparent) {
			ext2_dec_nlink(dp);
			dp->i_flag |= IN_CHANGE;
			/* Read the first block of the moved directory. */
			dirbuf = malloc(dp->i_e2fs->e2fs_bsize, M_TEMP, M_WAITOK | M_ZERO);
			error = vn_rdwr(UIO_READ, fvp, (caddr_t)dirbuf,
			    ip->i_e2fs->e2fs_bsize, (off_t)0,
			    UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
			    tcnp->cn_cred, NOCRED, NULL, NULL);
			if (error == 0) {
				/* Like ufs little-endian: */
				namlen = dirbuf->dotdot_type;
				if (namlen != 2 ||
				    dirbuf->dotdot_name[0] != '.' ||
				    dirbuf->dotdot_name[1] != '.') {
					ext2_dirbad(xp, (doff_t)12,
					    "rename: mangled dir");
				} else {
					dirbuf->dotdot_ino = newparent;
					/*
					 * dirblock 0 could be htree root,
					 * try both csum update functions.
					 */
					ext2_dirent_csum_set(ip,
					    (struct ext2fs_direct_2 *)dirbuf);
					ext2_dx_csum_set(ip,
					    (struct ext2fs_direct_2 *)dirbuf);
					/* Best effort: the write result is deliberately ignored. */
					(void)vn_rdwr(UIO_WRITE, fvp,
					    (caddr_t)dirbuf,
					    ip->i_e2fs->e2fs_bsize,
					    (off_t)0, UIO_SYSSPACE,
					    IO_NODELOCKED | IO_SYNC |
					    IO_NOMACCHECK, tcnp->cn_cred,
					    NOCRED, NULL, NULL);
					cache_purge(fdvp);
				}
			}
			free(dirbuf, M_TEMP);
		}
		/* Remove the old name; this sets the function's final status. */
		error = ext2_dirremove(fdvp, fcnp);
		if (!error) {
			ext2_dec_nlink(xp);
			xp->i_flag |= IN_CHANGE;
		}
		xp->i_flag &= ~IN_RENAME;
	}
	if (dp)
		vput(fdvp);
	if (xp)
		vput(fvp);
	/* Drop the reference on the source vnode as originally passed in. */
	vrele(ap->a_fvp);
	return (error);

	/*
	 * Error exits.  "bad" releases the target vnode (if any) and the
	 * target directory; "out" then undoes the link count bump from
	 * step 1 on the source and releases it.
	 */
bad:
	if (xp)
		vput(ITOV(xp));
	vput(ITOV(dp));
out:
	if (doingdirectory)
		ip->i_flag &= ~IN_RENAME;
	if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
		ext2_dec_nlink(ip);
		ip->i_flag |= IN_CHANGE;
		ip->i_flag &= ~IN_RENAME;
		vput(fvp);
	} else
		vrele(fvp);
	return (error);
}
1142
1143#ifdef UFS_ACL
1144static int
1145ext2_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp,
1146    mode_t dmode, struct ucred *cred, struct thread *td)
1147{
1148	int error;
1149	struct inode *ip = VTOI(tvp);
1150	struct acl *dacl, *acl;
1151
1152	acl = acl_alloc(M_WAITOK);
1153	dacl = acl_alloc(M_WAITOK);
1154
1155	/*
1156	 * Retrieve default ACL from parent, if any.
1157	 */
1158	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td);
1159	switch (error) {
1160	case 0:
1161		/*
1162		 * Retrieved a default ACL, so merge mode and ACL if
1163		 * necessary.  If the ACL is empty, fall through to
1164		 * the "not defined or available" case.
1165		 */
1166		if (acl->acl_cnt != 0) {
1167			dmode = acl_posix1e_newfilemode(dmode, acl);
1168			ip->i_mode = dmode;
1169			*dacl = *acl;
1170			ext2_sync_acl_from_inode(ip, acl);
1171			break;
1172		}
1173		/* FALLTHROUGH */
1174
1175	case EOPNOTSUPP:
1176		/*
1177		 * Just use the mode as-is.
1178		 */
1179		ip->i_mode = dmode;
1180		error = 0;
1181		goto out;
1182
1183	default:
1184		goto out;
1185	}
1186
1187	error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td);
1188	if (error == 0)
1189		error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td);
1190	switch (error) {
1191	case 0:
1192		break;
1193
1194	case EOPNOTSUPP:
1195		/*
1196		 * XXX: This should not happen, as EOPNOTSUPP above
1197		 * was supposed to free acl.
1198		 */
1199#ifdef DEBUG
1200		printf("ext2_mkdir: VOP_GETACL() but no VOP_SETACL()\n");
1201#endif	/* DEBUG */
1202		break;
1203
1204	default:
1205		goto out;
1206	}
1207
1208out:
1209	acl_free(acl);
1210	acl_free(dacl);
1211
1212	return (error);
1213}
1214
1215static int
1216ext2_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp,
1217    mode_t mode, struct ucred *cred, struct thread *td)
1218{
1219	int error;
1220	struct inode *ip = VTOI(tvp);
1221	struct acl *acl;
1222
1223	acl = acl_alloc(M_WAITOK);
1224
1225	/*
1226	 * Retrieve default ACL for parent, if any.
1227	 */
1228	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td);
1229	switch (error) {
1230	case 0:
1231		/*
1232		 * Retrieved a default ACL, so merge mode and ACL if
1233		 * necessary.
1234		 */
1235		if (acl->acl_cnt != 0) {
1236			/*
1237			 * Two possible ways for default ACL to not
1238			 * be present.  First, the EA can be
1239			 * undefined, or second, the default ACL can
1240			 * be blank.  If it's blank, fall through to
1241			 * the it's not defined case.
1242			 */
1243			mode = acl_posix1e_newfilemode(mode, acl);
1244			ip->i_mode = mode;
1245			ext2_sync_acl_from_inode(ip, acl);
1246			break;
1247		}
1248		/* FALLTHROUGH */
1249
1250	case EOPNOTSUPP:
1251		/*
1252		 * Just use the mode as-is.
1253		 */
1254		ip->i_mode = mode;
1255		error = 0;
1256		goto out;
1257
1258	default:
1259		goto out;
1260	}
1261
1262	error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td);
1263	switch (error) {
1264	case 0:
1265		break;
1266
1267	case EOPNOTSUPP:
1268		/*
1269		 * XXX: This should not happen, as EOPNOTSUPP above was
1270		 * supposed to free acl.
1271		 */
1272		printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() "
1273		    "but no VOP_SETACL()\n");
1274		/* panic("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() "
1275		    "but no VOP_SETACL()"); */
1276		break;
1277
1278	default:
1279		goto out;
1280	}
1281
1282out:
1283	acl_free(acl);
1284
1285	return (error);
1286}
1287
1288#endif /* UFS_ACL */
1289
1290/*
1291 * Mkdir system call
1292 */
1293static int
1294ext2_mkdir(struct vop_mkdir_args *ap)
1295{
1296	struct m_ext2fs *fs;
1297	struct vnode *dvp = ap->a_dvp;
1298	struct vattr *vap = ap->a_vap;
1299	struct componentname *cnp = ap->a_cnp;
1300	struct inode *ip, *dp;
1301	struct vnode *tvp;
1302	struct dirtemplate dirtemplate, *dtp;
1303	char *buf = NULL;
1304	int error, dmode;
1305
1306#ifdef INVARIANTS
1307	if ((cnp->cn_flags & HASBUF) == 0)
1308		panic("ext2_mkdir: no name");
1309#endif
1310	dp = VTOI(dvp);
1311	if ((nlink_t)dp->i_nlink >= EXT4_LINK_MAX &&
1312	    !EXT2_HAS_RO_COMPAT_FEATURE(dp->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) {
1313		error = EMLINK;
1314		goto out;
1315	}
1316	dmode = vap->va_mode & 0777;
1317	dmode |= IFDIR;
1318	/*
1319	 * Must simulate part of ext2_makeinode here to acquire the inode,
1320	 * but not have it entered in the parent directory. The entry is
1321	 * made later after writing "." and ".." entries.
1322	 */
1323	error = ext2_valloc(dvp, dmode, cnp->cn_cred, &tvp);
1324	if (error)
1325		goto out;
1326	ip = VTOI(tvp);
1327	fs = ip->i_e2fs;
1328	ip->i_gid = dp->i_gid;
1329#ifdef SUIDDIR
1330	{
1331		/*
1332		 * if we are hacking owners here, (only do this where told to)
1333		 * and we are not giving it TOO root, (would subvert quotas)
1334		 * then go ahead and give it to the other user.
1335		 * The new directory also inherits the SUID bit.
1336		 * If user's UID and dir UID are the same,
1337		 * 'give it away' so that the SUID is still forced on.
1338		 */
1339		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
1340		    (dp->i_mode & ISUID) && dp->i_uid) {
1341			dmode |= ISUID;
1342			ip->i_uid = dp->i_uid;
1343		} else {
1344			ip->i_uid = cnp->cn_cred->cr_uid;
1345		}
1346	}
1347#else
1348	ip->i_uid = cnp->cn_cred->cr_uid;
1349#endif
1350	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
1351	ip->i_mode = dmode;
1352	tvp->v_type = VDIR;	/* Rest init'd in getnewvnode(). */
1353	ip->i_nlink = 2;
1354	if (cnp->cn_flags & ISWHITEOUT)
1355		ip->i_flags |= UF_OPAQUE;
1356	error = ext2_update(tvp, 1);
1357
1358	/*
1359	 * Bump link count in parent directory
1360	 * to reflect work done below.  Should
1361	 * be done before reference is created
1362	 * so reparation is possible if we crash.
1363	 */
1364	ext2_inc_nlink(dp);
1365	dp->i_flag |= IN_CHANGE;
1366	error = ext2_update(dvp, !DOINGASYNC(dvp));
1367	if (error)
1368		goto bad;
1369
1370	/* Initialize directory with "." and ".." from static template. */
1371	if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs,
1372	    EXT2F_INCOMPAT_FTYPE))
1373		dtp = &mastertemplate;
1374	else
1375		dtp = &omastertemplate;
1376	dirtemplate = *dtp;
1377	dirtemplate.dot_ino = ip->i_number;
1378	dirtemplate.dotdot_ino = dp->i_number;
1379	/*
1380	 * note that in ext2 DIRBLKSIZ == blocksize, not DEV_BSIZE so let's
1381	 * just redefine it - for this function only
1382	 */
1383#undef  DIRBLKSIZ
1384#define DIRBLKSIZ  VTOI(dvp)->i_e2fs->e2fs_bsize
1385	dirtemplate.dotdot_reclen = DIRBLKSIZ - 12;
1386	buf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK | M_ZERO);
1387	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) {
1388		dirtemplate.dotdot_reclen -= sizeof(struct ext2fs_direct_tail);
1389		ext2_init_dirent_tail(EXT2_DIRENT_TAIL(buf, DIRBLKSIZ));
1390	}
1391	memcpy(buf, &dirtemplate, sizeof(dirtemplate));
1392	ext2_dirent_csum_set(ip, (struct ext2fs_direct_2 *)buf);
1393	error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)buf,
1394	    DIRBLKSIZ, (off_t)0, UIO_SYSSPACE,
1395	    IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, cnp->cn_cred, NOCRED,
1396	    NULL, NULL);
1397	if (error) {
1398		ext2_dec_nlink(dp);
1399		dp->i_flag |= IN_CHANGE;
1400		goto bad;
1401	}
1402	if (DIRBLKSIZ > VFSTOEXT2(dvp->v_mount)->um_mountp->mnt_stat.f_bsize)
1403		/* XXX should grow with balloc() */
1404		panic("ext2_mkdir: blksize");
1405	else {
1406		ip->i_size = DIRBLKSIZ;
1407		ip->i_flag |= IN_CHANGE;
1408	}
1409
1410#ifdef UFS_ACL
1411	if (dvp->v_mount->mnt_flag & MNT_ACLS) {
1412		error = ext2_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode,
1413		    cnp->cn_cred, cnp->cn_thread);
1414		if (error)
1415			goto bad;
1416	}
1417
1418#endif /* UFS_ACL */
1419
1420	/* Directory set up, now install its entry in the parent directory. */
1421	error = ext2_direnter(ip, dvp, cnp);
1422	if (error) {
1423		ext2_dec_nlink(dp);
1424		dp->i_flag |= IN_CHANGE;
1425	}
1426bad:
1427	/*
1428	 * No need to do an explicit VOP_TRUNCATE here, vrele will do this
1429	 * for us because we set the link count to 0.
1430	 */
1431	if (error) {
1432		ip->i_nlink = 0;
1433		ip->i_flag |= IN_CHANGE;
1434		vput(tvp);
1435	} else
1436		*ap->a_vpp = tvp;
1437out:
1438	free(buf, M_TEMP);
1439	return (error);
1440#undef  DIRBLKSIZ
1441#define DIRBLKSIZ  DEV_BSIZE
1442}
1443
1444/*
1445 * Rmdir system call.
1446 */
1447static int
1448ext2_rmdir(struct vop_rmdir_args *ap)
1449{
1450	struct vnode *vp = ap->a_vp;
1451	struct vnode *dvp = ap->a_dvp;
1452	struct componentname *cnp = ap->a_cnp;
1453	struct inode *ip, *dp;
1454	int error;
1455
1456	ip = VTOI(vp);
1457	dp = VTOI(dvp);
1458
1459	/*
1460	 * Verify the directory is empty (and valid).
1461	 * (Rmdir ".." won't be valid since
1462	 *  ".." will contain a reference to
1463	 *  the current directory and thus be
1464	 *  non-empty.)
1465	 */
1466	if (!ext2_dirempty(ip, dp->i_number, cnp->cn_cred)) {
1467		error = ENOTEMPTY;
1468		goto out;
1469	}
1470	if ((dp->i_flags & APPEND)
1471	    || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
1472		error = EPERM;
1473		goto out;
1474	}
1475	/*
1476	 * Delete reference to directory before purging
1477	 * inode.  If we crash in between, the directory
1478	 * will be reattached to lost+found,
1479	 */
1480	error = ext2_dirremove(dvp, cnp);
1481	if (error)
1482		goto out;
1483	ext2_dec_nlink(dp);
1484	dp->i_flag |= IN_CHANGE;
1485	cache_purge(dvp);
1486	VOP_UNLOCK(dvp, 0);
1487	/*
1488	 * Truncate inode.  The only stuff left
1489	 * in the directory is "." and "..".
1490	 */
1491	ip->i_nlink = 0;
1492	error = ext2_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred,
1493	    cnp->cn_thread);
1494	cache_purge(ITOV(ip));
1495	if (vn_lock(dvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
1496		VOP_UNLOCK(vp, 0);
1497		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
1498		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1499	}
1500out:
1501	return (error);
1502}
1503
1504/*
1505 * symlink -- make a symbolic link
1506 */
1507static int
1508ext2_symlink(struct vop_symlink_args *ap)
1509{
1510	struct vnode *vp, **vpp = ap->a_vpp;
1511	struct inode *ip;
1512	int len, error;
1513
1514	error = ext2_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
1515	    vpp, ap->a_cnp);
1516	if (error)
1517		return (error);
1518	vp = *vpp;
1519	len = strlen(ap->a_target);
1520	if (len < vp->v_mount->mnt_maxsymlinklen) {
1521		ip = VTOI(vp);
1522		bcopy(ap->a_target, (char *)ip->i_shortlink, len);
1523		ip->i_size = len;
1524		ip->i_flag |= IN_CHANGE | IN_UPDATE;
1525	} else
1526		error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0,
1527		    UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
1528		    ap->a_cnp->cn_cred, NOCRED, NULL, NULL);
1529	if (error)
1530		vput(vp);
1531	return (error);
1532}
1533
1534/*
1535 * Return target name of a symbolic link
1536 */
1537static int
1538ext2_readlink(struct vop_readlink_args *ap)
1539{
1540	struct vnode *vp = ap->a_vp;
1541	struct inode *ip = VTOI(vp);
1542	int isize;
1543
1544	isize = ip->i_size;
1545	if (isize < vp->v_mount->mnt_maxsymlinklen) {
1546		uiomove((char *)ip->i_shortlink, isize, ap->a_uio);
1547		return (0);
1548	}
1549	return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
1550}
1551
1552/*
1553 * Calculate the logical to physical mapping if not done already,
1554 * then call the device strategy routine.
1555 *
1556 * In order to be able to swap to a file, the ext2_bmaparray() operation may not
1557 * deadlock on memory.  See ext2_bmap() for details.
1558 */
1559static int
1560ext2_strategy(struct vop_strategy_args *ap)
1561{
1562	struct buf *bp = ap->a_bp;
1563	struct vnode *vp = ap->a_vp;
1564	struct bufobj *bo;
1565	daddr_t blkno;
1566	int error;
1567
1568	if (vp->v_type == VBLK || vp->v_type == VCHR)
1569		panic("ext2_strategy: spec");
1570	if (bp->b_blkno == bp->b_lblkno) {
1571
1572		if (VTOI(ap->a_vp)->i_flag & IN_E4EXTENTS)
1573			error = ext4_bmapext(vp, bp->b_lblkno, &blkno, NULL, NULL);
1574		else
1575			error = ext2_bmaparray(vp, bp->b_lblkno, &blkno, NULL, NULL);
1576
1577		bp->b_blkno = blkno;
1578		if (error) {
1579			bp->b_error = error;
1580			bp->b_ioflags |= BIO_ERROR;
1581			bufdone(bp);
1582			return (0);
1583		}
1584		if ((long)bp->b_blkno == -1)
1585			vfs_bio_clrbuf(bp);
1586	}
1587	if ((long)bp->b_blkno == -1) {
1588		bufdone(bp);
1589		return (0);
1590	}
1591	bp->b_iooffset = dbtob(bp->b_blkno);
1592	bo = VFSTOEXT2(vp->v_mount)->um_bo;
1593	BO_STRATEGY(bo, bp);
1594	return (0);
1595}
1596
1597/*
1598 * Print out the contents of an inode.
1599 */
1600static int
1601ext2_print(struct vop_print_args *ap)
1602{
1603	struct vnode *vp = ap->a_vp;
1604	struct inode *ip = VTOI(vp);
1605
1606	vn_printf(ip->i_devvp, "\tino %ju", (uintmax_t)ip->i_number);
1607	if (vp->v_type == VFIFO)
1608		fifo_printinfo(vp);
1609	printf("\n");
1610	return (0);
1611}
1612
1613/*
1614 * Close wrapper for fifos.
1615 *
1616 * Update the times on the inode then do device close.
1617 */
1618static int
1619ext2fifo_close(struct vop_close_args *ap)
1620{
1621	struct vnode *vp = ap->a_vp;
1622
1623	VI_LOCK(vp);
1624	if (vp->v_usecount > 1)
1625		ext2_itimes_locked(vp);
1626	VI_UNLOCK(vp);
1627	return (fifo_specops.vop_close(ap));
1628}
1629
1630/*
1631 * Kqfilter wrapper for fifos.
1632 *
1633 * Fall through to ext2 kqfilter routines if needed
1634 */
1635static int
1636ext2fifo_kqfilter(struct vop_kqfilter_args *ap)
1637{
1638	int error;
1639
1640	error = fifo_specops.vop_kqfilter(ap);
1641	if (error)
1642		error = vfs_kqfilter(ap);
1643	return (error);
1644}
1645
1646/*
1647 * Return POSIX pathconf information applicable to ext2 filesystems.
1648 */
1649static int
1650ext2_pathconf(struct vop_pathconf_args *ap)
1651{
1652	int error = 0;
1653
1654	switch (ap->a_name) {
1655	case _PC_LINK_MAX:
1656		if (EXT2_HAS_RO_COMPAT_FEATURE(VTOI(ap->a_vp)->i_e2fs,
1657		    EXT2F_ROCOMPAT_DIR_NLINK))
1658			*ap->a_retval = INT_MAX;
1659		else
1660			*ap->a_retval = EXT4_LINK_MAX;
1661		break;
1662	case _PC_NAME_MAX:
1663		*ap->a_retval = NAME_MAX;
1664		break;
1665	case _PC_PIPE_BUF:
1666		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO)
1667			*ap->a_retval = PIPE_BUF;
1668		else
1669			error = EINVAL;
1670		break;
1671	case _PC_CHOWN_RESTRICTED:
1672		*ap->a_retval = 1;
1673		break;
1674	case _PC_NO_TRUNC:
1675		*ap->a_retval = 1;
1676		break;
1677
1678#ifdef UFS_ACL
1679	case _PC_ACL_EXTENDED:
1680		if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS)
1681			*ap->a_retval = 1;
1682		else
1683			*ap->a_retval = 0;
1684		break;
1685	case _PC_ACL_PATH_MAX:
1686		if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS)
1687			*ap->a_retval = ACL_MAX_ENTRIES;
1688		else
1689			*ap->a_retval = 3;
1690		break;
1691#endif /* UFS_ACL */
1692
1693	case _PC_MIN_HOLE_SIZE:
1694		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
1695		break;
1696	case _PC_PRIO_IO:
1697		*ap->a_retval = 0;
1698		break;
1699	case _PC_SYNC_IO:
1700		*ap->a_retval = 0;
1701		break;
1702	case _PC_ALLOC_SIZE_MIN:
1703		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize;
1704		break;
1705	case _PC_FILESIZEBITS:
1706		*ap->a_retval = 64;
1707		break;
1708	case _PC_REC_INCR_XFER_SIZE:
1709		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
1710		break;
1711	case _PC_REC_MAX_XFER_SIZE:
1712		*ap->a_retval = -1;	/* means ``unlimited'' */
1713		break;
1714	case _PC_REC_MIN_XFER_SIZE:
1715		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
1716		break;
1717	case _PC_REC_XFER_ALIGN:
1718		*ap->a_retval = PAGE_SIZE;
1719		break;
1720	case _PC_SYMLINK_MAX:
1721		*ap->a_retval = MAXPATHLEN;
1722		break;
1723
1724	default:
1725		error = vop_stdpathconf(ap);
1726		break;
1727	}
1728	return (error);
1729}
1730
1731/*
1732 * Vnode operation to remove a named attribute.
1733 */
1734static int
1735ext2_deleteextattr(struct vop_deleteextattr_args *ap)
1736{
1737	struct inode *ip;
1738	struct m_ext2fs *fs;
1739	int error;
1740
1741	ip = VTOI(ap->a_vp);
1742	fs = ip->i_e2fs;
1743
1744	if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR))
1745		return (EOPNOTSUPP);
1746
1747	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1748		return (EOPNOTSUPP);
1749
1750	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1751	    ap->a_cred, ap->a_td, VWRITE);
1752	if (error)
1753		return (error);
1754
1755	error = ENOATTR;
1756
1757	if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) {
1758		error = ext2_extattr_inode_delete(ip, ap->a_attrnamespace, ap->a_name);
1759		if (error != ENOATTR)
1760			return (error);
1761	}
1762
1763	if (ip->i_facl)
1764		error = ext2_extattr_block_delete(ip, ap->a_attrnamespace, ap->a_name);
1765
1766	return (error);
1767}
1768
1769/*
1770 * Vnode operation to retrieve a named extended attribute.
1771 */
1772static int
1773ext2_getextattr(struct vop_getextattr_args *ap)
1774{
1775	struct inode *ip;
1776	struct m_ext2fs *fs;
1777	int error;
1778
1779	ip = VTOI(ap->a_vp);
1780	fs = ip->i_e2fs;
1781
1782	if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR))
1783		return (EOPNOTSUPP);
1784
1785	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1786		return (EOPNOTSUPP);
1787
1788	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1789	    ap->a_cred, ap->a_td, VREAD);
1790	if (error)
1791		return (error);
1792
1793	if (ap->a_size != NULL)
1794		*ap->a_size = 0;
1795
1796	error = ENOATTR;
1797
1798	if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) {
1799		error = ext2_extattr_inode_get(ip, ap->a_attrnamespace,
1800		    ap->a_name, ap->a_uio, ap->a_size);
1801		if (error != ENOATTR)
1802			return (error);
1803	}
1804
1805	if (ip->i_facl)
1806		error = ext2_extattr_block_get(ip, ap->a_attrnamespace,
1807		    ap->a_name, ap->a_uio, ap->a_size);
1808
1809	return (error);
1810}
1811
1812/*
1813 * Vnode operation to retrieve extended attributes on a vnode.
1814 */
1815static int
1816ext2_listextattr(struct vop_listextattr_args *ap)
1817{
1818	struct inode *ip;
1819	struct m_ext2fs *fs;
1820	int error;
1821
1822	ip = VTOI(ap->a_vp);
1823	fs = ip->i_e2fs;
1824
1825	if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR))
1826		return (EOPNOTSUPP);
1827
1828	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1829		return (EOPNOTSUPP);
1830
1831	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1832	    ap->a_cred, ap->a_td, VREAD);
1833	if (error)
1834		return (error);
1835
1836	if (ap->a_size != NULL)
1837		*ap->a_size = 0;
1838
1839	if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) {
1840		error = ext2_extattr_inode_list(ip, ap->a_attrnamespace,
1841		    ap->a_uio, ap->a_size);
1842		if (error)
1843			return (error);
1844	}
1845
1846	if (ip->i_facl)
1847		error = ext2_extattr_block_list(ip, ap->a_attrnamespace,
1848		    ap->a_uio, ap->a_size);
1849
1850	return (error);
1851}
1852
1853/*
1854 * Vnode operation to set a named attribute.
1855 */
1856static int
1857ext2_setextattr(struct vop_setextattr_args *ap)
1858{
1859	struct inode *ip;
1860	struct m_ext2fs *fs;
1861	int error;
1862
1863	ip = VTOI(ap->a_vp);
1864	fs = ip->i_e2fs;
1865
1866	if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR))
1867		return (EOPNOTSUPP);
1868
1869	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1870		return (EOPNOTSUPP);
1871
1872	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1873	    ap->a_cred, ap->a_td, VWRITE);
1874	if (error)
1875		return (error);
1876
1877	error = ext2_extattr_valid_attrname(ap->a_attrnamespace, ap->a_name);
1878	if (error)
1879		return (error);
1880
1881	if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) {
1882		error = ext2_extattr_inode_set(ip, ap->a_attrnamespace,
1883		    ap->a_name, ap->a_uio);
1884		if (error != ENOSPC)
1885			return (error);
1886	}
1887
1888	error = ext2_extattr_block_set(ip, ap->a_attrnamespace,
1889	    ap->a_name, ap->a_uio);
1890
1891	return (error);
1892}
1893
1894/*
1895 * Vnode pointer to File handle
1896 */
1897/* ARGSUSED */
1898static int
1899ext2_vptofh(struct vop_vptofh_args *ap)
1900{
1901	struct inode *ip;
1902	struct ufid *ufhp;
1903
1904	ip = VTOI(ap->a_vp);
1905	ufhp = (struct ufid *)ap->a_fhp;
1906	ufhp->ufid_len = sizeof(struct ufid);
1907	ufhp->ufid_ino = ip->i_number;
1908	ufhp->ufid_gen = ip->i_gen;
1909	return (0);
1910}
1911
1912/*
1913 * Initialize the vnode associated with a new inode, handle aliased
1914 * vnodes.
1915 */
1916int
1917ext2_vinit(struct mount *mntp, struct vop_vector *fifoops, struct vnode **vpp)
1918{
1919	struct inode *ip;
1920	struct vnode *vp;
1921
1922	vp = *vpp;
1923	ip = VTOI(vp);
1924	vp->v_type = IFTOVT(ip->i_mode);
1925	/*
1926	 * Only unallocated inodes should be of type VNON.
1927	 */
1928	if (ip->i_mode != 0 && vp->v_type == VNON)
1929		return (EINVAL);
1930	if (vp->v_type == VFIFO)
1931		vp->v_op = fifoops;
1932
1933	if (ip->i_number == EXT2_ROOTINO)
1934		vp->v_vflag |= VV_ROOT;
1935	ip->i_modrev = init_va_filerev();
1936	*vpp = vp;
1937	return (0);
1938}
1939
1940/*
1941 * Allocate a new inode.
1942 */
1943static int
1944ext2_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
1945    struct componentname *cnp)
1946{
1947	struct inode *ip, *pdir;
1948	struct vnode *tvp;
1949	int error;
1950
1951	pdir = VTOI(dvp);
1952#ifdef INVARIANTS
1953	if ((cnp->cn_flags & HASBUF) == 0)
1954		panic("ext2_makeinode: no name");
1955#endif
1956	*vpp = NULL;
1957	if ((mode & IFMT) == 0)
1958		mode |= IFREG;
1959
1960	error = ext2_valloc(dvp, mode, cnp->cn_cred, &tvp);
1961	if (error) {
1962		return (error);
1963	}
1964	ip = VTOI(tvp);
1965	ip->i_gid = pdir->i_gid;
1966#ifdef SUIDDIR
1967	{
1968		/*
1969		 * if we are
1970		 * not the owner of the directory,
1971		 * and we are hacking owners here, (only do this where told to)
1972		 * and we are not giving it TOO root, (would subvert quotas)
1973		 * then go ahead and give it to the other user.
1974		 * Note that this drops off the execute bits for security.
1975		 */
1976		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
1977		    (pdir->i_mode & ISUID) &&
1978		    (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) {
1979			ip->i_uid = pdir->i_uid;
1980			mode &= ~07111;
1981		} else {
1982			ip->i_uid = cnp->cn_cred->cr_uid;
1983		}
1984	}
1985#else
1986	ip->i_uid = cnp->cn_cred->cr_uid;
1987#endif
1988	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
1989	ip->i_mode = mode;
1990	tvp->v_type = IFTOVT(mode);	/* Rest init'd in getnewvnode(). */
1991	ip->i_nlink = 1;
1992	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred)) {
1993		if (priv_check_cred(cnp->cn_cred, PRIV_VFS_RETAINSUGID, 0))
1994			ip->i_mode &= ~ISGID;
1995	}
1996
1997	if (cnp->cn_flags & ISWHITEOUT)
1998		ip->i_flags |= UF_OPAQUE;
1999
2000	/*
2001	 * Make sure inode goes to disk before directory entry.
2002	 */
2003	error = ext2_update(tvp, !DOINGASYNC(tvp));
2004	if (error)
2005		goto bad;
2006
2007#ifdef UFS_ACL
2008	if (dvp->v_mount->mnt_flag & MNT_ACLS) {
2009		error = ext2_do_posix1e_acl_inheritance_file(dvp, tvp, mode,
2010		    cnp->cn_cred, cnp->cn_thread);
2011		if (error)
2012			goto bad;
2013	}
2014#endif /* UFS_ACL */
2015
2016	error = ext2_direnter(ip, dvp, cnp);
2017	if (error)
2018		goto bad;
2019
2020	*vpp = tvp;
2021	return (0);
2022
2023bad:
2024	/*
2025	 * Write error occurred trying to update the inode
2026	 * or the directory so must deallocate the inode.
2027	 */
2028	ip->i_nlink = 0;
2029	ip->i_flag |= IN_CHANGE;
2030	vput(tvp);
2031	return (error);
2032}
2033
2034/*
2035 * Vnode op for reading.
2036 */
2037static int
2038ext2_read(struct vop_read_args *ap)
2039{
2040	struct vnode *vp;
2041	struct inode *ip;
2042	struct uio *uio;
2043	struct m_ext2fs *fs;
2044	struct buf *bp;
2045	daddr_t lbn, nextlbn;
2046	off_t bytesinfile;
2047	long size, xfersize, blkoffset;
2048	int error, orig_resid, seqcount;
2049	int ioflag;
2050
2051	vp = ap->a_vp;
2052	uio = ap->a_uio;
2053	ioflag = ap->a_ioflag;
2054
2055	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
2056	ip = VTOI(vp);
2057
2058#ifdef INVARIANTS
2059	if (uio->uio_rw != UIO_READ)
2060		panic("%s: mode", "ext2_read");
2061
2062	if (vp->v_type == VLNK) {
2063		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
2064			panic("%s: short symlink", "ext2_read");
2065	} else if (vp->v_type != VREG && vp->v_type != VDIR)
2066		panic("%s: type %d", "ext2_read", vp->v_type);
2067#endif
2068	orig_resid = uio->uio_resid;
2069	KASSERT(orig_resid >= 0, ("ext2_read: uio->uio_resid < 0"));
2070	if (orig_resid == 0)
2071		return (0);
2072	KASSERT(uio->uio_offset >= 0, ("ext2_read: uio->uio_offset < 0"));
2073	fs = ip->i_e2fs;
2074	if (uio->uio_offset < ip->i_size &&
2075	    uio->uio_offset >= fs->e2fs_maxfilesize)
2076		return (EOVERFLOW);
2077
2078	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
2079		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
2080			break;
2081		lbn = lblkno(fs, uio->uio_offset);
2082		nextlbn = lbn + 1;
2083		size = blksize(fs, ip, lbn);
2084		blkoffset = blkoff(fs, uio->uio_offset);
2085
2086		xfersize = fs->e2fs_fsize - blkoffset;
2087		if (uio->uio_resid < xfersize)
2088			xfersize = uio->uio_resid;
2089		if (bytesinfile < xfersize)
2090			xfersize = bytesinfile;
2091
2092		if (lblktosize(fs, nextlbn) >= ip->i_size)
2093			error = bread(vp, lbn, size, NOCRED, &bp);
2094		else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
2095			error = cluster_read(vp, ip->i_size, lbn, size,
2096			    NOCRED, blkoffset + uio->uio_resid, seqcount,
2097			    0, &bp);
2098		} else if (seqcount > 1) {
2099			u_int nextsize = blksize(fs, ip, nextlbn);
2100
2101			error = breadn(vp, lbn,
2102			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
2103		} else
2104			error = bread(vp, lbn, size, NOCRED, &bp);
2105		if (error) {
2106			brelse(bp);
2107			bp = NULL;
2108			break;
2109		}
2110
2111		/*
2112		 * We should only get non-zero b_resid when an I/O error
2113		 * has occurred, which should cause us to break above.
2114		 * However, if the short read did not cause an error,
2115		 * then we want to ensure that we do not uiomove bad
2116		 * or uninitialized data.
2117		 */
2118		size -= bp->b_resid;
2119		if (size < xfersize) {
2120			if (size == 0)
2121				break;
2122			xfersize = size;
2123		}
2124		error = uiomove((char *)bp->b_data + blkoffset,
2125		    (int)xfersize, uio);
2126		if (error)
2127			break;
2128		vfs_bio_brelse(bp, ioflag);
2129	}
2130
2131	/*
2132	 * This can only happen in the case of an error because the loop
2133	 * above resets bp to NULL on each iteration and on normal
2134	 * completion has not set a new value into it. so it must have come
2135	 * from a 'break' statement
2136	 */
2137	if (bp != NULL)
2138		vfs_bio_brelse(bp, ioflag);
2139
2140	if ((error == 0 || uio->uio_resid != orig_resid) &&
2141	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
2142		ip->i_flag |= IN_ACCESS;
2143	return (error);
2144}
2145
2146static int
2147ext2_ioctl(struct vop_ioctl_args *ap)
2148{
2149	struct vnode *vp;
2150	int error;
2151
2152	vp = ap->a_vp;
2153	switch (ap->a_command) {
2154	case FIOSEEKDATA:
2155		if (!(VTOI(vp)->i_flag & IN_E4EXTENTS)) {
2156			error = vn_lock(vp, LK_SHARED);
2157			if (error == 0) {
2158				error = ext2_bmap_seekdata(vp,
2159				    (off_t *)ap->a_data);
2160				VOP_UNLOCK(vp, 0);
2161			} else
2162				error = EBADF;
2163			return (error);
2164		}
2165	case FIOSEEKHOLE:
2166		return (vn_bmap_seekhole(vp, ap->a_command,
2167		    (off_t *)ap->a_data, ap->a_cred));
2168	default:
2169		return (ENOTTY);
2170	}
2171}
2172
2173/*
2174 * Vnode op for writing.
2175 */
2176static int
2177ext2_write(struct vop_write_args *ap)
2178{
2179	struct vnode *vp;
2180	struct uio *uio;
2181	struct inode *ip;
2182	struct m_ext2fs *fs;
2183	struct buf *bp;
2184	daddr_t lbn;
2185	off_t osize;
2186	int blkoffset, error, flags, ioflag, resid, size, seqcount, xfersize;
2187
2188	ioflag = ap->a_ioflag;
2189	uio = ap->a_uio;
2190	vp = ap->a_vp;
2191
2192	seqcount = ioflag >> IO_SEQSHIFT;
2193	ip = VTOI(vp);
2194
2195#ifdef INVARIANTS
2196	if (uio->uio_rw != UIO_WRITE)
2197		panic("%s: mode", "ext2_write");
2198#endif
2199
2200	switch (vp->v_type) {
2201	case VREG:
2202		if (ioflag & IO_APPEND)
2203			uio->uio_offset = ip->i_size;
2204		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
2205			return (EPERM);
2206		/* FALLTHROUGH */
2207	case VLNK:
2208		break;
2209	case VDIR:
2210		/* XXX differs from ffs -- this is called from ext2_mkdir(). */
2211		if ((ioflag & IO_SYNC) == 0)
2212			panic("ext2_write: nonsync dir write");
2213		break;
2214	default:
2215		panic("ext2_write: type %p %d (%jd,%jd)", (void *)vp,
2216		    vp->v_type, (intmax_t)uio->uio_offset,
2217		    (intmax_t)uio->uio_resid);
2218	}
2219
2220	KASSERT(uio->uio_resid >= 0, ("ext2_write: uio->uio_resid < 0"));
2221	KASSERT(uio->uio_offset >= 0, ("ext2_write: uio->uio_offset < 0"));
2222	fs = ip->i_e2fs;
2223	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->e2fs_maxfilesize)
2224		return (EFBIG);
2225	/*
2226	 * Maybe this should be above the vnode op call, but so long as
2227	 * file servers have no limits, I don't think it matters.
2228	 */
2229	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
2230		return (EFBIG);
2231
2232	resid = uio->uio_resid;
2233	osize = ip->i_size;
2234	if (seqcount > BA_SEQMAX)
2235		flags = BA_SEQMAX << BA_SEQSHIFT;
2236	else
2237		flags = seqcount << BA_SEQSHIFT;
2238	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
2239		flags |= IO_SYNC;
2240
2241	for (error = 0; uio->uio_resid > 0;) {
2242		lbn = lblkno(fs, uio->uio_offset);
2243		blkoffset = blkoff(fs, uio->uio_offset);
2244		xfersize = fs->e2fs_fsize - blkoffset;
2245		if (uio->uio_resid < xfersize)
2246			xfersize = uio->uio_resid;
2247		if (uio->uio_offset + xfersize > ip->i_size)
2248			vnode_pager_setsize(vp, uio->uio_offset + xfersize);
2249
2250		/*
2251		 * We must perform a read-before-write if the transfer size
2252		 * does not cover the entire buffer.
2253		 */
2254		if (fs->e2fs_bsize > xfersize)
2255			flags |= BA_CLRBUF;
2256		else
2257			flags &= ~BA_CLRBUF;
2258		error = ext2_balloc(ip, lbn, blkoffset + xfersize,
2259		    ap->a_cred, &bp, flags);
2260		if (error != 0)
2261			break;
2262
2263		if ((ioflag & (IO_SYNC | IO_INVAL)) == (IO_SYNC | IO_INVAL))
2264			bp->b_flags |= B_NOCACHE;
2265		if (uio->uio_offset + xfersize > ip->i_size)
2266			ip->i_size = uio->uio_offset + xfersize;
2267		size = blksize(fs, ip, lbn) - bp->b_resid;
2268		if (size < xfersize)
2269			xfersize = size;
2270
2271		error =
2272		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
2273		/*
2274		 * If the buffer is not already filled and we encounter an
2275		 * error while trying to fill it, we have to clear out any
2276		 * garbage data from the pages instantiated for the buffer.
2277		 * If we do not, a failed uiomove() during a write can leave
2278		 * the prior contents of the pages exposed to a userland mmap.
2279		 *
2280		 * Note that we need only clear buffers with a transfer size
2281		 * equal to the block size because buffers with a shorter
2282		 * transfer size were cleared above by the call to ext2_balloc()
2283		 * with the BA_CLRBUF flag set.
2284		 *
2285		 * If the source region for uiomove identically mmaps the
2286		 * buffer, uiomove() performed the NOP copy, and the buffer
2287		 * content remains valid because the page fault handler
2288		 * validated the pages.
2289		 */
2290		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
2291		    fs->e2fs_bsize == xfersize)
2292			vfs_bio_clrbuf(bp);
2293
2294		vfs_bio_set_flags(bp, ioflag);
2295
2296		/*
2297		 * If IO_SYNC each buffer is written synchronously.  Otherwise
2298		 * if we have a severe page deficiency write the buffer
2299		 * asynchronously.  Otherwise try to cluster, and if that
2300		 * doesn't do it then either do an async write (if O_DIRECT),
2301		 * or a delayed write (if not).
2302		 */
2303		if (ioflag & IO_SYNC) {
2304			(void)bwrite(bp);
2305		} else if (vm_page_count_severe() ||
2306			    buf_dirty_count_severe() ||
2307		    (ioflag & IO_ASYNC)) {
2308			bp->b_flags |= B_CLUSTEROK;
2309			bawrite(bp);
2310		} else if (xfersize + blkoffset == fs->e2fs_fsize) {
2311			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
2312				bp->b_flags |= B_CLUSTEROK;
2313				cluster_write(vp, bp, ip->i_size, seqcount, 0);
2314			} else {
2315				bawrite(bp);
2316			}
2317		} else if (ioflag & IO_DIRECT) {
2318			bp->b_flags |= B_CLUSTEROK;
2319			bawrite(bp);
2320		} else {
2321			bp->b_flags |= B_CLUSTEROK;
2322			bdwrite(bp);
2323		}
2324		if (error || xfersize == 0)
2325			break;
2326	}
2327	/*
2328	 * If we successfully wrote any data, and we are not the superuser
2329	 * we clear the setuid and setgid bits as a precaution against
2330	 * tampering.
2331	 */
2332	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
2333	    ap->a_cred) {
2334		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0))
2335			ip->i_mode &= ~(ISUID | ISGID);
2336	}
2337	if (error) {
2338		if (ioflag & IO_UNIT) {
2339			(void)ext2_truncate(vp, osize,
2340			    ioflag & IO_SYNC, ap->a_cred, uio->uio_td);
2341			uio->uio_offset -= resid - uio->uio_resid;
2342			uio->uio_resid = resid;
2343		}
2344	}
2345	if (uio->uio_resid != resid) {
2346		ip->i_flag |= IN_CHANGE | IN_UPDATE;
2347		if (ioflag & IO_SYNC)
2348			error = ext2_update(vp, 1);
2349	}
2350	return (error);
2351}
2352