vfs_vnops.c revision 108897
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
39 * $FreeBSD: head/sys/kern/vfs_vnops.c 108897 2003-01-07 20:59:55Z green $
40 */
41
42#include "opt_mac.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/fcntl.h>
47#include <sys/file.h>
48#include <sys/stat.h>
49#include <sys/proc.h>
50#include <sys/lock.h>
51#include <sys/mac.h>
52#include <sys/mount.h>
53#include <sys/mutex.h>
54#include <sys/namei.h>
55#include <sys/vnode.h>
56#include <sys/bio.h>
57#include <sys/buf.h>
58#include <sys/filio.h>
59#include <sys/sx.h>
60#include <sys/ttycom.h>
61#include <sys/conf.h>
62#include <sys/syslog.h>
63
64#include <machine/limits.h>
65
66static fo_rdwr_t	vn_read;
67static fo_rdwr_t	vn_write;
68static fo_ioctl_t	vn_ioctl;
69static fo_poll_t	vn_poll;
70static fo_kqfilter_t	vn_kqfilter;
71static fo_stat_t	vn_statfile;
72static fo_close_t	vn_closefile;
73
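/*
 * File operations vector for vnode-backed descriptors.  The handlers are
 * listed positionally in the order the members are declared in struct
 * fileops: read, write, ioctl, poll, kqfilter, stat, close.
 */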
74struct 	fileops vnops = {
75	vn_read, vn_write, vn_ioctl, vn_poll, vn_kqfilter,
76	vn_statfile, vn_closefile
77};
78
79int
80vn_open(ndp, flagp, cmode)
81	register struct nameidata *ndp;
82	int *flagp, cmode;
83{
84	struct thread *td = ndp->ni_cnd.cn_thread;
85
86	return (vn_open_cred(ndp, flagp, cmode, td->td_ucred));
87}
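/*
 * Illustrative only (variable names are hypothetical): a typical kernel
 * caller sets up a nameidata and lets vn_open() drive the lookup, much as
 * the open(2) implementation does.
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td);
 *	flags = FREAD;
 *	if ((error = vn_open(&nd, &flags, 0)) == 0) {
 *		NDFREE(&nd, NDF_ONLY_PNBUF);
 *		... nd.ni_vp is returned locked and referenced ...
 *		VOP_UNLOCK(nd.ni_vp, 0, td);
 *	}
 */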
88
89/*
90 * Common code for vnode open operations.
91 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
92 *
93 * Note that this does NOT free nameidata for the successful case,
94 * due to the NDINIT being done elsewhere.
95 */
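/*
 * On success the vnode is returned locked and referenced in ndp->ni_vp,
 * v_writecount has been bumped if FWRITE was requested, and *flagp is
 * updated to the open mode actually used.
 */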
96int
97vn_open_cred(ndp, flagp, cmode, cred)
98	register struct nameidata *ndp;
99	int *flagp, cmode;
100	struct ucred *cred;
101{
102	struct vnode *vp;
103	struct mount *mp;
104	struct thread *td = ndp->ni_cnd.cn_thread;
105	struct vattr vat;
106	struct vattr *vap = &vat;
107	int mode, fmode, error;
108#ifdef LOOKUP_SHARED
109	int exclusive;	/* The current intended lock state */
110
111	exclusive = 0;
112#endif
113
114restart:
115	fmode = *flagp;
116	if (fmode & O_CREAT) {
117		ndp->ni_cnd.cn_nameiop = CREATE;
118		ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
119		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
120			ndp->ni_cnd.cn_flags |= FOLLOW;
121		bwillwrite();
122		if ((error = namei(ndp)) != 0)
123			return (error);
124		if (ndp->ni_vp == NULL) {
125			VATTR_NULL(vap);
126			vap->va_type = VREG;
127			vap->va_mode = cmode;
128			if (fmode & O_EXCL)
129				vap->va_vaflags |= VA_EXCLUSIVE;
130			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
131				NDFREE(ndp, NDF_ONLY_PNBUF);
132				vput(ndp->ni_dvp);
133				if ((error = vn_start_write(NULL, &mp,
134				    V_XSLEEP | PCATCH)) != 0)
135					return (error);
136				goto restart;
137			}
138#ifdef MAC
139			error = mac_check_vnode_create(cred, ndp->ni_dvp,
140			    &ndp->ni_cnd, vap);
141			if (error == 0) {
142#endif
143				VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
144				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
145						   &ndp->ni_cnd, vap);
146#ifdef MAC
147			}
148#endif
149			vput(ndp->ni_dvp);
150			vn_finished_write(mp);
151			if (error) {
152				NDFREE(ndp, NDF_ONLY_PNBUF);
153				return (error);
154			}
155			ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
156			ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
157			fmode &= ~O_TRUNC;
158			vp = ndp->ni_vp;
159#ifdef LOOKUP_SHARED
160			exclusive = 1;
161#endif
162		} else {
163			if (ndp->ni_dvp == ndp->ni_vp)
164				vrele(ndp->ni_dvp);
165			else
166				vput(ndp->ni_dvp);
167			ndp->ni_dvp = NULL;
168			vp = ndp->ni_vp;
169			if (fmode & O_EXCL) {
170				error = EEXIST;
171				goto bad;
172			}
173			fmode &= ~O_CREAT;
174		}
175	} else {
176		ndp->ni_cnd.cn_nameiop = LOOKUP;
177#ifdef LOOKUP_SHARED
178		ndp->ni_cnd.cn_flags =
179		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
180		    LOCKSHARED | LOCKLEAF;
181#else
182		ndp->ni_cnd.cn_flags =
183		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
184#endif
185		if ((error = namei(ndp)) != 0)
186			return (error);
187		vp = ndp->ni_vp;
188	}
189	if (vp->v_type == VLNK) {
190		error = EMLINK;
191		goto bad;
192	}
193	if (vp->v_type == VSOCK) {
194		error = EOPNOTSUPP;
195		goto bad;
196	}
197	mode = 0;
198	if (fmode & (FWRITE | O_TRUNC)) {
199		if (vp->v_type == VDIR) {
200			error = EISDIR;
201			goto bad;
202		}
203		mode |= VWRITE;
204	}
205	if (fmode & FREAD)
206		mode |= VREAD;
207	if (fmode & O_APPEND)
208		mode |= VAPPEND;
209#ifdef MAC
210	error = mac_check_vnode_open(cred, vp, mode);
211	if (error)
212		goto bad;
213#endif
214	if ((fmode & O_CREAT) == 0) {
215		if (mode & VWRITE) {
216			error = vn_writechk(vp);
217			if (error)
218				goto bad;
219		}
220		if (mode) {
221		        error = VOP_ACCESS(vp, mode, cred, td);
222			if (error)
223				goto bad;
224		}
225	}
226	if ((error = VOP_GETATTR(vp, vap, cred, td)) == 0) {
227		vp->v_cachedfs = vap->va_fsid;
228		vp->v_cachedid = vap->va_fileid;
229	}
230	if ((error = VOP_OPEN(vp, fmode, cred, td)) != 0)
231		goto bad;
232	/*
233	 * Make sure that a VM object is created for VMIO support.
234	 */
235	if (vn_canvmio(vp) == TRUE) {
236#ifdef LOOKUP_SHARED
237		int flock;
238
239		if (!exclusive && VOP_GETVOBJECT(vp, NULL) != 0)
240			VOP_LOCK(vp, LK_UPGRADE, td);
241		/*
242		 * In cases where the object is marked as dead, vfs_object_create()
243		 * will unlock and relock exclusive.  It is safe to call in
244		 * here with a shared lock because we only examine fields that
245		 * the shared lock guarantees will be stable.  In the UPGRADE
246		 * case it is not likely that anyone has used this vnode yet
247		 * so there will be no contention.  The logic after this call
248		 * restores the requested locking state.
249		 */
250#endif
251		if ((error = vfs_object_create(vp, td, cred)) != 0) {
252			VOP_UNLOCK(vp, 0, td);
253			VOP_CLOSE(vp, fmode, cred, td);
254			NDFREE(ndp, NDF_ONLY_PNBUF);
255			vrele(vp);
256			*flagp = fmode;
257			return (error);
258		}
259#ifdef LOOKUP_SHARED
260		flock = VOP_ISLOCKED(vp, td);
261		if (!exclusive && flock == LK_EXCLUSIVE)
262			VOP_LOCK(vp, LK_DOWNGRADE, td);
263#endif
264	}
265
266	if (fmode & FWRITE)
267		vp->v_writecount++;
268	*flagp = fmode;
269	return (0);
270bad:
271	NDFREE(ndp, NDF_ONLY_PNBUF);
272	vput(vp);
273	*flagp = fmode;
274	ndp->ni_vp = NULL;
275	return (error);
276}
277
278/*
279 * Check for write permissions on the specified vnode.
280 * Prototype text segments cannot be written.
281 */
282int
283vn_writechk(vp)
284	register struct vnode *vp;
285{
286
287	ASSERT_VOP_LOCKED(vp, "vn_writechk");
288	/*
289	 * If there's shared text associated with
290	 * the vnode, try to free it up once.  If
291	 * we fail, we can't allow writing.
292	 */
293	if (vp->v_vflag & VV_TEXT)
294		return (ETXTBSY);
295
296	return (0);
297}
298
299/*
300 * Vnode close call
301 */
302int
303vn_close(vp, flags, file_cred, td)
304	register struct vnode *vp;
305	int flags;
306	struct ucred *file_cred;
307	struct thread *td;
308{
309	int error;
310
311	if (flags & FWRITE)
312		vp->v_writecount--;
313	error = VOP_CLOSE(vp, flags, file_cred, td);
314	/*
315	 * XXX - In certain instances VOP_CLOSE has to do the vrele
316	 * itself. If the vrele has been done, it will return EAGAIN
317	 * to indicate that the vrele should not be done again. When
318	 * this happens, we just return success. The correct thing to
319	 * do would be to have all VOP_CLOSE instances do the vrele.
320	 */
321	if (error == EAGAIN)
322		return (0);
323	vrele(vp);
324	return (error);
325}
326
327/*
328 * Sequential heuristic - detect sequential operation
329 */
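/*
 * The value returned is f_seqcount shifted into the IO_SEQSHIFT bits of
 * the ioflag, so a fully sequential stream passes at most
 * IO_SEQMAX << IO_SEQSHIFT down to VOP_READ()/VOP_WRITE() as a
 * read-ahead/clustering hint.
 */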
330static __inline
331int
332sequential_heuristic(struct uio *uio, struct file *fp)
333{
334
335	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
336	    uio->uio_offset == fp->f_nextoff) {
337		/*
338		 * XXX we assume that the filesystem block size is
339		 * the default.  Not true, but still gives us a pretty
340		 * good indicator of how sequential the read operations
341		 * are.
342		 */
343		fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
344		if (fp->f_seqcount > IO_SEQMAX)
345			fp->f_seqcount = IO_SEQMAX;
346		return (fp->f_seqcount << IO_SEQSHIFT);
347	}
348
349	/*
350	 * Not sequential, quick draw-down of seqcount
351	 */
352	if (fp->f_seqcount > 1)
353		fp->f_seqcount = 1;
354	else
355		fp->f_seqcount = 0;
356	return (0);
357}
358
359/*
360 * Package up an I/O request on a vnode into a uio and do it.
361 */
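/*
 * Illustrative call (buffer and residual variable are hypothetical):
 * read a header from an unlocked vnode into a kernel buffer.
 *
 *	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, sizeof(buf), (off_t)0,
 *	    UIO_SYSSPACE, 0, td->td_ucred, NOCRED, &resid, td);
 */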
362int
363vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
364    aresid, td)
365	enum uio_rw rw;
366	struct vnode *vp;
367	caddr_t base;
368	int len;
369	off_t offset;
370	enum uio_seg segflg;
371	int ioflg;
372	struct ucred *active_cred;
373	struct ucred *file_cred;
374	int *aresid;
375	struct thread *td;
376{
377	struct uio auio;
378	struct iovec aiov;
379	struct mount *mp;
380	struct ucred *cred;
381	int error;
382
383	if ((ioflg & IO_NODELOCKED) == 0) {
384		mp = NULL;
385		if (rw == UIO_WRITE) {
386			if (vp->v_type != VCHR &&
387			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
388			    != 0)
389				return (error);
390			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
391		} else {
392			/*
393			 * XXX This should be LK_SHARED but I don't trust VFS
394			 * enough to leave it like that until it has been
395			 * reviewed further.
396			 */
397			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
398		}
399
400	}
401	auio.uio_iov = &aiov;
402	auio.uio_iovcnt = 1;
403	aiov.iov_base = base;
404	aiov.iov_len = len;
405	auio.uio_resid = len;
406	auio.uio_offset = offset;
407	auio.uio_segflg = segflg;
408	auio.uio_rw = rw;
409	auio.uio_td = td;
410	error = 0;
411#ifdef MAC
412	if ((ioflg & IO_NOMACCHECK) == 0) {
413		if (rw == UIO_READ)
414			error = mac_check_vnode_read(active_cred, file_cred,
415			    vp);
416		else
417			error = mac_check_vnode_write(active_cred, file_cred,
418			    vp);
419	}
420#endif
421	if (error == 0) {
422		if (file_cred)
423			cred = file_cred;
424		else
425			cred = active_cred;
426		if (rw == UIO_READ)
427			error = VOP_READ(vp, &auio, ioflg, cred);
428		else
429			error = VOP_WRITE(vp, &auio, ioflg, cred);
430	}
431	if (aresid)
432		*aresid = auio.uio_resid;
433	else
434		if (auio.uio_resid && error == 0)
435			error = EIO;
436	if ((ioflg & IO_NODELOCKED) == 0) {
437		if (rw == UIO_WRITE)
438			vn_finished_write(mp);
439		VOP_UNLOCK(vp, 0, td);
440	}
441	return (error);
442}
443
444/*
445 * Package up an I/O request on a vnode into a uio and do it.  The I/O
446 * request is split up into smaller chunks and we try to avoid saturating
447 * the buffer cache while potentially holding a vnode locked, so we
448 * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
449 * to give other processes a chance to lock the vnode (either other processes
450 * core'ing the same binary, or unrelated processes scanning the directory).
451 */
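/*
 * Note how *aresid is maintained: vn_rdwr() stores the residual for the
 * chunk that stopped early, and the length not yet attempted is added
 * back in before returning.
 */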
452int
453vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
454    file_cred, aresid, td)
455	enum uio_rw rw;
456	struct vnode *vp;
457	caddr_t base;
458	int len;
459	off_t offset;
460	enum uio_seg segflg;
461	int ioflg;
462	struct ucred *active_cred;
463	struct ucred *file_cred;
464	int *aresid;
465	struct thread *td;
466{
467	int error = 0;
468
469	do {
470		int chunk = (len > MAXBSIZE) ? MAXBSIZE : len;
471
472		if (rw != UIO_READ && vp->v_type == VREG)
473			bwillwrite();
474		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
475		    ioflg, active_cred, file_cred, aresid, td);
476		len -= chunk;	/* aresid calc already includes length */
477		if (error)
478			break;
479		offset += chunk;
480		base += chunk;
481		uio_yield();
482	} while (len);
483	if (aresid)
484		*aresid += len;
485	return (error);
486}
487
488/*
489 * File table vnode read routine.
490 */
491static int
492vn_read(fp, uio, active_cred, flags, td)
493	struct file *fp;
494	struct uio *uio;
495	struct ucred *active_cred;
496	struct thread *td;
497	int flags;
498{
499	struct vnode *vp;
500	int error, ioflag;
501
502	mtx_lock(&Giant);
503	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
504	    uio->uio_td, td));
505	vp = (struct vnode *)fp->f_data;
506	ioflag = 0;
507	if (fp->f_flag & FNONBLOCK)
508		ioflag |= IO_NDELAY;
509	if (fp->f_flag & O_DIRECT)
510		ioflag |= IO_DIRECT;
511	VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
512	/*
513	 * According to McKusick the vn lock is protecting f_offset here.
514	 * Once this field has its own lock we can acquire it shared.
515	 */
516	vn_lock(vp, LK_EXCLUSIVE | LK_NOPAUSE | LK_RETRY, td);
517	if ((flags & FOF_OFFSET) == 0)
518		uio->uio_offset = fp->f_offset;
519
520	ioflag |= sequential_heuristic(uio, fp);
521
522#ifdef MAC
523	error = mac_check_vnode_read(active_cred, fp->f_cred, vp);
524	if (error == 0)
525#endif
526		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
527	if ((flags & FOF_OFFSET) == 0)
528		fp->f_offset = uio->uio_offset;
529	fp->f_nextoff = uio->uio_offset;
530	VOP_UNLOCK(vp, 0, td);
531	mtx_unlock(&Giant);
532	return (error);
533}
534
535/*
536 * File table vnode write routine.
537 */
538static int
539vn_write(fp, uio, active_cred, flags, td)
540	struct file *fp;
541	struct uio *uio;
542	struct ucred *active_cred;
543	struct thread *td;
544	int flags;
545{
546	struct vnode *vp;
547	struct mount *mp;
548	int error, ioflag;
549
550	mtx_lock(&Giant);
551	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
552	    uio->uio_td, td));
553	vp = (struct vnode *)fp->f_data;
554	if (vp->v_type == VREG)
555		bwillwrite();
556	ioflag = IO_UNIT;
557	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
558		ioflag |= IO_APPEND;
559	if (fp->f_flag & FNONBLOCK)
560		ioflag |= IO_NDELAY;
561	if (fp->f_flag & O_DIRECT)
562		ioflag |= IO_DIRECT;
563	if ((fp->f_flag & O_FSYNC) ||
564	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
565		ioflag |= IO_SYNC;
566	mp = NULL;
567	if (vp->v_type != VCHR &&
568	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
569		mtx_unlock(&Giant);
570		return (error);
571	}
572	VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
573	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
574	if ((flags & FOF_OFFSET) == 0)
575		uio->uio_offset = fp->f_offset;
576	ioflag |= sequential_heuristic(uio, fp);
577#ifdef MAC
578	error = mac_check_vnode_write(active_cred, fp->f_cred, vp);
579	if (error == 0)
580#endif
581		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
582	if ((flags & FOF_OFFSET) == 0)
583		fp->f_offset = uio->uio_offset;
584	fp->f_nextoff = uio->uio_offset;
585	VOP_UNLOCK(vp, 0, td);
586	vn_finished_write(mp);
587	mtx_unlock(&Giant);
588	return (error);
589}
590
591/*
592 * File table vnode stat routine.
593 */
594static int
595vn_statfile(fp, sb, active_cred, td)
596	struct file *fp;
597	struct stat *sb;
598	struct ucred *active_cred;
599	struct thread *td;
600{
601	struct vnode *vp = (struct vnode *)fp->f_data;
602	int error;
603
604	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
605	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
606	VOP_UNLOCK(vp, 0, td);
607
608	return (error);
609}
610
611/*
612 * Stat a vnode; implementation for the stat syscall
613 */
614int
615vn_stat(vp, sb, active_cred, file_cred, td)
616	struct vnode *vp;
617	register struct stat *sb;
618	struct ucred *active_cred;
619	struct ucred *file_cred;
620	struct thread *td;
621{
622	struct vattr vattr;
623	register struct vattr *vap;
624	int error;
625	u_short mode;
626
627#ifdef MAC
628	error = mac_check_vnode_stat(active_cred, file_cred, vp);
629	if (error)
630		return (error);
631#endif
632
633	vap = &vattr;
634	error = VOP_GETATTR(vp, vap, active_cred, td);
635	if (error)
636		return (error);
637
638	vp->v_cachedfs = vap->va_fsid;
639	vp->v_cachedid = vap->va_fileid;
640
641	/*
642	 * Zero the spare stat fields
643	 */
644	bzero(sb, sizeof *sb);
645
646	/*
647	 * Copy from vattr table
648	 */
649	if (vap->va_fsid != VNOVAL)
650		sb->st_dev = vap->va_fsid;
651	else
652		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
653	sb->st_ino = vap->va_fileid;
654	mode = vap->va_mode;
655	switch (vap->va_type) {
656	case VREG:
657		mode |= S_IFREG;
658		break;
659	case VDIR:
660		mode |= S_IFDIR;
661		break;
662	case VBLK:
663		mode |= S_IFBLK;
664		break;
665	case VCHR:
666		mode |= S_IFCHR;
667		break;
668	case VLNK:
669		mode |= S_IFLNK;
670		/* This is a cosmetic change; symlinks do not have a mode. */
671		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
672			mode &= ~ACCESSPERMS;	/* 0000 */
673		else
674			mode |= ACCESSPERMS;	/* 0777 */
675		break;
676	case VSOCK:
677		mode |= S_IFSOCK;
678		break;
679	case VFIFO:
680		mode |= S_IFIFO;
681		break;
682	default:
683		return (EBADF);
684	}
685	sb->st_mode = mode;
686	sb->st_nlink = vap->va_nlink;
687	sb->st_uid = vap->va_uid;
688	sb->st_gid = vap->va_gid;
689	sb->st_rdev = vap->va_rdev;
690	if (vap->va_size > OFF_MAX)
691		return (EOVERFLOW);
692	sb->st_size = vap->va_size;
693	sb->st_atimespec = vap->va_atime;
694	sb->st_mtimespec = vap->va_mtime;
695	sb->st_ctimespec = vap->va_ctime;
696	sb->st_birthtimespec = vap->va_birthtime;
697
698        /*
699	 * According to www.opengroup.org, the meaning of st_blksize is
700	 *   "a filesystem-specific preferred I/O block size for this
701	 *    object.  In some filesystem types, this may vary from file
702	 *    to file"
703	 * Default to PAGE_SIZE after much discussion.
704	 */
705
706	if (vap->va_type == VREG) {
707		sb->st_blksize = vap->va_blocksize;
708	} else if (vn_isdisk(vp, NULL)) {
709		sb->st_blksize = vp->v_rdev->si_bsize_best;
710		if (sb->st_blksize < vp->v_rdev->si_bsize_phys)
711			sb->st_blksize = vp->v_rdev->si_bsize_phys;
712		if (sb->st_blksize < BLKDEV_IOSIZE)
713			sb->st_blksize = BLKDEV_IOSIZE;
714	} else {
715		sb->st_blksize = PAGE_SIZE;
716	}
717
718	sb->st_flags = vap->va_flags;
719	if (suser(td))
720		sb->st_gen = 0;
721	else
722		sb->st_gen = vap->va_gen;
723
724#if (S_BLKSIZE == 512)
725	/* Optimize this case */
726	sb->st_blocks = vap->va_bytes >> 9;
727#else
728	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
729#endif
730	return (0);
731}
732
733/*
734 * File table vnode ioctl routine.
735 */
736static int
737vn_ioctl(fp, com, data, active_cred, td)
738	struct file *fp;
739	u_long com;
740	void *data;
741	struct ucred *active_cred;
742	struct thread *td;
743{
744	register struct vnode *vp = ((struct vnode *)fp->f_data);
745	struct vnode *vpold;
746	struct vattr vattr;
747	int error;
748
749	switch (vp->v_type) {
750
751	case VREG:
752	case VDIR:
753		if (com == FIONREAD) {
754			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
755			error = VOP_GETATTR(vp, &vattr, active_cred, td);
756			VOP_UNLOCK(vp, 0, td);
757			if (error)
758				return (error);
759			*(int *)data = vattr.va_size - fp->f_offset;
760			return (0);
761		}
762		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
763			return (0);			/* XXX */
764		/* FALLTHROUGH */
765
766	default:
767#if 0
768		return (ENOTTY);
769#endif
770	case VFIFO:
771	case VCHR:
772	case VBLK:
773		if (com == FIODTYPE) {
774			if (vp->v_type != VCHR && vp->v_type != VBLK)
775				return (ENOTTY);
776			*(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK;
777			return (0);
778		}
779		error = VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td);
780		if (error == ENOIOCTL) {
781#ifdef DIAGNOSTIC
782			Debugger("ENOIOCTL leaked through");
783#endif
784			error = ENOTTY;
785		}
786		if (error == 0 && com == TIOCSCTTY) {
787
788			/* Do nothing if reassigning same control tty */
789			sx_slock(&proctree_lock);
790			if (td->td_proc->p_session->s_ttyvp == vp) {
791				sx_sunlock(&proctree_lock);
792				return (0);
793			}
794
795			vpold = td->td_proc->p_session->s_ttyvp;
796			VREF(vp);
797			SESS_LOCK(td->td_proc->p_session);
798			td->td_proc->p_session->s_ttyvp = vp;
799			SESS_UNLOCK(td->td_proc->p_session);
800
801			sx_sunlock(&proctree_lock);
802
803			/* Get rid of reference to old control tty */
804			if (vpold)
805				vrele(vpold);
806		}
807		return (error);
808	}
809}
810
811/*
812 * File table vnode poll routine.
813 */
814static int
815vn_poll(fp, events, active_cred, td)
816	struct file *fp;
817	int events;
818	struct ucred *active_cred;
819	struct thread *td;
820{
821	struct vnode *vp;
822#ifdef MAC
823	int error;
824#endif
825
826	vp = (struct vnode *)fp->f_data;
827#ifdef MAC
828	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
829	error = mac_check_vnode_poll(active_cred, fp->f_cred, vp);
830	VOP_UNLOCK(vp, 0, td);
831	if (error)
832		return (error);
833#endif
834
835	return (VOP_POLL(vp, events, fp->f_cred, td));
836}
837
838/*
839 * Check that the vnode is still valid, and if so
840 * acquire requested lock.
841 */
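/*
 * If the vnode is being reclaimed (VI_XLOCK set by another thread) we
 * sleep until the exclusive holder is done; without LK_RETRY the attempt
 * then fails with ENOENT, with LK_RETRY the loop simply tries again.
 * lockmgr always drops the interlock, so it is retaken at the top of
 * each pass.
 */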
842int
843#ifndef	DEBUG_LOCKS
844vn_lock(vp, flags, td)
845#else
846debug_vn_lock(vp, flags, td, filename, line)
847#endif
848	struct vnode *vp;
849	int flags;
850	struct thread *td;
851#ifdef	DEBUG_LOCKS
852	const char *filename;
853	int line;
854#endif
855{
856	int error;
857
858	do {
859		if ((flags & LK_INTERLOCK) == 0)
860			VI_LOCK(vp);
861		if ((vp->v_iflag & VI_XLOCK) && vp->v_vxproc != curthread) {
862			vp->v_iflag |= VI_XWANT;
863			msleep(vp, VI_MTX(vp), PINOD, "vn_lock", 0);
864			error = ENOENT;
865			if ((flags & LK_RETRY) == 0) {
866				VI_UNLOCK(vp);
867				return (error);
868			}
869		}
870#ifdef	DEBUG_LOCKS
871		vp->filename = filename;
872		vp->line = line;
873#endif
874		/*
875		 * lockmgr drops interlock before it will return for
876		 * any reason.  So force the code above to relock it.
877		 */
878		error = VOP_LOCK(vp, flags | LK_NOPAUSE | LK_INTERLOCK, td);
879		flags &= ~LK_INTERLOCK;
880	} while (flags & LK_RETRY && error != 0);
881	return (error);
882}
883
884/*
885 * File table vnode close routine.
886 */
887static int
888vn_closefile(fp, td)
889	struct file *fp;
890	struct thread *td;
891{
892
893	fp->f_ops = &badfileops;
894	return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
895		fp->f_cred, td));
896}
897
898/*
899 * Prepare to start a filesystem write operation. If the operation is
900 * permitted, then we bump the count of operations in progress and
901 * proceed. If a suspend request is in progress, we wait until the
902 * suspension is over, and then proceed.
903 */
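/*
 * Sketch of the usual pairing, as vn_write() below does it:
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	... do the write ...
 *	vn_finished_write(mp);
 */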
904int
905vn_start_write(vp, mpp, flags)
906	struct vnode *vp;
907	struct mount **mpp;
908	int flags;
909{
910	struct mount *mp;
911	int error;
912
913	/*
914	 * If a vnode is provided, get and return the mount point to
915	 * which it will write.
916	 */
917	if (vp != NULL) {
918		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
919			*mpp = NULL;
920			if (error != EOPNOTSUPP)
921				return (error);
922			return (0);
923		}
924	}
925	if ((mp = *mpp) == NULL)
926		return (0);
927	/*
928	 * Check on status of suspension.
929	 */
930	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
931		if (flags & V_NOWAIT)
932			return (EWOULDBLOCK);
933		error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
934		    "suspfs", 0);
935		if (error)
936			return (error);
937	}
938	if (flags & V_XSLEEP)
939		return (0);
940	mp->mnt_writeopcount++;
941	return (0);
942}
943
944/*
945 * Secondary suspension. Used by operations such as vop_inactive
946 * routines that are needed by the higher level functions. These
947 * are allowed to proceed until all the higher level functions have
948 * completed (indicated by mnt_writeopcount dropping to zero). At that
949 * time, these operations are halted until the suspension is over.
950 */
951int
952vn_write_suspend_wait(vp, mp, flags)
953	struct vnode *vp;
954	struct mount *mp;
955	int flags;
956{
957	int error;
958
959	if (vp != NULL) {
960		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
961			if (error != EOPNOTSUPP)
962				return (error);
963			return (0);
964		}
965	}
966	/*
967	 * If we are not suspended or have not yet reached suspended
968	 * mode, then let the operation proceed.
969	 */
970	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
971		return (0);
972	if (flags & V_NOWAIT)
973		return (EWOULDBLOCK);
974	/*
975	 * Wait for the suspension to finish.
976	 */
977	return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
978	    "suspfs", 0));
979}
980
981/*
982 * Filesystem write operation has completed. If we are suspending and this
983 * operation is the last one, notify the suspender that the suspension is
984 * now in effect.
985 */
986void
987vn_finished_write(mp)
988	struct mount *mp;
989{
990
991	if (mp == NULL)
992		return;
993	mp->mnt_writeopcount--;
994	if (mp->mnt_writeopcount < 0)
995		panic("vn_finished_write: neg cnt");
996	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
997	    mp->mnt_writeopcount <= 0)
998		wakeup(&mp->mnt_writeopcount);
999}
1000
1001/*
1002 * Request a filesystem to suspend write operations.
1003 */
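/*
 * The filesystem stays suspended until vfs_write_resume() is called.
 * MNTK_SUSPENDED is only set after in-progress writes have drained and
 * the filesystem has been synced.
 */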
1004int
1005vfs_write_suspend(mp)
1006	struct mount *mp;
1007{
1008	struct thread *td = curthread;
1009	int error;
1010
1011	if (mp->mnt_kern_flag & MNTK_SUSPEND)
1012		return (0);
1013	mp->mnt_kern_flag |= MNTK_SUSPEND;
1014	if (mp->mnt_writeopcount > 0)
1015		(void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0);
1016	if ((error = VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td)) != 0) {
1017		vfs_write_resume(mp);
1018		return (error);
1019	}
1020	mp->mnt_kern_flag |= MNTK_SUSPENDED;
1021	return (0);
1022}
1023
1024/*
1025 * Request a filesystem to resume write operations.
1026 */
1027void
1028vfs_write_resume(mp)
1029	struct mount *mp;
1030{
1031
1032	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0)
1033		return;
1034	mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
1035	wakeup(&mp->mnt_writeopcount);
1036	wakeup(&mp->mnt_flag);
1037}
1038
1039/*
1040 * Implement kqueues for files by translating the request to a vnode operation.
1041 */
1042static int
1043vn_kqfilter(struct file *fp, struct knote *kn)
1044{
1045
1046	return (VOP_KQFILTER(((struct vnode *)fp->f_data), kn));
1047}
1048
1049/*
1050 * Simplified in-kernel wrapper calls for extended attribute access.
1051 * Both calls pass in a NULL credential, authorizing as "kernel" access.
1052 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1053 */
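/*
 * Illustrative call on an already-locked vnode (the attribute name and
 * buffer are hypothetical):
 *
 *	char buf[64];
 *	int buflen = sizeof(buf);
 *	error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
 *	    "hypothetical.attr", &buflen, buf, td);
 */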
1054int
1055vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1056    const char *attrname, int *buflen, char *buf, struct thread *td)
1057{
1058	struct uio	auio;
1059	struct iovec	iov;
1060	int	error;
1061
1062	iov.iov_len = *buflen;
1063	iov.iov_base = buf;
1064
1065	auio.uio_iov = &iov;
1066	auio.uio_iovcnt = 1;
1067	auio.uio_rw = UIO_READ;
1068	auio.uio_segflg = UIO_SYSSPACE;
1069	auio.uio_td = td;
1070	auio.uio_offset = 0;
1071	auio.uio_resid = *buflen;
1072
1073	if ((ioflg & IO_NODELOCKED) == 0)
1074		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1075
1076	/* authorize attribute retrieval as kernel */
1077	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
1078	    td);
1079
1080	if ((ioflg & IO_NODELOCKED) == 0)
1081		VOP_UNLOCK(vp, 0, td);
1082
1083	if (error == 0) {
1084		*buflen = *buflen - auio.uio_resid;
1085	}
1086
1087	return (error);
1088}
1089
1090/*
1091 * XXX failure mode if partially written?
1092 */
1093int
1094vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1095    const char *attrname, int buflen, char *buf, struct thread *td)
1096{
1097	struct uio	auio;
1098	struct iovec	iov;
1099	struct mount	*mp;
1100	int	error;
1101
1102	iov.iov_len = buflen;
1103	iov.iov_base = buf;
1104
1105	auio.uio_iov = &iov;
1106	auio.uio_iovcnt = 1;
1107	auio.uio_rw = UIO_WRITE;
1108	auio.uio_segflg = UIO_SYSSPACE;
1109	auio.uio_td = td;
1110	auio.uio_offset = 0;
1111	auio.uio_resid = buflen;
1112
1113	if ((ioflg & IO_NODELOCKED) == 0) {
1114		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1115			return (error);
1116		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1117	}
1118
1119	/* authorize attribute setting as kernel */
1120	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
1121
1122	if ((ioflg & IO_NODELOCKED) == 0) {
1123		vn_finished_write(mp);
1124		VOP_UNLOCK(vp, 0, td);
1125	}
1126
1127	return (error);
1128}
1129
1130int
1131vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
1132    const char *attrname, struct thread *td)
1133{
1134	struct mount	*mp;
1135	int	error;
1136
1137	if ((ioflg & IO_NODELOCKED) == 0) {
1138		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1139			return (error);
1140		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1141	}
1142
1143	/* authorize attribute removal as kernel */
1144	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td);
1145
1146	if ((ioflg & IO_NODELOCKED) == 0) {
1147		vn_finished_write(mp);
1148		VOP_UNLOCK(vp, 0, td);
1149	}
1150
1151	return (error);
1152}
1153