Deleted Added
full compact
vfs_vnops.c (71576) vfs_vnops.c (72200)
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
39 * $FreeBSD: head/sys/kern/vfs_vnops.c 71576 2001-01-24 12:35:55Z jasone $
39 * $FreeBSD: head/sys/kern/vfs_vnops.c 72200 2001-02-09 06:11:45Z bmilekic $
40 */
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/fcntl.h>
45#include <sys/file.h>
46#include <sys/stat.h>
47#include <sys/proc.h>
48#include <sys/mount.h>
49#include <sys/mutex.h>
50#include <sys/namei.h>
51#include <sys/vnode.h>
52#include <sys/bio.h>
53#include <sys/buf.h>
54#include <sys/filio.h>
55#include <sys/ttycom.h>
56#include <sys/conf.h>
57
58#include <ufs/ufs/quota.h>
59#include <ufs/ufs/inode.h>
60
61static int vn_closefile __P((struct file *fp, struct proc *p));
62static int vn_ioctl __P((struct file *fp, u_long com, caddr_t data,
63 struct proc *p));
64static int vn_read __P((struct file *fp, struct uio *uio,
65 struct ucred *cred, int flags, struct proc *p));
66static int vn_poll __P((struct file *fp, int events, struct ucred *cred,
67 struct proc *p));
68static int vn_statfile __P((struct file *fp, struct stat *sb, struct proc *p));
69static int vn_write __P((struct file *fp, struct uio *uio,
70 struct ucred *cred, int flags, struct proc *p));
71
72struct fileops vnops =
73 { vn_read, vn_write, vn_ioctl, vn_poll, vn_statfile, vn_closefile };
74
75static int filt_nullattach(struct knote *kn);
76static int filt_vnattach(struct knote *kn);
77static void filt_vndetach(struct knote *kn);
78static int filt_vnode(struct knote *kn, long hint);
79static int filt_vnread(struct knote *kn, long hint);
80
81struct filterops vn_filtops =
82 { 1, filt_vnattach, filt_vndetach, filt_vnode };
83
84/*
85 * XXX
86 * filt_vnread is ufs-specific, so the attach routine should really
87 * switch out to different filterops based on the vn filetype
88 */
89struct filterops vn_rwfiltops[] = {
90 { 1, filt_vnattach, filt_vndetach, filt_vnread },
91 { 1, filt_nullattach, NULL, NULL },
92};
93
94/*
95 * Common code for vnode open operations.
96 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
97 *
98 * Note that this does NOT free nameidata for the successful case,
99 * due to the NDINIT being done elsewhere.
100 */
101int
102vn_open(ndp, flagp, cmode)
103 register struct nameidata *ndp;
104 int *flagp, cmode;
105{
106 struct vnode *vp;
107 struct mount *mp;
108 struct proc *p = ndp->ni_cnd.cn_proc;
109 struct ucred *cred = p->p_ucred;
110 struct vattr vat;
111 struct vattr *vap = &vat;
112 int mode, fmode, error;
113
114restart:
115 fmode = *flagp;
116 if (fmode & O_CREAT) {
117 ndp->ni_cnd.cn_nameiop = CREATE;
118 ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
119 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
120 ndp->ni_cnd.cn_flags |= FOLLOW;
121 bwillwrite();
122 if ((error = namei(ndp)) != 0)
123 return (error);
124 if (ndp->ni_vp == NULL) {
125 VATTR_NULL(vap);
126 vap->va_type = VREG;
127 vap->va_mode = cmode;
128 if (fmode & O_EXCL)
129 vap->va_vaflags |= VA_EXCLUSIVE;
130 if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
131 NDFREE(ndp, NDF_ONLY_PNBUF);
132 vput(ndp->ni_dvp);
133 if ((error = vn_start_write(NULL, &mp,
134 V_XSLEEP | PCATCH)) != 0)
135 return (error);
136 goto restart;
137 }
138 VOP_LEASE(ndp->ni_dvp, p, cred, LEASE_WRITE);
139 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
140 &ndp->ni_cnd, vap);
141 vput(ndp->ni_dvp);
142 vn_finished_write(mp);
143 if (error) {
144 NDFREE(ndp, NDF_ONLY_PNBUF);
145 return (error);
146 }
147 ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
148 ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
149 fmode &= ~O_TRUNC;
150 vp = ndp->ni_vp;
151 } else {
152 if (ndp->ni_dvp == ndp->ni_vp)
153 vrele(ndp->ni_dvp);
154 else
155 vput(ndp->ni_dvp);
156 ndp->ni_dvp = NULL;
157 vp = ndp->ni_vp;
158 if (fmode & O_EXCL) {
159 error = EEXIST;
160 goto bad;
161 }
162 fmode &= ~O_CREAT;
163 }
164 } else {
165 ndp->ni_cnd.cn_nameiop = LOOKUP;
166 ndp->ni_cnd.cn_flags =
167 ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
168 if ((error = namei(ndp)) != 0)
169 return (error);
170 vp = ndp->ni_vp;
171 }
172 if (vp->v_type == VLNK) {
173 error = EMLINK;
174 goto bad;
175 }
176 if (vp->v_type == VSOCK) {
177 error = EOPNOTSUPP;
178 goto bad;
179 }
180 if ((fmode & O_CREAT) == 0) {
181 mode = 0;
182 if (fmode & (FWRITE | O_TRUNC)) {
183 if (vp->v_type == VDIR) {
184 error = EISDIR;
185 goto bad;
186 }
187 error = vn_writechk(vp);
188 if (error)
189 goto bad;
190 mode |= VWRITE;
191 }
192 if (fmode & FREAD)
193 mode |= VREAD;
194 if (mode) {
195 error = VOP_ACCESS(vp, mode, cred, p);
196 if (error)
197 goto bad;
198 }
199 }
200 if ((error = VOP_OPEN(vp, fmode, cred, p)) != 0)
201 goto bad;
202 /*
203 * Make sure that a VM object is created for VMIO support.
204 */
205 if (vn_canvmio(vp) == TRUE) {
206 if ((error = vfs_object_create(vp, p, cred)) != 0)
207 goto bad;
208 }
209
210 if (fmode & FWRITE)
211 vp->v_writecount++;
212 *flagp = fmode;
213 return (0);
214bad:
215 NDFREE(ndp, NDF_ONLY_PNBUF);
216 vput(vp);
217 *flagp = fmode;
218 return (error);
219}
220
221/*
222 * Check for write permissions on the specified vnode.
223 * Prototype text segments cannot be written.
224 */
225int
226vn_writechk(vp)
227 register struct vnode *vp;
228{
229
230 /*
231 * If there's shared text associated with
232 * the vnode, try to free it up once. If
233 * we fail, we can't allow writing.
234 */
235 if (vp->v_flag & VTEXT)
236 return (ETXTBSY);
237 return (0);
238}
239
240/*
241 * Vnode close call
242 */
243int
244vn_close(vp, flags, cred, p)
245 register struct vnode *vp;
246 int flags;
247 struct ucred *cred;
248 struct proc *p;
249{
250 int error;
251
252 if (flags & FWRITE)
253 vp->v_writecount--;
254 error = VOP_CLOSE(vp, flags, cred, p);
255 vrele(vp);
256 return (error);
257}
258
259static __inline
260int
261sequential_heuristic(struct uio *uio, struct file *fp)
262{
263 /*
264 * Sequential heuristic - detect sequential operation
265 */
266 if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
267 uio->uio_offset == fp->f_nextoff) {
268 /*
269 * XXX we assume that the filesystem block size is
270 * the default. Not true, but still gives us a pretty
271 * good indicator of how sequential the read operations
272 * are.
273 */
274 fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
275 if (fp->f_seqcount >= 127)
276 fp->f_seqcount = 127;
277 return(fp->f_seqcount << 16);
278 }
279
280 /*
281 * Not sequential, quick draw-down of seqcount
282 */
283 if (fp->f_seqcount > 1)
284 fp->f_seqcount = 1;
285 else
286 fp->f_seqcount = 0;
287 return(0);
288}
289
290/*
291 * Package up an I/O request on a vnode into a uio and do it.
292 */
293int
294vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p)
295 enum uio_rw rw;
296 struct vnode *vp;
297 caddr_t base;
298 int len;
299 off_t offset;
300 enum uio_seg segflg;
301 int ioflg;
302 struct ucred *cred;
303 int *aresid;
304 struct proc *p;
305{
306 struct uio auio;
307 struct iovec aiov;
308 struct mount *mp;
309 int error;
310
311 if ((ioflg & IO_NODELOCKED) == 0) {
312 mp = NULL;
313 if (rw == UIO_WRITE &&
314 vp->v_type != VCHR &&
315 (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
316 return (error);
317 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
318 }
319 auio.uio_iov = &aiov;
320 auio.uio_iovcnt = 1;
321 aiov.iov_base = base;
322 aiov.iov_len = len;
323 auio.uio_resid = len;
324 auio.uio_offset = offset;
325 auio.uio_segflg = segflg;
326 auio.uio_rw = rw;
327 auio.uio_procp = p;
328 if (rw == UIO_READ) {
329 error = VOP_READ(vp, &auio, ioflg, cred);
330 } else {
331 error = VOP_WRITE(vp, &auio, ioflg, cred);
332 }
333 if (aresid)
334 *aresid = auio.uio_resid;
335 else
336 if (auio.uio_resid && error == 0)
337 error = EIO;
338 if ((ioflg & IO_NODELOCKED) == 0) {
339 vn_finished_write(mp);
340 VOP_UNLOCK(vp, 0, p);
341 }
342 return (error);
343}
344
345/*
346 * File table vnode read routine.
347 */
348static int
349vn_read(fp, uio, cred, flags, p)
350 struct file *fp;
351 struct uio *uio;
352 struct ucred *cred;
353 struct proc *p;
354 int flags;
355{
356 struct vnode *vp;
357 int error, ioflag;
358
359 KASSERT(uio->uio_procp == p, ("uio_procp %p is not p %p",
360 uio->uio_procp, p));
361 vp = (struct vnode *)fp->f_data;
362 ioflag = 0;
363 if (fp->f_flag & FNONBLOCK)
364 ioflag |= IO_NDELAY;
365 VOP_LEASE(vp, p, cred, LEASE_READ);
366 vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
367 if ((flags & FOF_OFFSET) == 0)
368 uio->uio_offset = fp->f_offset;
369
370 ioflag |= sequential_heuristic(uio, fp);
371
372 error = VOP_READ(vp, uio, ioflag, cred);
373 if ((flags & FOF_OFFSET) == 0)
374 fp->f_offset = uio->uio_offset;
375 fp->f_nextoff = uio->uio_offset;
376 VOP_UNLOCK(vp, 0, p);
377 return (error);
378}
379
380/*
381 * File table vnode write routine.
382 */
383static int
384vn_write(fp, uio, cred, flags, p)
385 struct file *fp;
386 struct uio *uio;
387 struct ucred *cred;
388 struct proc *p;
389 int flags;
390{
391 struct vnode *vp;
392 struct mount *mp;
393 int error, ioflag;
394
395 KASSERT(uio->uio_procp == p, ("uio_procp %p is not p %p",
396 uio->uio_procp, p));
397 vp = (struct vnode *)fp->f_data;
398 if (vp->v_type == VREG)
399 bwillwrite();
400 vp = (struct vnode *)fp->f_data; /* XXX needed? */
401 ioflag = IO_UNIT;
402 if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
403 ioflag |= IO_APPEND;
404 if (fp->f_flag & FNONBLOCK)
405 ioflag |= IO_NDELAY;
406 if ((fp->f_flag & O_FSYNC) ||
407 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
408 ioflag |= IO_SYNC;
409 mp = NULL;
410 if (vp->v_type != VCHR &&
411 (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
412 return (error);
413 VOP_LEASE(vp, p, cred, LEASE_WRITE);
414 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
415 if ((flags & FOF_OFFSET) == 0)
416 uio->uio_offset = fp->f_offset;
417 ioflag |= sequential_heuristic(uio, fp);
418 error = VOP_WRITE(vp, uio, ioflag, cred);
419 if ((flags & FOF_OFFSET) == 0)
420 fp->f_offset = uio->uio_offset;
421 fp->f_nextoff = uio->uio_offset;
422 VOP_UNLOCK(vp, 0, p);
423 vn_finished_write(mp);
424 return (error);
425}
426
427/*
428 * File table vnode stat routine.
429 */
430static int
431vn_statfile(fp, sb, p)
432 struct file *fp;
433 struct stat *sb;
434 struct proc *p;
435{
436 struct vnode *vp = (struct vnode *)fp->f_data;
437
438 return vn_stat(vp, sb, p);
439}
440
441int
442vn_stat(vp, sb, p)
443 struct vnode *vp;
444 register struct stat *sb;
445 struct proc *p;
446{
447 struct vattr vattr;
448 register struct vattr *vap;
449 int error;
450 u_short mode;
451
452 vap = &vattr;
453 error = VOP_GETATTR(vp, vap, p->p_ucred, p);
454 if (error)
455 return (error);
456
457 /*
458 * Zero the spare stat fields
459 */
460 sb->st_lspare = 0;
461 sb->st_qspare[0] = 0;
462 sb->st_qspare[1] = 0;
463
464 /*
465 * Copy from vattr table
466 */
467 if (vap->va_fsid != VNOVAL)
468 sb->st_dev = vap->va_fsid;
469 else
470 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
471 sb->st_ino = vap->va_fileid;
472 mode = vap->va_mode;
473 switch (vap->va_type) {
474 case VREG:
475 mode |= S_IFREG;
476 break;
477 case VDIR:
478 mode |= S_IFDIR;
479 break;
480 case VBLK:
481 mode |= S_IFBLK;
482 break;
483 case VCHR:
484 mode |= S_IFCHR;
485 break;
486 case VLNK:
487 mode |= S_IFLNK;
488 /* This is a cosmetic change, symlinks do not have a mode. */
489 if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
490 sb->st_mode &= ~ACCESSPERMS; /* 0000 */
491 else
492 sb->st_mode |= ACCESSPERMS; /* 0777 */
493 break;
494 case VSOCK:
495 mode |= S_IFSOCK;
496 break;
497 case VFIFO:
498 mode |= S_IFIFO;
499 break;
500 default:
501 return (EBADF);
502 };
503 sb->st_mode = mode;
504 sb->st_nlink = vap->va_nlink;
505 sb->st_uid = vap->va_uid;
506 sb->st_gid = vap->va_gid;
507 sb->st_rdev = vap->va_rdev;
508 sb->st_size = vap->va_size;
509 sb->st_atimespec = vap->va_atime;
510 sb->st_mtimespec = vap->va_mtime;
511 sb->st_ctimespec = vap->va_ctime;
512
513 /*
514 * According to www.opengroup.org, the meaning of st_blksize is
515 * "a filesystem-specific preferred I/O block size for this
516 * object. In some filesystem types, this may vary from file
517 * to file"
518 * Default to zero to catch bogus uses of this field.
519 */
520
521 if (vap->va_type == VREG) {
522 sb->st_blksize = vap->va_blocksize;
523 } else if (vn_isdisk(vp, NULL)) {
524 sb->st_blksize = vp->v_rdev->si_bsize_best;
525 if (sb->st_blksize < vp->v_rdev->si_bsize_phys)
526 sb->st_blksize = vp->v_rdev->si_bsize_phys;
527 if (sb->st_blksize < BLKDEV_IOSIZE)
528 sb->st_blksize = BLKDEV_IOSIZE;
529 } else {
530 sb->st_blksize = 0;
531 }
532
533 sb->st_flags = vap->va_flags;
534 if (suser_xxx(p->p_ucred, 0, 0))
535 sb->st_gen = 0;
536 else
537 sb->st_gen = vap->va_gen;
538
539#if (S_BLKSIZE == 512)
540 /* Optimize this case */
541 sb->st_blocks = vap->va_bytes >> 9;
542#else
543 sb->st_blocks = vap->va_bytes / S_BLKSIZE;
544#endif
545 return (0);
546}
547
548/*
549 * File table vnode ioctl routine.
550 */
551static int
552vn_ioctl(fp, com, data, p)
553 struct file *fp;
554 u_long com;
555 caddr_t data;
556 struct proc *p;
557{
558 register struct vnode *vp = ((struct vnode *)fp->f_data);
559 struct vattr vattr;
560 int error;
561
562 switch (vp->v_type) {
563
564 case VREG:
565 case VDIR:
566 if (com == FIONREAD) {
567 error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
568 if (error)
569 return (error);
570 *(int *)data = vattr.va_size - fp->f_offset;
571 return (0);
572 }
573 if (com == FIONBIO || com == FIOASYNC) /* XXX */
574 return (0); /* XXX */
575 /* fall into ... */
576
577 default:
578#if 0
579 return (ENOTTY);
580#endif
581 case VFIFO:
582 case VCHR:
583 case VBLK:
584 if (com == FIODTYPE) {
585 if (vp->v_type != VCHR && vp->v_type != VBLK)
586 return (ENOTTY);
587 *(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK;
588 return (0);
589 }
590 error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p);
591 if (error == 0 && com == TIOCSCTTY) {
592
593 /* Do nothing if reassigning same control tty */
594 if (p->p_session->s_ttyvp == vp)
595 return (0);
596
597 /* Get rid of reference to old control tty */
598 if (p->p_session->s_ttyvp)
599 vrele(p->p_session->s_ttyvp);
600
601 p->p_session->s_ttyvp = vp;
602 VREF(vp);
603 }
604 return (error);
605 }
606}
607
608/*
609 * File table vnode poll routine.
610 */
611static int
612vn_poll(fp, events, cred, p)
613 struct file *fp;
614 int events;
615 struct ucred *cred;
616 struct proc *p;
617{
618
619 return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, p));
620}
621
622/*
623 * Check that the vnode is still valid, and if so
624 * acquire requested lock.
625 */
626int
627#ifndef DEBUG_LOCKS
628vn_lock(vp, flags, p)
629#else
630debug_vn_lock(vp, flags, p, filename, line)
631#endif
632 struct vnode *vp;
633 int flags;
634 struct proc *p;
635#ifdef DEBUG_LOCKS
636 const char *filename;
637 int line;
638#endif
639{
640 int error;
641
642 do {
643 if ((flags & LK_INTERLOCK) == 0)
40 */
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/fcntl.h>
45#include <sys/file.h>
46#include <sys/stat.h>
47#include <sys/proc.h>
48#include <sys/mount.h>
49#include <sys/mutex.h>
50#include <sys/namei.h>
51#include <sys/vnode.h>
52#include <sys/bio.h>
53#include <sys/buf.h>
54#include <sys/filio.h>
55#include <sys/ttycom.h>
56#include <sys/conf.h>
57
58#include <ufs/ufs/quota.h>
59#include <ufs/ufs/inode.h>
60
61static int vn_closefile __P((struct file *fp, struct proc *p));
62static int vn_ioctl __P((struct file *fp, u_long com, caddr_t data,
63 struct proc *p));
64static int vn_read __P((struct file *fp, struct uio *uio,
65 struct ucred *cred, int flags, struct proc *p));
66static int vn_poll __P((struct file *fp, int events, struct ucred *cred,
67 struct proc *p));
68static int vn_statfile __P((struct file *fp, struct stat *sb, struct proc *p));
69static int vn_write __P((struct file *fp, struct uio *uio,
70 struct ucred *cred, int flags, struct proc *p));
71
72struct fileops vnops =
73 { vn_read, vn_write, vn_ioctl, vn_poll, vn_statfile, vn_closefile };
74
75static int filt_nullattach(struct knote *kn);
76static int filt_vnattach(struct knote *kn);
77static void filt_vndetach(struct knote *kn);
78static int filt_vnode(struct knote *kn, long hint);
79static int filt_vnread(struct knote *kn, long hint);
80
81struct filterops vn_filtops =
82 { 1, filt_vnattach, filt_vndetach, filt_vnode };
83
84/*
85 * XXX
86 * filt_vnread is ufs-specific, so the attach routine should really
87 * switch out to different filterops based on the vn filetype
88 */
89struct filterops vn_rwfiltops[] = {
90 { 1, filt_vnattach, filt_vndetach, filt_vnread },
91 { 1, filt_nullattach, NULL, NULL },
92};
93
94/*
95 * Common code for vnode open operations.
96 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
97 *
98 * Note that this does NOT free nameidata for the successful case,
99 * due to the NDINIT being done elsewhere.
100 */
101int
102vn_open(ndp, flagp, cmode)
103 register struct nameidata *ndp;
104 int *flagp, cmode;
105{
106 struct vnode *vp;
107 struct mount *mp;
108 struct proc *p = ndp->ni_cnd.cn_proc;
109 struct ucred *cred = p->p_ucred;
110 struct vattr vat;
111 struct vattr *vap = &vat;
112 int mode, fmode, error;
113
114restart:
115 fmode = *flagp;
116 if (fmode & O_CREAT) {
117 ndp->ni_cnd.cn_nameiop = CREATE;
118 ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
119 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
120 ndp->ni_cnd.cn_flags |= FOLLOW;
121 bwillwrite();
122 if ((error = namei(ndp)) != 0)
123 return (error);
124 if (ndp->ni_vp == NULL) {
125 VATTR_NULL(vap);
126 vap->va_type = VREG;
127 vap->va_mode = cmode;
128 if (fmode & O_EXCL)
129 vap->va_vaflags |= VA_EXCLUSIVE;
130 if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
131 NDFREE(ndp, NDF_ONLY_PNBUF);
132 vput(ndp->ni_dvp);
133 if ((error = vn_start_write(NULL, &mp,
134 V_XSLEEP | PCATCH)) != 0)
135 return (error);
136 goto restart;
137 }
138 VOP_LEASE(ndp->ni_dvp, p, cred, LEASE_WRITE);
139 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
140 &ndp->ni_cnd, vap);
141 vput(ndp->ni_dvp);
142 vn_finished_write(mp);
143 if (error) {
144 NDFREE(ndp, NDF_ONLY_PNBUF);
145 return (error);
146 }
147 ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
148 ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
149 fmode &= ~O_TRUNC;
150 vp = ndp->ni_vp;
151 } else {
152 if (ndp->ni_dvp == ndp->ni_vp)
153 vrele(ndp->ni_dvp);
154 else
155 vput(ndp->ni_dvp);
156 ndp->ni_dvp = NULL;
157 vp = ndp->ni_vp;
158 if (fmode & O_EXCL) {
159 error = EEXIST;
160 goto bad;
161 }
162 fmode &= ~O_CREAT;
163 }
164 } else {
165 ndp->ni_cnd.cn_nameiop = LOOKUP;
166 ndp->ni_cnd.cn_flags =
167 ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
168 if ((error = namei(ndp)) != 0)
169 return (error);
170 vp = ndp->ni_vp;
171 }
172 if (vp->v_type == VLNK) {
173 error = EMLINK;
174 goto bad;
175 }
176 if (vp->v_type == VSOCK) {
177 error = EOPNOTSUPP;
178 goto bad;
179 }
180 if ((fmode & O_CREAT) == 0) {
181 mode = 0;
182 if (fmode & (FWRITE | O_TRUNC)) {
183 if (vp->v_type == VDIR) {
184 error = EISDIR;
185 goto bad;
186 }
187 error = vn_writechk(vp);
188 if (error)
189 goto bad;
190 mode |= VWRITE;
191 }
192 if (fmode & FREAD)
193 mode |= VREAD;
194 if (mode) {
195 error = VOP_ACCESS(vp, mode, cred, p);
196 if (error)
197 goto bad;
198 }
199 }
200 if ((error = VOP_OPEN(vp, fmode, cred, p)) != 0)
201 goto bad;
202 /*
203 * Make sure that a VM object is created for VMIO support.
204 */
205 if (vn_canvmio(vp) == TRUE) {
206 if ((error = vfs_object_create(vp, p, cred)) != 0)
207 goto bad;
208 }
209
210 if (fmode & FWRITE)
211 vp->v_writecount++;
212 *flagp = fmode;
213 return (0);
214bad:
215 NDFREE(ndp, NDF_ONLY_PNBUF);
216 vput(vp);
217 *flagp = fmode;
218 return (error);
219}
220
221/*
222 * Check for write permissions on the specified vnode.
223 * Prototype text segments cannot be written.
224 */
225int
226vn_writechk(vp)
227 register struct vnode *vp;
228{
229
230 /*
231 * If there's shared text associated with
232 * the vnode, try to free it up once. If
233 * we fail, we can't allow writing.
234 */
235 if (vp->v_flag & VTEXT)
236 return (ETXTBSY);
237 return (0);
238}
239
240/*
241 * Vnode close call
242 */
243int
244vn_close(vp, flags, cred, p)
245 register struct vnode *vp;
246 int flags;
247 struct ucred *cred;
248 struct proc *p;
249{
250 int error;
251
252 if (flags & FWRITE)
253 vp->v_writecount--;
254 error = VOP_CLOSE(vp, flags, cred, p);
255 vrele(vp);
256 return (error);
257}
258
259static __inline
260int
261sequential_heuristic(struct uio *uio, struct file *fp)
262{
263 /*
264 * Sequential heuristic - detect sequential operation
265 */
266 if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
267 uio->uio_offset == fp->f_nextoff) {
268 /*
269 * XXX we assume that the filesystem block size is
270 * the default. Not true, but still gives us a pretty
271 * good indicator of how sequential the read operations
272 * are.
273 */
274 fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
275 if (fp->f_seqcount >= 127)
276 fp->f_seqcount = 127;
277 return(fp->f_seqcount << 16);
278 }
279
280 /*
281 * Not sequential, quick draw-down of seqcount
282 */
283 if (fp->f_seqcount > 1)
284 fp->f_seqcount = 1;
285 else
286 fp->f_seqcount = 0;
287 return(0);
288}
289
290/*
291 * Package up an I/O request on a vnode into a uio and do it.
292 */
293int
294vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p)
295 enum uio_rw rw;
296 struct vnode *vp;
297 caddr_t base;
298 int len;
299 off_t offset;
300 enum uio_seg segflg;
301 int ioflg;
302 struct ucred *cred;
303 int *aresid;
304 struct proc *p;
305{
306 struct uio auio;
307 struct iovec aiov;
308 struct mount *mp;
309 int error;
310
311 if ((ioflg & IO_NODELOCKED) == 0) {
312 mp = NULL;
313 if (rw == UIO_WRITE &&
314 vp->v_type != VCHR &&
315 (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
316 return (error);
317 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
318 }
319 auio.uio_iov = &aiov;
320 auio.uio_iovcnt = 1;
321 aiov.iov_base = base;
322 aiov.iov_len = len;
323 auio.uio_resid = len;
324 auio.uio_offset = offset;
325 auio.uio_segflg = segflg;
326 auio.uio_rw = rw;
327 auio.uio_procp = p;
328 if (rw == UIO_READ) {
329 error = VOP_READ(vp, &auio, ioflg, cred);
330 } else {
331 error = VOP_WRITE(vp, &auio, ioflg, cred);
332 }
333 if (aresid)
334 *aresid = auio.uio_resid;
335 else
336 if (auio.uio_resid && error == 0)
337 error = EIO;
338 if ((ioflg & IO_NODELOCKED) == 0) {
339 vn_finished_write(mp);
340 VOP_UNLOCK(vp, 0, p);
341 }
342 return (error);
343}
344
345/*
346 * File table vnode read routine.
347 */
348static int
349vn_read(fp, uio, cred, flags, p)
350 struct file *fp;
351 struct uio *uio;
352 struct ucred *cred;
353 struct proc *p;
354 int flags;
355{
356 struct vnode *vp;
357 int error, ioflag;
358
359 KASSERT(uio->uio_procp == p, ("uio_procp %p is not p %p",
360 uio->uio_procp, p));
361 vp = (struct vnode *)fp->f_data;
362 ioflag = 0;
363 if (fp->f_flag & FNONBLOCK)
364 ioflag |= IO_NDELAY;
365 VOP_LEASE(vp, p, cred, LEASE_READ);
366 vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
367 if ((flags & FOF_OFFSET) == 0)
368 uio->uio_offset = fp->f_offset;
369
370 ioflag |= sequential_heuristic(uio, fp);
371
372 error = VOP_READ(vp, uio, ioflag, cred);
373 if ((flags & FOF_OFFSET) == 0)
374 fp->f_offset = uio->uio_offset;
375 fp->f_nextoff = uio->uio_offset;
376 VOP_UNLOCK(vp, 0, p);
377 return (error);
378}
379
380/*
381 * File table vnode write routine.
382 */
383static int
384vn_write(fp, uio, cred, flags, p)
385 struct file *fp;
386 struct uio *uio;
387 struct ucred *cred;
388 struct proc *p;
389 int flags;
390{
391 struct vnode *vp;
392 struct mount *mp;
393 int error, ioflag;
394
395 KASSERT(uio->uio_procp == p, ("uio_procp %p is not p %p",
396 uio->uio_procp, p));
397 vp = (struct vnode *)fp->f_data;
398 if (vp->v_type == VREG)
399 bwillwrite();
400 vp = (struct vnode *)fp->f_data; /* XXX needed? */
401 ioflag = IO_UNIT;
402 if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
403 ioflag |= IO_APPEND;
404 if (fp->f_flag & FNONBLOCK)
405 ioflag |= IO_NDELAY;
406 if ((fp->f_flag & O_FSYNC) ||
407 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
408 ioflag |= IO_SYNC;
409 mp = NULL;
410 if (vp->v_type != VCHR &&
411 (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
412 return (error);
413 VOP_LEASE(vp, p, cred, LEASE_WRITE);
414 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
415 if ((flags & FOF_OFFSET) == 0)
416 uio->uio_offset = fp->f_offset;
417 ioflag |= sequential_heuristic(uio, fp);
418 error = VOP_WRITE(vp, uio, ioflag, cred);
419 if ((flags & FOF_OFFSET) == 0)
420 fp->f_offset = uio->uio_offset;
421 fp->f_nextoff = uio->uio_offset;
422 VOP_UNLOCK(vp, 0, p);
423 vn_finished_write(mp);
424 return (error);
425}
426
427/*
428 * File table vnode stat routine.
429 */
430static int
431vn_statfile(fp, sb, p)
432 struct file *fp;
433 struct stat *sb;
434 struct proc *p;
435{
436 struct vnode *vp = (struct vnode *)fp->f_data;
437
438 return vn_stat(vp, sb, p);
439}
440
441int
442vn_stat(vp, sb, p)
443 struct vnode *vp;
444 register struct stat *sb;
445 struct proc *p;
446{
447 struct vattr vattr;
448 register struct vattr *vap;
449 int error;
450 u_short mode;
451
452 vap = &vattr;
453 error = VOP_GETATTR(vp, vap, p->p_ucred, p);
454 if (error)
455 return (error);
456
457 /*
458 * Zero the spare stat fields
459 */
460 sb->st_lspare = 0;
461 sb->st_qspare[0] = 0;
462 sb->st_qspare[1] = 0;
463
464 /*
465 * Copy from vattr table
466 */
467 if (vap->va_fsid != VNOVAL)
468 sb->st_dev = vap->va_fsid;
469 else
470 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
471 sb->st_ino = vap->va_fileid;
472 mode = vap->va_mode;
473 switch (vap->va_type) {
474 case VREG:
475 mode |= S_IFREG;
476 break;
477 case VDIR:
478 mode |= S_IFDIR;
479 break;
480 case VBLK:
481 mode |= S_IFBLK;
482 break;
483 case VCHR:
484 mode |= S_IFCHR;
485 break;
486 case VLNK:
487 mode |= S_IFLNK;
488 /* This is a cosmetic change, symlinks do not have a mode. */
489 if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
490 sb->st_mode &= ~ACCESSPERMS; /* 0000 */
491 else
492 sb->st_mode |= ACCESSPERMS; /* 0777 */
493 break;
494 case VSOCK:
495 mode |= S_IFSOCK;
496 break;
497 case VFIFO:
498 mode |= S_IFIFO;
499 break;
500 default:
501 return (EBADF);
502 };
503 sb->st_mode = mode;
504 sb->st_nlink = vap->va_nlink;
505 sb->st_uid = vap->va_uid;
506 sb->st_gid = vap->va_gid;
507 sb->st_rdev = vap->va_rdev;
508 sb->st_size = vap->va_size;
509 sb->st_atimespec = vap->va_atime;
510 sb->st_mtimespec = vap->va_mtime;
511 sb->st_ctimespec = vap->va_ctime;
512
513 /*
514 * According to www.opengroup.org, the meaning of st_blksize is
515 * "a filesystem-specific preferred I/O block size for this
516 * object. In some filesystem types, this may vary from file
517 * to file"
518 * Default to zero to catch bogus uses of this field.
519 */
520
521 if (vap->va_type == VREG) {
522 sb->st_blksize = vap->va_blocksize;
523 } else if (vn_isdisk(vp, NULL)) {
524 sb->st_blksize = vp->v_rdev->si_bsize_best;
525 if (sb->st_blksize < vp->v_rdev->si_bsize_phys)
526 sb->st_blksize = vp->v_rdev->si_bsize_phys;
527 if (sb->st_blksize < BLKDEV_IOSIZE)
528 sb->st_blksize = BLKDEV_IOSIZE;
529 } else {
530 sb->st_blksize = 0;
531 }
532
533 sb->st_flags = vap->va_flags;
534 if (suser_xxx(p->p_ucred, 0, 0))
535 sb->st_gen = 0;
536 else
537 sb->st_gen = vap->va_gen;
538
539#if (S_BLKSIZE == 512)
540 /* Optimize this case */
541 sb->st_blocks = vap->va_bytes >> 9;
542#else
543 sb->st_blocks = vap->va_bytes / S_BLKSIZE;
544#endif
545 return (0);
546}
547
/*
 * File table vnode ioctl routine.
 *
 * Implements the fo_ioctl file operation for vnode-backed files.  A
 * few generic requests (FIONREAD, FIONBIO, FIOASYNC, FIODTYPE) are
 * handled here; everything else is forwarded via VOP_IOCTL().
 */
static int
vn_ioctl(fp, com, data, p)
	struct file *fp;	/* file table entry being operated on */
	u_long com;		/* ioctl command code */
	caddr_t data;		/* in/out argument buffer */
	struct proc *p;		/* calling process */
{
	register struct vnode *vp = ((struct vnode *)fp->f_data);
	struct vattr vattr;
	int error;

	switch (vp->v_type) {

	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			/*
			 * Report bytes between the current file offset and
			 * EOF.  NOTE(review): va_size - f_offset is narrowed
			 * to int here, so the count can truncate or go
			 * negative on very large files -- confirm callers
			 * tolerate this.
			 */
			error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
			if (error)
				return (error);
			*(int *)data = vattr.va_size - fp->f_offset;
			return (0);
		}
		/* Accepted but ignored for regular files/directories. */
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return (0);			/* XXX */
		/* fall into ... */

	default:
#if 0
		return (ENOTTY);
#endif
	case VFIFO:
	case VCHR:
	case VBLK:
		if (com == FIODTYPE) {
			/* Device type bits only make sense for devices. */
			if (vp->v_type != VCHR && vp->v_type != VBLK)
				return (ENOTTY);
			*(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK;
			return (0);
		}
		error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p);
		if (error == 0 && com == TIOCSCTTY) {

			/* Do nothing if reassigning same control tty */
			if (p->p_session->s_ttyvp == vp)
				return (0);

			/* Get rid of reference to old control tty */
			if (p->p_session->s_ttyvp)
				vrele(p->p_session->s_ttyvp);

			/* Record the new controlling tty; hold a reference. */
			p->p_session->s_ttyvp = vp;
			VREF(vp);
		}
		return (error);
	}
}
607
608/*
609 * File table vnode poll routine.
610 */
611static int
612vn_poll(fp, events, cred, p)
613 struct file *fp;
614 int events;
615 struct ucred *cred;
616 struct proc *p;
617{
618
619 return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, p));
620}
621
622/*
623 * Check that the vnode is still valid, and if so
624 * acquire requested lock.
625 */
626int
627#ifndef DEBUG_LOCKS
628vn_lock(vp, flags, p)
629#else
630debug_vn_lock(vp, flags, p, filename, line)
631#endif
632 struct vnode *vp;
633 int flags;
634 struct proc *p;
635#ifdef DEBUG_LOCKS
636 const char *filename;
637 int line;
638#endif
639{
640 int error;
641
642 do {
643 if ((flags & LK_INTERLOCK) == 0)
644 mtx_enter(&vp->v_interlock, MTX_DEF);
644 mtx_lock(&vp->v_interlock);
645 if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curproc) {
646 vp->v_flag |= VXWANT;
645 if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curproc) {
646 vp->v_flag |= VXWANT;
647 mtx_exit(&vp->v_interlock, MTX_DEF);
647 mtx_unlock(&vp->v_interlock);
648 tsleep((caddr_t)vp, PINOD, "vn_lock", 0);
649 error = ENOENT;
650 } else {
651 if (vp->v_vxproc != NULL)
652 printf("VXLOCK interlock avoided in vn_lock\n");
653#ifdef DEBUG_LOCKS
654 vp->filename = filename;
655 vp->line = line;
656#endif
657 error = VOP_LOCK(vp,
658 flags | LK_NOPAUSE | LK_INTERLOCK, p);
659 if (error == 0)
660 return (error);
661 }
662 flags &= ~LK_INTERLOCK;
663 } while (flags & LK_RETRY);
664 return (error);
665}
666
667/*
668 * File table vnode close routine.
669 */
670static int
671vn_closefile(fp, p)
672 struct file *fp;
673 struct proc *p;
674{
675
676 fp->f_ops = &badfileops;
677 return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
678 fp->f_cred, p));
679}
680
/*
 * Preparing to start a filesystem write operation. If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed. If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 *
 * On success *mpp holds the mount point charged with the operation
 * (possibly NULL); the caller must pair this with vn_finished_write().
 * Flags: V_NOWAIT returns EWOULDBLOCK instead of sleeping; V_XSLEEP
 * waits out any suspension but does not account the write; PCATCH
 * makes the sleep interruptible by signals.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

	/*
	 * If a vnode is provided, get and return the mount point that
	 * to which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			/*
			 * EOPNOTSUPP means this filesystem does not track
			 * write mounts; treat the write as unaccounted.
			 */
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);
	/*
	 * Check on status of suspension.
	 */
	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		if (flags & V_NOWAIT)
			return (EWOULDBLOCK);
		/* Sleep until vfs_write_resume() wakes mnt_flag waiters. */
		error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
		    "suspfs", 0);
		if (error)
			return (error);
	}
	if (flags & V_XSLEEP)
		return (0);
	/* Account this write; vn_finished_write() drops the count. */
	mp->mnt_writeopcount++;
	return (0);
}
726
/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 *
 * Returns 0 when the operation may proceed, EWOULDBLOCK if V_NOWAIT
 * is set and the filesystem is fully suspended, or a tsleep() error
 * (e.g. EINTR when PCATCH is passed in flags).
 */
int
vn_write_suspend_wait(vp, mp, flags)
	struct vnode *vp;
	struct mount *mp;
	int flags;
{
	int error;

	/* Prefer the vnode's write mount when a vnode was supplied. */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
			/* EOPNOTSUPP: fs doesn't track write mounts. */
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
		return (0);
	if (flags & V_NOWAIT)
		return (EWOULDBLOCK);
	/*
	 * Wait for the suspension to finish.
	 */
	return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
	    "suspfs", 0));
}
763
764/*
765 * Filesystem write operation has completed. If we are suspending and this
766 * operation is the last one, notify the suspender that the suspension is
767 * now in effect.
768 */
769void
770vn_finished_write(mp)
771 struct mount *mp;
772{
773
774 if (mp == NULL)
775 return;
776 mp->mnt_writeopcount--;
777 if (mp->mnt_writeopcount < 0)
778 panic("vn_finished_write: neg cnt");
779 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
780 mp->mnt_writeopcount <= 0)
781 wakeup(&mp->mnt_writeopcount);
782}
783
/*
 * Request a filesystem to suspend write operations.
 *
 * Sets MNTK_SUSPEND to block new primary writes, waits for in-flight
 * writes to drain (vn_finished_write() wakes mnt_writeopcount when the
 * count reaches zero), syncs the filesystem, and finally sets
 * MNTK_SUSPENDED to halt secondary writes as well.
 */
void
vfs_write_suspend(mp)
	struct mount *mp;
{
	struct proc *p = curproc;

	/* Already suspending/suspended; nothing to do. */
	if (mp->mnt_kern_flag & MNTK_SUSPEND)
		return;
	mp->mnt_kern_flag |= MNTK_SUSPEND;
	/* Wait for outstanding primary writes to finish. */
	if (mp->mnt_writeopcount > 0)
		(void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0);
	VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p);
	mp->mnt_kern_flag |= MNTK_SUSPENDED;
}
801
802/*
803 * Request a filesystem to resume write operations.
804 */
805void
806vfs_write_resume(mp)
807 struct mount *mp;
808{
809
810 if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0)
811 return;
812 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
813 wakeup(&mp->mnt_writeopcount);
814 wakeup(&mp->mnt_flag);
815}
816
817static int
818filt_vnattach(struct knote *kn)
819{
820 struct vnode *vp;
821
822 if (kn->kn_fp->f_type != DTYPE_VNODE &&
823 kn->kn_fp->f_type != DTYPE_FIFO)
824 return (EBADF);
825
826 vp = (struct vnode *)kn->kn_fp->f_data;
827
828 /*
829 * XXX
830 * this is a hack simply to cause the filter attach to fail
831 * for non-ufs filesystems, until the support for them is done.
832 */
833 if ((vp)->v_tag != VT_UFS)
834 return (EOPNOTSUPP);
835
648 tsleep((caddr_t)vp, PINOD, "vn_lock", 0);
649 error = ENOENT;
650 } else {
651 if (vp->v_vxproc != NULL)
652 printf("VXLOCK interlock avoided in vn_lock\n");
653#ifdef DEBUG_LOCKS
654 vp->filename = filename;
655 vp->line = line;
656#endif
657 error = VOP_LOCK(vp,
658 flags | LK_NOPAUSE | LK_INTERLOCK, p);
659 if (error == 0)
660 return (error);
661 }
662 flags &= ~LK_INTERLOCK;
663 } while (flags & LK_RETRY);
664 return (error);
665}
666
667/*
668 * File table vnode close routine.
669 */
670static int
671vn_closefile(fp, p)
672 struct file *fp;
673 struct proc *p;
674{
675
676 fp->f_ops = &badfileops;
677 return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
678 fp->f_cred, p));
679}
680
681/*
682 * Preparing to start a filesystem write operation. If the operation is
683 * permitted, then we bump the count of operations in progress and
684 * proceed. If a suspend request is in progress, we wait until the
685 * suspension is over, and then proceed.
686 */
687int
688vn_start_write(vp, mpp, flags)
689 struct vnode *vp;
690 struct mount **mpp;
691 int flags;
692{
693 struct mount *mp;
694 int error;
695
696 /*
697 * If a vnode is provided, get and return the mount point that
698 * to which it will write.
699 */
700 if (vp != NULL) {
701 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
702 *mpp = NULL;
703 if (error != EOPNOTSUPP)
704 return (error);
705 return (0);
706 }
707 }
708 if ((mp = *mpp) == NULL)
709 return (0);
710 /*
711 * Check on status of suspension.
712 */
713 while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
714 if (flags & V_NOWAIT)
715 return (EWOULDBLOCK);
716 error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
717 "suspfs", 0);
718 if (error)
719 return (error);
720 }
721 if (flags & V_XSLEEP)
722 return (0);
723 mp->mnt_writeopcount++;
724 return (0);
725}
726
727/*
728 * Secondary suspension. Used by operations such as vop_inactive
729 * routines that are needed by the higher level functions. These
730 * are allowed to proceed until all the higher level functions have
731 * completed (indicated by mnt_writeopcount dropping to zero). At that
732 * time, these operations are halted until the suspension is over.
733 */
734int
735vn_write_suspend_wait(vp, mp, flags)
736 struct vnode *vp;
737 struct mount *mp;
738 int flags;
739{
740 int error;
741
742 if (vp != NULL) {
743 if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
744 if (error != EOPNOTSUPP)
745 return (error);
746 return (0);
747 }
748 }
749 /*
750 * If we are not suspended or have not yet reached suspended
751 * mode, then let the operation proceed.
752 */
753 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
754 return (0);
755 if (flags & V_NOWAIT)
756 return (EWOULDBLOCK);
757 /*
758 * Wait for the suspension to finish.
759 */
760 return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
761 "suspfs", 0));
762}
763
764/*
765 * Filesystem write operation has completed. If we are suspending and this
766 * operation is the last one, notify the suspender that the suspension is
767 * now in effect.
768 */
769void
770vn_finished_write(mp)
771 struct mount *mp;
772{
773
774 if (mp == NULL)
775 return;
776 mp->mnt_writeopcount--;
777 if (mp->mnt_writeopcount < 0)
778 panic("vn_finished_write: neg cnt");
779 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
780 mp->mnt_writeopcount <= 0)
781 wakeup(&mp->mnt_writeopcount);
782}
783
784/*
785 * Request a filesystem to suspend write operations.
786 */
787void
788vfs_write_suspend(mp)
789 struct mount *mp;
790{
791 struct proc *p = curproc;
792
793 if (mp->mnt_kern_flag & MNTK_SUSPEND)
794 return;
795 mp->mnt_kern_flag |= MNTK_SUSPEND;
796 if (mp->mnt_writeopcount > 0)
797 (void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0);
798 VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p);
799 mp->mnt_kern_flag |= MNTK_SUSPENDED;
800}
801
802/*
803 * Request a filesystem to resume write operations.
804 */
805void
806vfs_write_resume(mp)
807 struct mount *mp;
808{
809
810 if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0)
811 return;
812 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
813 wakeup(&mp->mnt_writeopcount);
814 wakeup(&mp->mnt_flag);
815}
816
817static int
818filt_vnattach(struct knote *kn)
819{
820 struct vnode *vp;
821
822 if (kn->kn_fp->f_type != DTYPE_VNODE &&
823 kn->kn_fp->f_type != DTYPE_FIFO)
824 return (EBADF);
825
826 vp = (struct vnode *)kn->kn_fp->f_data;
827
828 /*
829 * XXX
830 * this is a hack simply to cause the filter attach to fail
831 * for non-ufs filesystems, until the support for them is done.
832 */
833 if ((vp)->v_tag != VT_UFS)
834 return (EOPNOTSUPP);
835
836 mtx_enter(&vp->v_pollinfo.vpi_lock, MTX_DEF);
836 mtx_lock(&vp->v_pollinfo.vpi_lock);
837 SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
837 SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
838 mtx_exit(&vp->v_pollinfo.vpi_lock, MTX_DEF);
838 mtx_unlock(&vp->v_pollinfo.vpi_lock);
839
840 return (0);
841}
842
843static void
844filt_vndetach(struct knote *kn)
845{
846 struct vnode *vp = (struct vnode *)kn->kn_fp->f_data;
847
839
840 return (0);
841}
842
843static void
844filt_vndetach(struct knote *kn)
845{
846 struct vnode *vp = (struct vnode *)kn->kn_fp->f_data;
847
848 mtx_enter(&vp->v_pollinfo.vpi_lock, MTX_DEF);
848 mtx_lock(&vp->v_pollinfo.vpi_lock);
849 SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
850 kn, knote, kn_selnext);
849 SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
850 kn, knote, kn_selnext);
851 mtx_exit(&vp->v_pollinfo.vpi_lock, MTX_DEF);
851 mtx_unlock(&vp->v_pollinfo.vpi_lock);
852}
853
854static int
855filt_vnode(struct knote *kn, long hint)
856{
857
858 if (kn->kn_sfflags & hint)
859 kn->kn_fflags |= hint;
860 return (kn->kn_fflags != 0);
861}
862
/*
 * Refuse knote attachment; used for filter slots with no backing
 * implementation for this file type.
 */
static int
filt_nullattach(struct knote *kn)
{
	return (ENXIO);
}
868
/*ARGSUSED*/
/*
 * Read filter: report the number of bytes between the file's current
 * offset and end of file.
 *
 * NOTE(review): reaches directly into the UFS inode (VTOI/i_size),
 * so this only works for UFS vnodes -- consistent with the VT_UFS
 * restriction in filt_vnattach().
 */
static int
filt_vnread(struct knote *kn, long hint)
{
	struct vnode *vp = (struct vnode *)kn->kn_fp->f_data;
	struct inode *ip = VTOI(vp);

	kn->kn_data = ip->i_size - kn->kn_fp->f_offset;
	return (kn->kn_data != 0);
}
879
880/*
881 * Simplified in-kernel wrapper calls for extended attribute access.
882 * Both calls pass in a NULL credential, authorizing as "kernel" access.
883 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
884 */
885int
886vn_extattr_get(struct vnode *vp, int ioflg, const char *attrname, int *buflen,
887 char *buf, struct proc *p)
888{
889 struct uio auio;
890 struct iovec iov;
891 int error;
892
893 iov.iov_len = *buflen;
894 iov.iov_base = buf;
895
896 auio.uio_iov = &iov;
897 auio.uio_iovcnt = 1;
898 auio.uio_rw = UIO_READ;
899 auio.uio_segflg = UIO_SYSSPACE;
900 auio.uio_procp = p;
901 auio.uio_offset = 0;
902 auio.uio_resid = *buflen;
903
904 if ((ioflg & IO_NODELOCKED) == 0)
905 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
906
907 /* authorize attribute retrieval as kernel */
908 error = VOP_GETEXTATTR(vp, attrname, &auio, NULL, p);
909
910 if ((ioflg & IO_NODELOCKED) == 0)
911 VOP_UNLOCK(vp, 0, p);
912
913 if (error == 0) {
914 *buflen = *buflen - auio.uio_resid;
915 }
916
917 return (error);
918}
919
920/*
921 * XXX failure mode if partially written?
922 */
923int
924vn_extattr_set(struct vnode *vp, int ioflg, const char *attrname, int buflen,
925 char *buf, struct proc *p)
926{
927 struct uio auio;
928 struct iovec iov;
929 struct mount *mp;
930 int error;
931
932 iov.iov_len = buflen;
933 iov.iov_base = buf;
934
935 auio.uio_iov = &iov;
936 auio.uio_iovcnt = 1;
937 auio.uio_rw = UIO_WRITE;
938 auio.uio_segflg = UIO_SYSSPACE;
939 auio.uio_procp = p;
940 auio.uio_offset = 0;
941 auio.uio_resid = buflen;
942
943 if ((ioflg & IO_NODELOCKED) == 0) {
944 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
945 return (error);
946 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
947 }
948
949 /* authorize attribute setting as kernel */
950 error = VOP_SETEXTATTR(vp, attrname, &auio, NULL, p);
951
952 if ((ioflg & IO_NODELOCKED) == 0) {
953 vn_finished_write(mp);
954 VOP_UNLOCK(vp, 0, p);
955 }
956
957 return (error);
958}
959
960int
961vn_extattr_rm(struct vnode *vp, int ioflg, const char *attrname, struct proc *p)
962{
963 struct mount *mp;
964 int error;
965
966 if ((ioflg & IO_NODELOCKED) == 0) {
967 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
968 return (error);
969 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
970 }
971
972 /* authorize attribute removal as kernel */
973 error = VOP_SETEXTATTR(vp, attrname, NULL, NULL, p);
974
975 if ((ioflg & IO_NODELOCKED) == 0) {
976 vn_finished_write(mp);
977 VOP_UNLOCK(vp, 0, p);
978 }
979
980 return (error);
981}
852}
853
854static int
855filt_vnode(struct knote *kn, long hint)
856{
857
858 if (kn->kn_sfflags & hint)
859 kn->kn_fflags |= hint;
860 return (kn->kn_fflags != 0);
861}
862
863static int
864filt_nullattach(struct knote *kn)
865{
866 return (ENXIO);
867}
868
869/*ARGSUSED*/
870static int
871filt_vnread(struct knote *kn, long hint)
872{
873 struct vnode *vp = (struct vnode *)kn->kn_fp->f_data;
874 struct inode *ip = VTOI(vp);
875
876 kn->kn_data = ip->i_size - kn->kn_fp->f_offset;
877 return (kn->kn_data != 0);
878}
879
880/*
881 * Simplified in-kernel wrapper calls for extended attribute access.
882 * Both calls pass in a NULL credential, authorizing as "kernel" access.
883 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
884 */
885int
886vn_extattr_get(struct vnode *vp, int ioflg, const char *attrname, int *buflen,
887 char *buf, struct proc *p)
888{
889 struct uio auio;
890 struct iovec iov;
891 int error;
892
893 iov.iov_len = *buflen;
894 iov.iov_base = buf;
895
896 auio.uio_iov = &iov;
897 auio.uio_iovcnt = 1;
898 auio.uio_rw = UIO_READ;
899 auio.uio_segflg = UIO_SYSSPACE;
900 auio.uio_procp = p;
901 auio.uio_offset = 0;
902 auio.uio_resid = *buflen;
903
904 if ((ioflg & IO_NODELOCKED) == 0)
905 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
906
907 /* authorize attribute retrieval as kernel */
908 error = VOP_GETEXTATTR(vp, attrname, &auio, NULL, p);
909
910 if ((ioflg & IO_NODELOCKED) == 0)
911 VOP_UNLOCK(vp, 0, p);
912
913 if (error == 0) {
914 *buflen = *buflen - auio.uio_resid;
915 }
916
917 return (error);
918}
919
920/*
921 * XXX failure mode if partially written?
922 */
923int
924vn_extattr_set(struct vnode *vp, int ioflg, const char *attrname, int buflen,
925 char *buf, struct proc *p)
926{
927 struct uio auio;
928 struct iovec iov;
929 struct mount *mp;
930 int error;
931
932 iov.iov_len = buflen;
933 iov.iov_base = buf;
934
935 auio.uio_iov = &iov;
936 auio.uio_iovcnt = 1;
937 auio.uio_rw = UIO_WRITE;
938 auio.uio_segflg = UIO_SYSSPACE;
939 auio.uio_procp = p;
940 auio.uio_offset = 0;
941 auio.uio_resid = buflen;
942
943 if ((ioflg & IO_NODELOCKED) == 0) {
944 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
945 return (error);
946 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
947 }
948
949 /* authorize attribute setting as kernel */
950 error = VOP_SETEXTATTR(vp, attrname, &auio, NULL, p);
951
952 if ((ioflg & IO_NODELOCKED) == 0) {
953 vn_finished_write(mp);
954 VOP_UNLOCK(vp, 0, p);
955 }
956
957 return (error);
958}
959
960int
961vn_extattr_rm(struct vnode *vp, int ioflg, const char *attrname, struct proc *p)
962{
963 struct mount *mp;
964 int error;
965
966 if ((ioflg & IO_NODELOCKED) == 0) {
967 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
968 return (error);
969 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
970 }
971
972 /* authorize attribute removal as kernel */
973 error = VOP_SETEXTATTR(vp, attrname, NULL, NULL, p);
974
975 if ((ioflg & IO_NODELOCKED) == 0) {
976 vn_finished_write(mp);
977 VOP_UNLOCK(vp, 0, p);
978 }
979
980 return (error);
981}