vfs_vnops.c: revision 90946 vs. revision 91140
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
39 * $FreeBSD: head/sys/kern/vfs_vnops.c 90946 2002-02-20 00:11:57Z rwatson $
39 * $FreeBSD: head/sys/kern/vfs_vnops.c 91140 2002-02-23 11:12:57Z tanimura $
40 */
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/fcntl.h>
45#include <sys/file.h>
46#include <sys/stat.h>
47#include <sys/proc.h>
48#include <sys/lock.h>
49#include <sys/mount.h>
50#include <sys/mutex.h>
51#include <sys/namei.h>
52#include <sys/vnode.h>
53#include <sys/bio.h>
54#include <sys/buf.h>
55#include <sys/filio.h>
56#include <sys/ttycom.h>
57#include <sys/conf.h>
58#include <sys/syslog.h>
59
60#include <machine/limits.h>
61
62static int vn_closefile __P((struct file *fp, struct thread *td));
63static int vn_ioctl __P((struct file *fp, u_long com, caddr_t data,
64 struct thread *td));
65static int vn_read __P((struct file *fp, struct uio *uio,
66 struct ucred *cred, int flags, struct thread *td));
67static int vn_poll __P((struct file *fp, int events, struct ucred *cred,
68 struct thread *td));
69static int vn_kqfilter __P((struct file *fp, struct knote *kn));
70static int vn_statfile __P((struct file *fp, struct stat *sb, struct thread *td));
71static int vn_write __P((struct file *fp, struct uio *uio,
72 struct ucred *cred, int flags, struct thread *td));
73
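/*
 * A positional initializer: the slots are, in declaration order,
 * fo_read, fo_write, fo_ioctl, fo_poll, fo_kqfilter, fo_stat and
 * fo_close (see struct fileops in sys/file.h).
 */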
74struct fileops vnops = {
75 vn_read, vn_write, vn_ioctl, vn_poll, vn_kqfilter,
76 vn_statfile, vn_closefile
77};
78
79int
80vn_open(ndp, flagp, cmode)
81 register struct nameidata *ndp;
82 int *flagp, cmode;
83{
84 struct thread *td = ndp->ni_cnd.cn_thread;
85
86 return (vn_open_cred(ndp, flagp, cmode, td->td_proc->p_ucred));
87}
88
89/*
90 * Common code for vnode open operations.
91 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
92 *
 93 * Note that this does NOT free the nameidata on success, since the
 94 * NDINIT is done by the caller.
95 */
96int
97vn_open_cred(ndp, flagp, cmode, cred)
98 register struct nameidata *ndp;
99 int *flagp, cmode;
100 struct ucred *cred;
101{
102 struct vnode *vp;
103 struct mount *mp;
104 struct thread *td = ndp->ni_cnd.cn_thread;
105 struct vattr vat;
106 struct vattr *vap = &vat;
107 int mode, fmode, error;
108
109restart:
110 fmode = *flagp;
111 if (fmode & O_CREAT) {
112 ndp->ni_cnd.cn_nameiop = CREATE;
113 ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
114 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
115 ndp->ni_cnd.cn_flags |= FOLLOW;
116 bwillwrite();
117 if ((error = namei(ndp)) != 0)
118 return (error);
119 if (ndp->ni_vp == NULL) {
120 VATTR_NULL(vap);
121 vap->va_type = VREG;
122 vap->va_mode = cmode;
123 if (fmode & O_EXCL)
124 vap->va_vaflags |= VA_EXCLUSIVE;
125 if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
126 NDFREE(ndp, NDF_ONLY_PNBUF);
127 vput(ndp->ni_dvp);
128 if ((error = vn_start_write(NULL, &mp,
129 V_XSLEEP | PCATCH)) != 0)
130 return (error);
131 goto restart;
132 }
133 VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
134 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
135 &ndp->ni_cnd, vap);
136 vput(ndp->ni_dvp);
137 vn_finished_write(mp);
138 if (error) {
139 NDFREE(ndp, NDF_ONLY_PNBUF);
140 return (error);
141 }
142 ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
143 ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
144 fmode &= ~O_TRUNC;
145 vp = ndp->ni_vp;
146 } else {
147 if (ndp->ni_dvp == ndp->ni_vp)
148 vrele(ndp->ni_dvp);
149 else
150 vput(ndp->ni_dvp);
151 ndp->ni_dvp = NULL;
152 vp = ndp->ni_vp;
153 if (fmode & O_EXCL) {
154 error = EEXIST;
155 goto bad;
156 }
157 fmode &= ~O_CREAT;
158 }
159 } else {
160 ndp->ni_cnd.cn_nameiop = LOOKUP;
161 ndp->ni_cnd.cn_flags =
162 ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
163 if ((error = namei(ndp)) != 0)
164 return (error);
165 vp = ndp->ni_vp;
166 }
167 if (vp->v_type == VLNK) {
168 error = EMLINK;
169 goto bad;
170 }
171 if (vp->v_type == VSOCK) {
172 error = EOPNOTSUPP;
173 goto bad;
174 }
175 if ((fmode & O_CREAT) == 0) {
176 mode = 0;
177 if (fmode & (FWRITE | O_TRUNC)) {
178 if (vp->v_type == VDIR) {
179 error = EISDIR;
180 goto bad;
181 }
182 error = vn_writechk(vp);
183 if (error)
184 goto bad;
185 mode |= VWRITE;
186 }
187 if (fmode & FREAD)
188 mode |= VREAD;
189 if (mode) {
190 error = VOP_ACCESS(vp, mode, cred, td);
191 if (error)
192 goto bad;
193 }
194 }
195 if ((error = VOP_OPEN(vp, fmode, cred, td)) != 0)
196 goto bad;
197 /*
198 * Make sure that a VM object is created for VMIO support.
199 */
200 if (vn_canvmio(vp) == TRUE) {
201 if ((error = vfs_object_create(vp, td, cred)) != 0) {
202 VOP_UNLOCK(vp, 0, td);
203 VOP_CLOSE(vp, fmode, cred, td);
204 NDFREE(ndp, NDF_ONLY_PNBUF);
205 vrele(vp);
206 *flagp = fmode;
207 return (error);
208 }
209 }
210
211 if (fmode & FWRITE)
212 vp->v_writecount++;
213 *flagp = fmode;
214 return (0);
215bad:
216 NDFREE(ndp, NDF_ONLY_PNBUF);
217 vput(vp);
218 *flagp = fmode;
219 return (error);
220}
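/*
 * Illustrative sketch of a typical in-kernel caller (hypothetical;
 * "nd", "flags" and "path" are placeholder names, not part of this
 * file).  The caller performs the NDINIT, so it also owns the
 * nameidata cleanup on success:
 */
#if 0
	struct nameidata nd;
	int flags, error;

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td);
	flags = FREAD;
	if ((error = vn_open(&nd, &flags, 0)) != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/* vn_open() returns with the vnode locked (LOCKLEAF). */
	VOP_UNLOCK(nd.ni_vp, 0, td);
	/* ... use nd.ni_vp ... */
	error = vn_close(nd.ni_vp, FREAD, td->td_proc->p_ucred, td);
#endif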
221
222/*
223 * Check for write permissions on the specified vnode.
224 * Prototype text segments cannot be written.
225 */
226int
227vn_writechk(vp)
228 register struct vnode *vp;
229{
230
231 /*
 232 * If there's shared text associated with the
 233 * vnode, we cannot allow writing: the mapped
 234 * text would change beneath running programs.
235 */
236 if (vp->v_flag & VTEXT)
237 return (ETXTBSY);
238 return (0);
239}
240
241/*
242 * Vnode close call
243 */
244int
245vn_close(vp, flags, cred, td)
246 register struct vnode *vp;
247 int flags;
248 struct ucred *cred;
249 struct thread *td;
250{
251 int error;
252
253 if (flags & FWRITE)
254 vp->v_writecount--;
255 error = VOP_CLOSE(vp, flags, cred, td);
256 /*
257 * XXX - In certain instances VOP_CLOSE has to do the vrele
258 * itself. If the vrele has been done, it will return EAGAIN
259 * to indicate that the vrele should not be done again. When
260 * this happens, we just return success. The correct thing to
261 * do would be to have all VOP_CLOSE instances do the vrele.
262 */
263 if (error == EAGAIN)
264 return (0);
265 vrele(vp);
266 return (error);
267}
268
269static __inline
270int
271sequential_heuristic(struct uio *uio, struct file *fp)
272{
273
274 /*
275 * Sequential heuristic - detect sequential operation
276 */
277 if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
278 uio->uio_offset == fp->f_nextoff) {
279 /*
280 * XXX we assume that the filesystem block size is
281 * the default. Not true, but still gives us a pretty
282 * good indicator of how sequential the read operations
283 * are.
284 */
285 fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
286 if (fp->f_seqcount >= 127)
287 fp->f_seqcount = 127;
288 return(fp->f_seqcount << 16);
289 }
290
291 /*
292 * Not sequential, quick draw-down of seqcount
293 */
294 if (fp->f_seqcount > 1)
295 fp->f_seqcount = 1;
296 else
297 fp->f_seqcount = 0;
298 return(0);
299}
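/*
 * The return value above is OR'ed into the ioflag handed to
 * VOP_READ()/VOP_WRITE() (see vn_read()/vn_write() below): the
 * saturated sequential count (0..127) travels in bits 16 and up,
 * leaving the low bits free for the IO_* flags.
 */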
300
301/*
302 * Package up an I/O request on a vnode into a uio and do it.
303 */
304int
305vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td)
306 enum uio_rw rw;
307 struct vnode *vp;
308 caddr_t base;
309 int len;
310 off_t offset;
311 enum uio_seg segflg;
312 int ioflg;
313 struct ucred *cred;
314 int *aresid;
315 struct thread *td;
316{
317 struct uio auio;
318 struct iovec aiov;
319 struct mount *mp;
320 int error;
321
322 if ((ioflg & IO_NODELOCKED) == 0) {
323 mp = NULL;
324 if (rw == UIO_WRITE &&
325 vp->v_type != VCHR &&
326 (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
327 return (error);
328 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
329 }
330 auio.uio_iov = &aiov;
331 auio.uio_iovcnt = 1;
332 aiov.iov_base = base;
333 aiov.iov_len = len;
334 auio.uio_resid = len;
335 auio.uio_offset = offset;
336 auio.uio_segflg = segflg;
337 auio.uio_rw = rw;
338 auio.uio_td = td;
339 if (rw == UIO_READ) {
340 error = VOP_READ(vp, &auio, ioflg, cred);
341 } else {
342 error = VOP_WRITE(vp, &auio, ioflg, cred);
343 }
344 if (aresid)
345 *aresid = auio.uio_resid;
346 else
347 if (auio.uio_resid && error == 0)
348 error = EIO;
349 if ((ioflg & IO_NODELOCKED) == 0) {
350 vn_finished_write(mp);
351 VOP_UNLOCK(vp, 0, td);
352 }
353 return (error);
354}
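/*
 * Illustrative sketch (hypothetical; "kbuf" and "resid" are
 * placeholder names): reading the first 512 bytes of an unlocked
 * vnode into a kernel buffer.
 */
#if 0
	char kbuf[512];
	int resid, error;

	error = vn_rdwr(UIO_READ, vp, (caddr_t)kbuf, sizeof(kbuf),
	    (off_t)0, UIO_SYSSPACE, 0, td->td_proc->p_ucred, &resid, td);
	/* on success, sizeof(kbuf) - resid bytes were actually read */
#endif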
355
356/*
357 * Package up an I/O request on a vnode into a uio and do it. The I/O
358 * request is split up into smaller chunks and we try to avoid saturating
359 * the buffer cache while potentially holding a vnode locked, so we
360 * check bwillwrite() before calling vn_rdwr(). We also call uio_yield()
361 * to give other processes a chance to lock the vnode (either other processes
362 * core'ing the same binary, or unrelated processes scanning the directory).
363 */
364int
365vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td)
366 enum uio_rw rw;
367 struct vnode *vp;
368 caddr_t base;
369 int len;
370 off_t offset;
371 enum uio_seg segflg;
372 int ioflg;
373 struct ucred *cred;
374 int *aresid;
375 struct thread *td;
376{
377 int error = 0;
378
379 do {
380 int chunk = (len > MAXBSIZE) ? MAXBSIZE : len;
381
382 if (rw != UIO_READ && vp->v_type == VREG)
383 bwillwrite();
384 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
385 ioflg, cred, aresid, td);
386 len -= chunk; /* aresid calc already includes length */
387 if (error)
388 break;
389 offset += chunk;
390 base += chunk;
391 uio_yield();
392 } while (len);
393 if (aresid)
394 *aresid += len;
395 return (error);
396}
397
398/*
399 * File table vnode read routine.
400 */
401static int
402vn_read(fp, uio, cred, flags, td)
403 struct file *fp;
404 struct uio *uio;
405 struct ucred *cred;
406 struct thread *td;
407 int flags;
408{
409 struct vnode *vp;
410 int error, ioflag;
411
412 KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
413 uio->uio_td, td));
414 vp = (struct vnode *)fp->f_data;
415 ioflag = 0;
416 if (fp->f_flag & FNONBLOCK)
417 ioflag |= IO_NDELAY;
418 if (fp->f_flag & O_DIRECT)
419 ioflag |= IO_DIRECT;
420 VOP_LEASE(vp, td, cred, LEASE_READ);
421 vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
422 if ((flags & FOF_OFFSET) == 0)
423 uio->uio_offset = fp->f_offset;
424
425 ioflag |= sequential_heuristic(uio, fp);
426
427 error = VOP_READ(vp, uio, ioflag, cred);
428 if ((flags & FOF_OFFSET) == 0)
429 fp->f_offset = uio->uio_offset;
430 fp->f_nextoff = uio->uio_offset;
431 VOP_UNLOCK(vp, 0, td);
432 return (error);
433}
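/*
 * In both vn_read() and vn_write(), FOF_OFFSET distinguishes the
 * pread(2)/pwrite(2) style of call, where the caller supplies
 * uio_offset, from the read(2)/write(2) style, where the descriptor's
 * f_offset is used and then updated.
 */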
434
435/*
436 * File table vnode write routine.
437 */
438static int
439vn_write(fp, uio, cred, flags, td)
440 struct file *fp;
441 struct uio *uio;
442 struct ucred *cred;
443 struct thread *td;
444 int flags;
445{
446 struct vnode *vp;
447 struct mount *mp;
448 int error, ioflag;
449
450 KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
451 uio->uio_td, td));
452 vp = (struct vnode *)fp->f_data;
453 if (vp->v_type == VREG)
454 bwillwrite();
455 ioflag = IO_UNIT;
456 if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
457 ioflag |= IO_APPEND;
458 if (fp->f_flag & FNONBLOCK)
459 ioflag |= IO_NDELAY;
460 if (fp->f_flag & O_DIRECT)
461 ioflag |= IO_DIRECT;
462 if ((fp->f_flag & O_FSYNC) ||
463 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
464 ioflag |= IO_SYNC;
465 mp = NULL;
466 if (vp->v_type != VCHR &&
467 (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
468 return (error);
469 VOP_LEASE(vp, td, cred, LEASE_WRITE);
470 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
471 if ((flags & FOF_OFFSET) == 0)
472 uio->uio_offset = fp->f_offset;
473 ioflag |= sequential_heuristic(uio, fp);
474 error = VOP_WRITE(vp, uio, ioflag, cred);
475 if ((flags & FOF_OFFSET) == 0)
476 fp->f_offset = uio->uio_offset;
477 fp->f_nextoff = uio->uio_offset;
478 VOP_UNLOCK(vp, 0, td);
479 vn_finished_write(mp);
480 return (error);
481}
482
483/*
484 * File table vnode stat routine.
485 */
486static int
487vn_statfile(fp, sb, td)
488 struct file *fp;
489 struct stat *sb;
490 struct thread *td;
491{
492 struct vnode *vp = (struct vnode *)fp->f_data;
493 int error;
494
495 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
496 error = vn_stat(vp, sb, td);
497 VOP_UNLOCK(vp, 0, td);
498
499 return (error);
500}
501
502int
503vn_stat(vp, sb, td)
504 struct vnode *vp;
505 register struct stat *sb;
506 struct thread *td;
507{
508 struct vattr vattr;
509 register struct vattr *vap;
510 int error;
511 u_short mode;
512
513 vap = &vattr;
514 error = VOP_GETATTR(vp, vap, td->td_proc->p_ucred, td);
515 if (error)
516 return (error);
517
518 /*
519 * Zero the spare stat fields
520 */
521 sb->st_lspare = 0;
522 sb->st_qspare[0] = 0;
523 sb->st_qspare[1] = 0;
524
525 /*
526 * Copy from vattr table
527 */
528 if (vap->va_fsid != VNOVAL)
529 sb->st_dev = vap->va_fsid;
530 else
531 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
532 sb->st_ino = vap->va_fileid;
533 mode = vap->va_mode;
534 switch (vap->va_type) {
535 case VREG:
536 mode |= S_IFREG;
537 break;
538 case VDIR:
539 mode |= S_IFDIR;
540 break;
541 case VBLK:
542 mode |= S_IFBLK;
543 break;
544 case VCHR:
545 mode |= S_IFCHR;
546 break;
547 case VLNK:
548 mode |= S_IFLNK;
549 /* This is a cosmetic change, symlinks do not have a mode. */
550 if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
 551 mode &= ~ACCESSPERMS; /* 0000 */
 552 else
 553 mode |= ACCESSPERMS; /* 0777 */
554 break;
555 case VSOCK:
556 mode |= S_IFSOCK;
557 break;
558 case VFIFO:
559 mode |= S_IFIFO;
560 break;
561 default:
562 return (EBADF);
563 };
564 sb->st_mode = mode;
565 sb->st_nlink = vap->va_nlink;
566 sb->st_uid = vap->va_uid;
567 sb->st_gid = vap->va_gid;
568 sb->st_rdev = vap->va_rdev;
569 if (vap->va_size > OFF_MAX)
570 return (EOVERFLOW);
571 sb->st_size = vap->va_size;
572 sb->st_atimespec = vap->va_atime;
573 sb->st_mtimespec = vap->va_mtime;
574 sb->st_ctimespec = vap->va_ctime;
575
576 /*
577 * According to www.opengroup.org, the meaning of st_blksize is
578 * "a filesystem-specific preferred I/O block size for this
579 * object. In some filesystem types, this may vary from file
580 * to file"
581 * Default to PAGE_SIZE after much discussion.
582 */
583
584 if (vap->va_type == VREG) {
585 sb->st_blksize = vap->va_blocksize;
586 } else if (vn_isdisk(vp, NULL)) {
587 sb->st_blksize = vp->v_rdev->si_bsize_best;
588 if (sb->st_blksize < vp->v_rdev->si_bsize_phys)
589 sb->st_blksize = vp->v_rdev->si_bsize_phys;
590 if (sb->st_blksize < BLKDEV_IOSIZE)
591 sb->st_blksize = BLKDEV_IOSIZE;
592 } else {
593 sb->st_blksize = PAGE_SIZE;
594 }
595
596 sb->st_flags = vap->va_flags;
597 if (suser_xxx(td->td_proc->p_ucred, 0, 0))
598 sb->st_gen = 0;
599 else
600 sb->st_gen = vap->va_gen;
601
602#if (S_BLKSIZE == 512)
603 /* Optimize this case */
604 sb->st_blocks = vap->va_bytes >> 9;
605#else
606 sb->st_blocks = vap->va_bytes / S_BLKSIZE;
607#endif
608 return (0);
609}
610
611/*
612 * File table vnode ioctl routine.
613 */
614static int
615vn_ioctl(fp, com, data, td)
616 struct file *fp;
617 u_long com;
618 caddr_t data;
619 struct thread *td;
620{
621 register struct vnode *vp = ((struct vnode *)fp->f_data);
vfs_vnops.c (91140), lines 622-659 (line 622 is new; lines 623-659 match 90946 lines 622-658):

 622 struct vnode *vpold;
 623 struct vattr vattr;
 624 int error;
 625
 626 switch (vp->v_type) {
 627
 628 case VREG:
 629 case VDIR:
 630 if (com == FIONREAD) {
 631 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 632 error = VOP_GETATTR(vp, &vattr, td->td_proc->p_ucred, td);
 633 VOP_UNLOCK(vp, 0, td);
 634 if (error)
 635 return (error);
 636 *(int *)data = vattr.va_size - fp->f_offset;
 637 return (0);
 638 }
 639 if (com == FIONBIO || com == FIOASYNC) /* XXX */
 640 return (0); /* XXX */
 641 /* fall into ... */
 642
 643 default:
 644#if 0
 645 return (ENOTTY);
 646#endif
 647 case VFIFO:
 648 case VCHR:
 649 case VBLK:
 650 if (com == FIODTYPE) {
 651 if (vp->v_type != VCHR && vp->v_type != VBLK)
 652 return (ENOTTY);
 653 *(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK;
 654 return (0);
 655 }
 656 error = VOP_IOCTL(vp, com, data, fp->f_flag, td->td_proc->p_ucred, td);
 657 if (error == 0 && com == TIOCSCTTY) {
 658
 659 /* Do nothing if reassigning same control tty */

vfs_vnops.c (90946), lines 659-667 (deleted):

 659 if (td->td_proc->p_session->s_ttyvp == vp)
 660 return (0);
 661
 662 /* Get rid of reference to old control tty */
 663 if (td->td_proc->p_session->s_ttyvp)
 664 vrele(td->td_proc->p_session->s_ttyvp);
 665
 666 td->td_proc->p_session->s_ttyvp = vp;
 667 VREF(vp);

vfs_vnops.c (91140), lines 660-676 (added):

 660 PGRPSESS_XLOCK();
 661 if (td->td_proc->p_session->s_ttyvp == vp) {
 662 PGRPSESS_XUNLOCK();
 663 return (0);
 664 }
 665
 666 vpold = td->td_proc->p_session->s_ttyvp;
 667 VREF(vp);
 668 SESS_LOCK(td->td_proc->p_session);
 669 td->td_proc->p_session->s_ttyvp = vp;
 670 SESS_UNLOCK(td->td_proc->p_session);
 671
 672 PGRPSESS_XUNLOCK();
 673
 674 /* Get rid of reference to old control tty */
 675 if (vpold)
 676 vrele(vpold);
677 }
678 return (error);
679 }
680}
681
682/*
683 * File table vnode poll routine.
684 */
685static int
686vn_poll(fp, events, cred, td)
687 struct file *fp;
688 int events;
689 struct ucred *cred;
690 struct thread *td;
691{
692
693 return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, td));
694}
695
696/*
697 * Check that the vnode is still valid, and if so
698 * acquire requested lock.
699 */
700int
701#ifndef DEBUG_LOCKS
702vn_lock(vp, flags, td)
703#else
704debug_vn_lock(vp, flags, td, filename, line)
705#endif
706 struct vnode *vp;
707 int flags;
708 struct thread *td;
709#ifdef DEBUG_LOCKS
710 const char *filename;
711 int line;
712#endif
713{
714 int error;
715
716 do {
717 if ((flags & LK_INTERLOCK) == 0)
718 mtx_lock(&vp->v_interlock);
719 if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curthread) {
720 vp->v_flag |= VXWANT;
721 msleep(vp, &vp->v_interlock, PINOD | PDROP,
722 "vn_lock", 0);
723 error = ENOENT;
724 } else {
725#if 0
726 /* this can now occur in normal operation */
727 if (vp->v_vxproc != NULL)
728 log(LOG_INFO, "VXLOCK interlock avoided in vn_lock\n");
729#endif
730#ifdef DEBUG_LOCKS
731 vp->filename = filename;
732 vp->line = line;
733#endif
734 error = VOP_LOCK(vp,
735 flags | LK_NOPAUSE | LK_INTERLOCK, td);
736 if (error == 0)
737 return (error);
738 }
739 flags &= ~LK_INTERLOCK;
740 } while (flags & LK_RETRY);
741 return (error);
742}
743
744/*
745 * File table vnode close routine.
746 */
747static int
748vn_closefile(fp, td)
749 struct file *fp;
750 struct thread *td;
751{
752
753 fp->f_ops = &badfileops;
754 return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
755 fp->f_cred, td));
756}
757
758/*
 759 * Prepare to start a filesystem write operation. If the operation is
760 * permitted, then we bump the count of operations in progress and
761 * proceed. If a suspend request is in progress, we wait until the
762 * suspension is over, and then proceed.
763 */
764int
765vn_start_write(vp, mpp, flags)
766 struct vnode *vp;
767 struct mount **mpp;
768 int flags;
769{
770 struct mount *mp;
771 int error;
772
773 /*
 774 * If a vnode is provided, get and return the mount point to
 775 * which it will write.
776 */
777 if (vp != NULL) {
778 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
779 *mpp = NULL;
780 if (error != EOPNOTSUPP)
781 return (error);
782 return (0);
783 }
784 }
785 if ((mp = *mpp) == NULL)
786 return (0);
787 /*
788 * Check on status of suspension.
789 */
790 while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
791 if (flags & V_NOWAIT)
792 return (EWOULDBLOCK);
793 error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
794 "suspfs", 0);
795 if (error)
796 return (error);
797 }
798 if (flags & V_XSLEEP)
799 return (0);
800 mp->mnt_writeopcount++;
801 return (0);
802}
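/*
 * Illustrative sketch (hypothetical) of the bracket this function
 * forms with vn_finished_write(), in the style used by vn_rdwr()
 * above and the extended attribute helpers below:
 */
#if 0
	struct mount *mp;

	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td);
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
#endif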
803
804/*
805 * Secondary suspension. Used by operations such as vop_inactive
806 * routines that are needed by the higher level functions. These
807 * are allowed to proceed until all the higher level functions have
808 * completed (indicated by mnt_writeopcount dropping to zero). At that
809 * time, these operations are halted until the suspension is over.
810 */
811int
812vn_write_suspend_wait(vp, mp, flags)
813 struct vnode *vp;
814 struct mount *mp;
815 int flags;
816{
817 int error;
818
819 if (vp != NULL) {
820 if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
821 if (error != EOPNOTSUPP)
822 return (error);
823 return (0);
824 }
825 }
826 /*
827 * If we are not suspended or have not yet reached suspended
828 * mode, then let the operation proceed.
829 */
830 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
831 return (0);
832 if (flags & V_NOWAIT)
833 return (EWOULDBLOCK);
834 /*
835 * Wait for the suspension to finish.
836 */
837 return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
838 "suspfs", 0));
839}
840
841/*
842 * Filesystem write operation has completed. If we are suspending and this
843 * operation is the last one, notify the suspender that the suspension is
844 * now in effect.
845 */
846void
847vn_finished_write(mp)
848 struct mount *mp;
849{
850
851 if (mp == NULL)
852 return;
853 mp->mnt_writeopcount--;
854 if (mp->mnt_writeopcount < 0)
855 panic("vn_finished_write: neg cnt");
856 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
857 mp->mnt_writeopcount <= 0)
858 wakeup(&mp->mnt_writeopcount);
859}
860
861/*
862 * Request a filesystem to suspend write operations.
863 */
864void
865vfs_write_suspend(mp)
866 struct mount *mp;
867{
868 struct thread *td = curthread;
869
870 if (mp->mnt_kern_flag & MNTK_SUSPEND)
871 return;
872 mp->mnt_kern_flag |= MNTK_SUSPEND;
873 if (mp->mnt_writeopcount > 0)
874 (void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0);
875 VFS_SYNC(mp, MNT_WAIT, td->td_proc->p_ucred, td);
876 mp->mnt_kern_flag |= MNTK_SUSPENDED;
877}
878
879/*
880 * Request a filesystem to resume write operations.
881 */
882void
883vfs_write_resume(mp)
884 struct mount *mp;
885{
886
887 if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0)
888 return;
889 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
890 wakeup(&mp->mnt_writeopcount);
891 wakeup(&mp->mnt_flag);
892}
893
894static int
895vn_kqfilter(struct file *fp, struct knote *kn)
896{
897
898 return (VOP_KQFILTER(((struct vnode *)fp->f_data), kn));
899}
900
901/*
902 * Simplified in-kernel wrapper calls for extended attribute access.
903 * Both calls pass in a NULL credential, authorizing as "kernel" access.
904 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
905 */
906int
907vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
908 const char *attrname, int *buflen, char *buf, struct thread *td)
909{
910 struct uio auio;
911 struct iovec iov;
912 int error;
913
914 iov.iov_len = *buflen;
915 iov.iov_base = buf;
916
917 auio.uio_iov = &iov;
918 auio.uio_iovcnt = 1;
919 auio.uio_rw = UIO_READ;
920 auio.uio_segflg = UIO_SYSSPACE;
921 auio.uio_td = td;
922 auio.uio_offset = 0;
923 auio.uio_resid = *buflen;
924
925 if ((ioflg & IO_NODELOCKED) == 0)
926 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
927
928 /* authorize attribute retrieval as kernel */
929 error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
930 td);
931
932 if ((ioflg & IO_NODELOCKED) == 0)
933 VOP_UNLOCK(vp, 0, td);
934
935 if (error == 0) {
936 *buflen = *buflen - auio.uio_resid;
937 }
938
939 return (error);
940}
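/*
 * Illustrative sketch (hypothetical; "myattr" and "abuf" are
 * placeholder names): fetching a small system-namespace attribute
 * with the vnode already locked.  buflen is value/result: on return
 * it holds the number of bytes actually read.
 */
#if 0
	char abuf[64];
	int ablen, error;

	ablen = sizeof(abuf);
	error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
	    "myattr", &ablen, abuf, td);
#endif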
941
942/*
943 * XXX failure mode if partially written?
944 */
945int
946vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
947 const char *attrname, int buflen, char *buf, struct thread *td)
948{
949 struct uio auio;
950 struct iovec iov;
951 struct mount *mp;
952 int error;
953
954 iov.iov_len = buflen;
955 iov.iov_base = buf;
956
957 auio.uio_iov = &iov;
958 auio.uio_iovcnt = 1;
959 auio.uio_rw = UIO_WRITE;
960 auio.uio_segflg = UIO_SYSSPACE;
961 auio.uio_td = td;
962 auio.uio_offset = 0;
963 auio.uio_resid = buflen;
964
965 if ((ioflg & IO_NODELOCKED) == 0) {
966 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
967 return (error);
968 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
969 }
970
971 /* authorize attribute setting as kernel */
972 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
973
974 if ((ioflg & IO_NODELOCKED) == 0) {
975 vn_finished_write(mp);
976 VOP_UNLOCK(vp, 0, td);
977 }
978
979 return (error);
980}
981
982int
983vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
984 const char *attrname, struct thread *td)
985{
986 struct mount *mp;
987 int error;
988
989 if ((ioflg & IO_NODELOCKED) == 0) {
990 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
991 return (error);
992 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
993 }
994
995 /* authorize attribute removal as kernel */
996 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td);
997
998 if ((ioflg & IO_NODELOCKED) == 0) {
999 vn_finished_write(mp);
1000 VOP_UNLOCK(vp, 0, td);
1001 }
1002
1003 return (error);
1004}