/*	$NetBSD: vfs_vnops.c,v 1.242 2023/07/10 02:31:55 christos Exp $	*/

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.14 (Berkeley) 6/15/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.242 2023/07/10 02:31:55 christos Exp $");

#include "veriexec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode_impl.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/syslog.h>
#include <sys/fstrans.h>
#include <sys/atomic.h>
#include <sys/filedesc.h>
#include <sys/wapbl.h>
#include <sys/mman.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_device.h>

#ifdef UNION
#include <fs/union/union.h>
#endif

#ifndef COMPAT_ZERODEV
#define COMPAT_ZERODEV(dev)	(0)
#endif

int (*vn_union_readdir_hook)(struct vnode **, struct file *, struct lwp *);

#include <sys/verified_exec.h>

static int vn_read(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags);
static int vn_write(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags);
static int vn_closefile(file_t *fp);
static int vn_poll(file_t *fp, int events);
static int vn_fcntl(file_t *fp, u_int com, void *data);
static int vn_statfile(file_t *fp, struct stat *sb);
static int vn_ioctl(file_t *fp, u_long com, void *data);
static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *,
	    struct uvm_object **, int *);
static int vn_seek(struct file *, off_t, int, off_t *, int);
static int vn_advlock(struct file *, void *, int, struct flock *, int);
static int vn_fpathconf(struct file *, int, register_t *);
static int vn_posix_fadvise(struct file *, off_t, off_t, int);
static int vn_truncate(file_t *, off_t);

const struct fileops vnops = {
	.fo_name = "vn",
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_fcntl = vn_fcntl,
	.fo_poll = vn_poll,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_kqfilter = vn_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = vn_mmap,
	.fo_seek = vn_seek,
	.fo_advlock = vn_advlock,
	.fo_fpathconf = vn_fpathconf,
	.fo_posix_fadvise = vn_posix_fadvise,
	.fo_truncate = vn_truncate,
};

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * at_dvp is the directory for openat(), if any.
 * pb is the path.
 * nmode is additional namei flags, restricted to TRYEMULROOT and NOCHROOT.
 * fmode is the open flags, converted from O_* to F*.
 * cmode is the creation file permissions.
 *
 * XXX shouldn't cmode be mode_t?
 *
 * On success produces either a locked vnode in *ret_vp, or NULL in
 * *ret_vp and a file descriptor number in *ret_fd.
 *
 * The caller may pass NULL for ret_fd (and ret_domove), in which case
 * EOPNOTSUPP will be produced in the cases that would otherwise return
 * a file descriptor.
 *
 * Note that callers that want no-follow behavior should pass
 * O_NOFOLLOW in fmode. Neither FOLLOW nor NOFOLLOW in nmode is
 * honored.
 */
int
vn_open(struct vnode *at_dvp, struct pathbuf *pb,
    int nmode, int fmode, int cmode,
    struct vnode **ret_vp, bool *ret_domove, int *ret_fd)
{
	struct nameidata nd;
	struct vnode *vp = NULL;
	struct lwp *l = curlwp;
	kauth_cred_t cred = l->l_cred;
	struct vattr va;
	int error;
	const char *pathstring;

	KASSERT((nmode & (TRYEMULROOT | NOCHROOT)) == nmode);

	KASSERT(ret_vp != NULL);
	KASSERT((ret_domove == NULL) == (ret_fd == NULL));

	if ((fmode & (O_CREAT | O_DIRECTORY)) == (O_CREAT | O_DIRECTORY))
		return EINVAL;

	NDINIT(&nd, LOOKUP, nmode, pb);
	if (at_dvp != NULL)
		NDAT(&nd, at_dvp);

	nd.ni_cnd.cn_flags &= TRYEMULROOT | NOCHROOT;

	if (fmode & O_CREAT) {
		nd.ni_cnd.cn_nameiop = CREATE;
		nd.ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF;
		if ((fmode & O_EXCL) == 0 &&
		    ((fmode & O_NOFOLLOW) == 0))
			nd.ni_cnd.cn_flags |= FOLLOW;
		if ((fmode & O_EXCL) == 0)
			nd.ni_cnd.cn_flags |= NONEXCLHACK;
	} else {
		nd.ni_cnd.cn_nameiop = LOOKUP;
		nd.ni_cnd.cn_flags |= LOCKLEAF;
		if ((fmode & O_NOFOLLOW) == 0)
			nd.ni_cnd.cn_flags |= FOLLOW;
	}

	pathstring = pathbuf_stringcopy_get(nd.ni_pathbuf);
	if (pathstring == NULL) {
		return ENOMEM;
	}

	/*
	 * When this "interface" was exposed to do_open() it used
	 * to initialize l_dupfd to -newfd-1 (thus passing in the
	 * new file handle number to use)... but nothing in the
	 * kernel uses that value. So just send 0.
	 */
	l->l_dupfd = 0;

	error = namei(&nd);
	if (error)
		goto out;

	vp = nd.ni_vp;

#if NVERIEXEC > 0
	error = veriexec_openchk(l, nd.ni_vp, pathstring, fmode);
	if (error) {
		/* We have to release the locks ourselves */
		/*
		 * 20210604 dholland passing NONEXCLHACK means we can
		 * get ni_dvp == NULL back if ni_vp exists, and we should
		 * treat that like the non-O_CREAT case.
		 */
		if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
			if (vp == NULL) {
				vput(nd.ni_dvp);
			} else {
				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
				if (nd.ni_dvp == nd.ni_vp)
					vrele(nd.ni_dvp);
				else
					vput(nd.ni_dvp);
				nd.ni_dvp = NULL;
				vput(vp);
				vp = NULL;
			}
		} else {
			vput(vp);
			vp = NULL;
		}
		goto out;
	}
#endif /* NVERIEXEC > 0 */

	/*
	 * 20210604 dholland ditto
	 */
	if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
		if (nd.ni_vp == NULL) {
			vattr_null(&va);
			va.va_type = VREG;
			va.va_mode = cmode;
			if (fmode & O_EXCL)
				va.va_vaflags |= VA_EXCLUSIVE;
			error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
			    &nd.ni_cnd, &va);
			if (error) {
				vput(nd.ni_dvp);
				goto out;
			}
			fmode &= ~O_TRUNC;
			vp = nd.ni_vp;
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			vput(nd.ni_dvp);
		} else {
			VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
			if (nd.ni_dvp == nd.ni_vp)
				vrele(nd.ni_dvp);
			else
				vput(nd.ni_dvp);
			nd.ni_dvp = NULL;
			vp = nd.ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else if ((fmode & O_CREAT) != 0) {
		/*
		 * 20210606 dholland passing NONEXCLHACK means this
		 * case exists; it is the same as the following one
		 * but also needs to do things in the second (exists)
		 * half of the following block. (Besides handle
		 * ni_dvp, anyway.)
		 */
		vp = nd.ni_vp;
		KASSERT((fmode & O_EXCL) == 0);
		fmode &= ~O_CREAT;
	} else {
		vp = nd.ni_vp;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if (nd.ni_vp->v_type == VLNK) {
		error = EFTYPE;
		goto bad;
	}

	if ((fmode & O_CREAT) == 0) {
		error = vn_openchk(vp, cred, fmode);
		if (error != 0)
			goto bad;
	}

	if (fmode & O_TRUNC) {
		vattr_null(&va);
		va.va_size = 0;
		error = VOP_SETATTR(vp, &va, cred);
		if (error != 0)
			goto bad;
	}
	if ((error = VOP_OPEN(vp, fmode, cred)) != 0)
		goto bad;
	if (fmode & FWRITE) {
		mutex_enter(vp->v_interlock);
		vp->v_writecount++;
		mutex_exit(vp->v_interlock);
	}

bad:
	if (error) {
		vput(vp);
		vp = NULL;
	}
out:
	pathbuf_stringcopy_put(nd.ni_pathbuf, pathstring);

	switch (error) {
	case EDUPFD:
	case EMOVEFD:
		/* if the caller isn't prepared to handle fds, fail for them */
		if (ret_fd == NULL) {
			error = EOPNOTSUPP;
			break;
		}
		*ret_vp = NULL;
		*ret_domove = error == EMOVEFD;
		*ret_fd = l->l_dupfd;
		error = 0;
		break;
	case 0:
		KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
		*ret_vp = vp;
		break;
	}
	l->l_dupfd = 0;
	return error;
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(struct vnode *vp)
{

	/*
	 * If the vnode is in use as a process's text,
	 * we can't allow writing.
	 */
	if (vp->v_iflag & VI_TEXT)
		return ETXTBSY;
	return 0;
}
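
/*
 * Example (illustrative sketch, compiled out): how kernel code might use
 * vn_open() above to open a file by path and later release it with
 * vn_close().  The helper name example_open_by_path is hypothetical;
 * vn_open() returns the vnode locked, so callers that keep it around
 * typically drop the lock first, as vn_bdev_openpath() below also does.
 */
#if 0
static int
example_open_by_path(const char *path, struct vnode **vpp)
{
	struct pathbuf *pb;
	struct vnode *vp;
	int error;

	pb = pathbuf_create(path);
	if (pb == NULL)
		return ENOMEM;
	error = vn_open(NULL, pb, 0, FREAD, 0, &vp, NULL, NULL);
	pathbuf_destroy(pb);
	if (error != 0)
		return error;
	VOP_UNLOCK(vp);		/* keep the reference, drop the lock */
	*vpp = vp;		/* caller eventually: vn_close(vp, FREAD, cred) */
	return 0;
}
#endif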

int
vn_openchk(struct vnode *vp, kauth_cred_t cred, int fflags)
{
	int permbits = 0;
	int error;

	if (vp->v_type == VNON || vp->v_type == VBAD)
		return ENXIO;

	if ((fflags & O_DIRECTORY) != 0 && vp->v_type != VDIR)
		return ENOTDIR;

	if ((fflags & O_REGULAR) != 0 && vp->v_type != VREG)
		return EFTYPE;

	if ((fflags & FREAD) != 0) {
		permbits = VREAD;
	}
	if ((fflags & FEXEC) != 0) {
		permbits |= VEXEC;
	}
	if ((fflags & (FWRITE | O_TRUNC)) != 0) {
		permbits |= VWRITE;
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		error = vn_writechk(vp);
		if (error != 0)
			goto bad;
	}
	error = VOP_ACCESS(vp, permbits, cred);
bad:
	return error;
}

/*
 * Mark a vnode as having executable mappings.
 */
void
vn_markexec(struct vnode *vp)
{

	if ((vp->v_iflag & VI_EXECMAP) != 0) {
		/* Safe unlocked, as long as caller holds a reference. */
		return;
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
		vp->v_iflag |= VI_EXECMAP;
	}
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
}

/*
 * Mark a vnode as being the text of a process.
 * Fail if the vnode is currently writable.
 */
int
vn_marktext(struct vnode *vp)
{

	if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP)) == (VI_TEXT|VI_EXECMAP)) {
		/* Safe unlocked, as long as caller holds a reference. */
		return 0;
	}

	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	mutex_enter(vp->v_interlock);
	if (vp->v_writecount != 0) {
		KASSERT((vp->v_iflag & VI_TEXT) == 0);
		mutex_exit(vp->v_interlock);
		rw_exit(vp->v_uobj.vmobjlock);
		return ETXTBSY;
	}
	if ((vp->v_iflag & VI_EXECMAP) == 0) {
		cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
	}
	vp->v_iflag |= (VI_TEXT | VI_EXECMAP);
	mutex_exit(vp->v_interlock);
	rw_exit(vp->v_uobj.vmobjlock);
	return 0;
}

/*
 * Vnode close call
 *
 * Note: takes an unlocked vnode, while VOP_CLOSE takes a locked node.
 */
int
vn_close(struct vnode *vp, int flags, kauth_cred_t cred)
{
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (flags & FWRITE) {
		mutex_enter(vp->v_interlock);
		KASSERT(vp->v_writecount > 0);
		vp->v_writecount--;
		mutex_exit(vp->v_interlock);
	}
	error = VOP_CLOSE(vp, flags, cred);
	vput(vp);
	return error;
}

static int
enforce_rlimit_fsize(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct lwp *l = curlwp;
	off_t testoff;

	if (uio->uio_rw != UIO_WRITE || vp->v_type != VREG)
		return 0;

	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	if (ioflag & IO_APPEND)
		testoff = vp->v_size;
	else
		testoff = uio->uio_offset;

	if (testoff + uio->uio_resid >
	    l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		mutex_enter(&proc_lock);
		psignal(l->l_proc, SIGXFSZ);
		mutex_exit(&proc_lock);
		return EFBIG;
	}

	return 0;
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, kauth_cred_t cred, size_t *aresid,
    struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_READ) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
		} else /* UIO_WRITE */ {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		}
	}
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_rw = rw;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}

	if ((error = enforce_rlimit_fsize(vp, &auio, ioflg)) != 0)
		goto out;

	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}

	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;

 out:
	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}
	return error;
}

int
vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done,
    struct lwp *l, off_t **cookies, int *ncookies)
{
	struct vnode *vp = fp->f_vnode;
	struct iovec aiov;
	struct uio auio;
	int error, eofflag;

	/* Limit the size on any kernel buffers used by VOP_READDIR */
	count = uimin(MAXBSIZE, count);

unionread:
	if (vp->v_type != VDIR)
		return EINVAL;
	aiov.iov_base = bf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	if (segflg == UIO_SYSSPACE) {
		UIO_SETUP_SYSSPACE(&auio);
	} else {
		KASSERT(l == curlwp);
		auio.uio_vmspace = l->l_proc->p_vmspace;
	}
	auio.uio_resid = count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	mutex_enter(&fp->f_lock);
	auio.uio_offset = fp->f_offset;
	mutex_exit(&fp->f_lock);
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, cookies,
	    ncookies);
	mutex_enter(&fp->f_lock);
	fp->f_offset = auio.uio_offset;
	mutex_exit(&fp->f_lock);
	VOP_UNLOCK(vp);
	if (error)
		return error;

	if (count == auio.uio_resid && vn_union_readdir_hook) {
		struct vnode *ovp = vp;

		error = (*vn_union_readdir_hook)(&vp, fp, l);
		if (error)
			return error;
		if (vp != ovp)
			goto unionread;
	}

	if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		vref(vp);
		mutex_enter(&fp->f_lock);
		fp->f_vnode = vp;
		fp->f_offset = 0;
		mutex_exit(&fp->f_lock);
		vrele(tvp);
		goto unionread;
	}
	*done = count - auio.uio_resid;
	return error;
}
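
/*
 * Example (illustrative sketch, compiled out): reading the start of an
 * already-opened vnode into a kernel buffer with vn_rdwr() above.  The
 * helper name, buffer size, and use of a stack buffer are hypothetical;
 * larger transfers would normally use an allocated buffer instead.
 */
#if 0
static int
example_read_header(struct vnode *vp, kauth_cred_t cred)
{
	char buf[128];
	size_t resid;
	int error;

	/* vp is referenced and unlocked; vn_rdwr() takes the lock itself. */
	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0, UIO_SYSSPACE,
	    0, cred, &resid, curlwp);
	if (error == 0 && resid != 0) {
		/* short read: the file is smaller than sizeof(buf) */
	}
	return error;
}
#endif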

/*
 * File table vnode read routine.
 */
static int
vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice);
	fflag = fp->f_flag;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if ((fflag & (FFSYNC | FRSYNC)) == (FFSYNC | FRSYNC))
		ioflag |= IO_SYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	if (offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) != 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);
	if (__predict_false(vp->v_type == VDIR) &&
	    offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_enter(&fp->f_lock);
	uio->uio_offset = *offset;
	if (__predict_false(vp->v_type == VDIR) &&
	    offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_exit(&fp->f_lock);
	count = uio->uio_resid;
	error = VOP_READ(vp, uio, ioflag, cred);
	if (flags & FOF_UPDATE_OFFSET)
		*offset += count - uio->uio_resid;
	VOP_UNLOCK(vp);
	return error;
}

/*
 * File table vnode write routine.
 */
static int
vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct vnode *vp = fp->f_vnode;
	int error, ioflag, fflag;
	size_t count;

	ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT;
	fflag = fp->f_flag;
	if (vp->v_type == VREG && (fflag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fflag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fflag & FFSYNC ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	else if (fflag & FDSYNC)
		ioflag |= IO_DSYNC;
	if (fflag & FALTIO)
		ioflag |= IO_ALTSEMANTICS;
	if (fflag & FDIRECT)
		ioflag |= IO_DIRECT;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	uio->uio_offset = *offset;
	count = uio->uio_resid;

	if ((error = enforce_rlimit_fsize(vp, uio, ioflag)) != 0)
		goto out;

	error = VOP_WRITE(vp, uio, ioflag, cred);

	if (flags & FOF_UPDATE_OFFSET) {
		if (ioflag & IO_APPEND) {
			/*
			 * SUSv3 describes behaviour for count = 0 as follows:
			 * "Before any action ... is taken, and if nbyte is zero
			 * and the file is a regular file, the write() function
			 * ... in the absence of errors ... shall return zero
			 * and have no other results."
			 */
			if (count)
				*offset = uio->uio_offset;
		} else
			*offset += count - uio->uio_resid;
	}

 out:
	VOP_UNLOCK(vp);
	return error;
}
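
/*
 * Example (illustrative sketch, compiled out): how a read(2)-style caller
 * dispatches to vn_read() above through the fileops table installed in
 * vnops.  The helper name, buffer, and length are hypothetical; the file_t
 * is assumed to have been looked up and held elsewhere.
 */
#if 0
static int
example_fileops_read(file_t *fp, void *buf, size_t len)
{
	struct iovec aiov;
	struct uio auio;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_rw = UIO_READ;
	UIO_SETUP_SYSSPACE(&auio);

	/* For a vnode-backed file, fo_read is vn_read(). */
	return (*fp->f_ops->fo_read)(fp, &fp->f_offset, &auio, fp->f_cred,
	    FOF_UPDATE_OFFSET);
}
#endif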

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(file_t *fp, struct stat *sb)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = vn_stat(vp, sb);
	VOP_UNLOCK(vp);
	return error;
}

int
vn_stat(struct vnode *vp, struct stat *sb)
{
	struct vattr va;
	int error;
	mode_t mode;

	memset(&va, 0, sizeof(va));
	error = VOP_GETATTR(vp, &va, kauth_cred_get());
	if (error)
		return error;
	/*
	 * Copy from vattr table
	 */
	memset(sb, 0, sizeof(*sb));
	sb->st_dev = va.va_fsid;
	sb->st_ino = va.va_fileid;
	mode = va.va_mode;
	switch (vp->v_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return EBADF;
	}
	sb->st_mode = mode;
	sb->st_nlink = va.va_nlink;
	sb->st_uid = va.va_uid;
	sb->st_gid = va.va_gid;
	sb->st_rdev = va.va_rdev;
	sb->st_size = va.va_size;
	sb->st_atimespec = va.va_atime;
	sb->st_mtimespec = va.va_mtime;
	sb->st_ctimespec = va.va_ctime;
	sb->st_birthtimespec = va.va_birthtime;
	sb->st_blksize = va.va_blocksize;
	sb->st_flags = va.va_flags;
	sb->st_gen = 0;
	sb->st_blocks = va.va_bytes / S_BLKSIZE;
	return 0;
}

/*
 * File table vnode fcntl routine.
 */
static int
vn_fcntl(file_t *fp, u_int com, void *data)
{
	struct vnode *vp = fp->f_vnode;
	int error;

	error = VOP_FCNTL(vp, com, data, fp->f_flag, kauth_cred_get());
	return error;
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(file_t *fp, u_long com, void *data)
{
	struct vnode *vp = fp->f_vnode, *ovp;
	struct vattr vattr;
	int error;

	switch (vp->v_type) {

	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, kauth_cred_get());
			if (error == 0) {
				if (vp->v_type == VDIR)
					mutex_enter(&fp->f_lock);
				*(int *)data = vattr.va_size - fp->f_offset;
				if (vp->v_type == VDIR)
					mutex_exit(&fp->f_lock);
			}
			VOP_UNLOCK(vp);
			if (error)
				return error;
			return 0;
		}
		if ((com == FIONWRITE) || (com == FIONSPACE)) {
			/*
			 * Files don't have send queues, so there never
			 * are any bytes in them, nor is there any
			 * open space in them.
			 */
			*(int *)data = 0;
			return 0;
		}
		if (com == FIOGETBMAP) {
			daddr_t *block;

			if (*(daddr_t *)data < 0)
				return EINVAL;
			block = (daddr_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, *block, NULL, block, NULL);
			VOP_UNLOCK(vp);
			return error;
		}
		if (com == OFIOGETBMAP) {
			daddr_t ibn, obn;

			if (*(int32_t *)data < 0)
				return EINVAL;
			ibn = (daddr_t)*(int32_t *)data;
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_BMAP(vp, ibn, NULL, &obn, NULL);
			VOP_UNLOCK(vp);
			*(int32_t *)data = (int32_t)obn;
			return error;
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return 0;			/* XXX */
		/* FALLTHROUGH */
	case VFIFO:
	case VCHR:
	case VBLK:
		error = VOP_IOCTL(vp, com, data, fp->f_flag,
		    kauth_cred_get());
		if (error == 0 && com == TIOCSCTTY) {
			vref(vp);
			mutex_enter(&proc_lock);
			ovp = curproc->p_session->s_ttyvp;
			curproc->p_session->s_ttyvp = vp;
			mutex_exit(&proc_lock);
			if (ovp != NULL)
				vrele(ovp);
		}
		return error;

	default:
		return EPASSTHROUGH;
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(file_t *fp, int events)
{

	return VOP_POLL(fp->f_vnode, events);
}

/*
 * File table vnode kqfilter routine.
 */
int
vn_kqfilter(file_t *fp, struct knote *kn)
{

	return VOP_KQFILTER(fp->f_vnode, kn);
}

static int
vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct uvm_object *uobj;
	struct vnode *vp;
	struct vattr va;
	struct lwp *l;
	vm_prot_t maxprot;
	off_t off;
	int error, flags;
	bool needwritemap;

	l = curlwp;

	off = *offp;
	flags = *flagsp;
	maxprot = VM_PROT_EXECUTE;

	KASSERT(size > 0);

	vp = fp->f_vnode;
	if (vp->v_type != VREG && vp->v_type != VCHR &&
	    vp->v_type != VBLK) {
		/* only REG/CHR/BLK support mmap */
		return ENODEV;
	}
	if (vp->v_type != VCHR && off < 0) {
		return EINVAL;
	}
#if SIZE_MAX > UINT32_MAX	/* XXX -Wtype-limits */
	if (vp->v_type != VCHR && size > __type_max(off_t)) {
		return EOVERFLOW;
	}
#endif
	if (vp->v_type != VCHR && off > __type_max(off_t) - size) {
		/* no offset wrapping */
		return EOVERFLOW;
	}

	/* special case: catch SunOS style /dev/zero */
	if (vp->v_type == VCHR &&
	    (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
		*uobjp = NULL;
		*maxprotp = VM_PROT_ALL;
		return 0;
	}

	/*
	 * Old programs may not select a specific sharing type, so
	 * default to an appropriate one.
	 *
	 * XXX: how does MAP_ANON fit in the picture?
	 */
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
		struct proc *p = l->l_proc;
		printf("WARNING: defaulted mmap() share type to "
		    "%s (pid %d command %s)\n", vp->v_type == VCHR ?
		    "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
		    p->p_comm);
#endif
		if (vp->v_type == VCHR)
			flags |= MAP_SHARED;	/* for a device */
		else
			flags |= MAP_PRIVATE;	/* for a file */
	}

	/*
	 * MAP_PRIVATE device mappings don't make sense (and aren't
	 * supported anyway).  However, some programs rely on this,
	 * so just change it to MAP_SHARED.
	 */
	if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
		flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
	}

	/*
	 * now check protection
	 */

	/* check read access */
	if (fp->f_flag & FREAD)
		maxprot |= VM_PROT_READ;
	else if (prot & PROT_READ) {
		return EACCES;
	}

	/* check write access, shared case first */
	if (flags & MAP_SHARED) {
		/*
		 * if the file is writable, only add PROT_WRITE to
		 * maxprot if the file is not immutable, append-only.
		 * otherwise, if we have asked for PROT_WRITE, return
		 * EPERM.
		 */
		if (fp->f_flag & FWRITE) {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &va, l->l_cred);
			VOP_UNLOCK(vp);
			if (error) {
				return error;
			}
			if ((va.va_flags &
			    (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
				maxprot |= VM_PROT_WRITE;
			else if (prot & PROT_WRITE) {
				return EPERM;
			}
		} else if (prot & PROT_WRITE) {
			return EACCES;
		}
	} else {
		/* MAP_PRIVATE mappings can always write to */
		maxprot |= VM_PROT_WRITE;
	}

	/*
	 * Don't allow mmap for EXEC if the file system
	 * is mounted NOEXEC.
	 */
	if ((prot & PROT_EXEC) != 0 &&
	    (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
		return EACCES;
	}

	if (vp->v_type != VCHR) {
		error = VOP_MMAP(vp, prot, curlwp->l_cred);
		if (error) {
			return error;
		}
		vref(vp);
		uobj = &vp->v_uobj;

		/*
		 * If the vnode is being mapped with PROT_EXEC,
		 * then mark it as text.
		 */
		if (prot & PROT_EXEC) {
			vn_markexec(vp);
		}
	} else {
		int i = maxprot;

		/*
		 * XXX Some devices don't like to be mapped with
		 * XXX PROT_EXEC or PROT_WRITE, but we don't really
		 * XXX have a better way of handling this, right now
		 */
		do {
			uobj = udv_attach(vp->v_rdev,
			    (flags & MAP_SHARED) ? i :
			    (i & ~VM_PROT_WRITE), off, size);
			i--;
		} while ((uobj == NULL) && (i > 0));
		if (uobj == NULL) {
			return EINVAL;
		}
		*advicep = UVM_ADV_RANDOM;
	}

	/*
	 * Set vnode flags to indicate the new kinds of mapping.
	 * We take the vnode lock in exclusive mode here to serialize
	 * with direct I/O.
	 *
	 * Safe to check for these flag values without a lock, as
	 * long as a reference to the vnode is held.
	 */
	needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
	    (flags & MAP_SHARED) != 0 &&
	    (maxprot & VM_PROT_WRITE) != 0;
	if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vp->v_vflag |= VV_MAPPED;
		if (needwritemap) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			mutex_enter(vp->v_interlock);
			vp->v_iflag |= VI_WRMAP;
			mutex_exit(vp->v_interlock);
			rw_exit(vp->v_uobj.vmobjlock);
		}
		VOP_UNLOCK(vp);
	}

#if NVERIEXEC > 0

	/*
	 * Check if the file can be executed indirectly.
	 *
	 * XXX: This gives false warnings about "Incorrect access type"
	 * XXX: if the mapping is not executable. Harmless, but will be
	 * XXX: fixed as part of other changes.
	 */
	if (veriexec_verify(l, vp, "(mmap)", VERIEXEC_INDIRECT,
	    NULL)) {

		/*
		 * Don't allow executable mappings if we can't
		 * indirectly execute the file.
		 */
		if (prot & VM_PROT_EXECUTE) {
			return EPERM;
		}

		/*
		 * Strip the executable bit from 'maxprot' to make sure
		 * it can't be made executable later.
		 */
		maxprot &= ~VM_PROT_EXECUTE;
	}
#endif /* NVERIEXEC > 0 */

	*uobjp = uobj;
	*maxprotp = maxprot;
	*flagsp = flags;

	return 0;
}

static int
vn_seek(struct file *fp, off_t delta, int whence, off_t *newoffp,
    int flags)
{
	const off_t OFF_MIN = __type_min(off_t);
	const off_t OFF_MAX = __type_max(off_t);
	kauth_cred_t cred = fp->f_cred;
	off_t oldoff, newoff;
	struct vnode *vp = fp->f_vnode;
	struct vattr vattr;
	int error;

	if (vp->v_type == VFIFO)
		return ESPIPE;

	if (flags & FOF_UPDATE_OFFSET)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(vp, LK_SHARED | LK_RETRY);

	/* Compute the old and new offsets. */
	if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_enter(&fp->f_lock);
	oldoff = fp->f_offset;
	if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0)
		mutex_exit(&fp->f_lock);
	switch (whence) {
	case SEEK_CUR:
		if (delta > 0) {
			if (oldoff > 0 && delta > OFF_MAX - oldoff) {
				newoff = OFF_MAX;
				break;
			}
		} else {
			if (oldoff < 0 && delta < OFF_MIN - oldoff) {
				newoff = OFF_MIN;
				break;
			}
		}
		newoff = oldoff + delta;
		break;
	case SEEK_END:
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			goto out;
		if (vattr.va_size > OFF_MAX ||
		    delta > OFF_MAX - (off_t)vattr.va_size) {
			newoff = OFF_MAX;
			break;
		}
		newoff = delta + vattr.va_size;
		break;
	case SEEK_SET:
		newoff = delta;
		break;
	default:
		error = EINVAL;
		goto out;
	}

	/* Pass the proposed change to the file system to audit. */
	error = VOP_SEEK(vp, oldoff, newoff, cred);
	if (error)
		goto out;

	/* Success! */
	if (newoffp)
		*newoffp = newoff;
	if (flags & FOF_UPDATE_OFFSET)
		fp->f_offset = newoff;
	error = 0;

out:	VOP_UNLOCK(vp);
	return error;
}
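
/*
 * Worked example (compiled out, hypothetical values) of the SEEK_CUR
 * overflow handling in vn_seek() above: instead of letting the signed
 * addition wrap, an offset that would pass OFF_MAX (or OFF_MIN) is pinned
 * to the limit before the file system audits it via VOP_SEEK().
 */
#if 0
static void
example_seek_clamp(void)
{
	const off_t OFF_MAX = __type_max(off_t);
	off_t oldoff = OFF_MAX - 10;	/* near the top of the range */
	off_t delta = 100;
	off_t newoff;

	/* delta (100) > OFF_MAX - oldoff (10), so clamp instead of wrapping. */
	if (oldoff > 0 && delta > OFF_MAX - oldoff)
		newoff = OFF_MAX;
	else
		newoff = oldoff + delta;
	(void)newoff;
}
#endif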

static int
vn_advlock(struct file *fp, void *id, int op, struct flock *fl,
    int flags)
{
	struct vnode *const vp = fp->f_vnode;

	if (fl->l_whence == SEEK_CUR) {
		vn_lock(vp, LK_SHARED | LK_RETRY);
		fl->l_start += fp->f_offset;
		VOP_UNLOCK(vp);
	}

	return VOP_ADVLOCK(vp, id, op, fl, flags);
}

static int
vn_fpathconf(struct file *fp, int name, register_t *retval)
{
	struct vnode *const vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = VOP_PATHCONF(vp, name, retval);
	VOP_UNLOCK(vp);

	return error;
}

static int
vn_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice)
{
	const off_t OFF_MAX = __type_max(off_t);
	struct vnode *vp = fp->f_vnode;
	off_t endoffset;
	int error;

	if (offset < 0) {
		return EINVAL;
	}
	if (len == 0) {
		endoffset = OFF_MAX;
	} else if (len > 0 && (OFF_MAX - offset) >= len) {
		endoffset = offset + len;
	} else {
		return EINVAL;
	}

	CTASSERT(POSIX_FADV_NORMAL == UVM_ADV_NORMAL);
	CTASSERT(POSIX_FADV_RANDOM == UVM_ADV_RANDOM);
	CTASSERT(POSIX_FADV_SEQUENTIAL == UVM_ADV_SEQUENTIAL);

	switch (advice) {
	case POSIX_FADV_WILLNEED:
	case POSIX_FADV_DONTNEED:
		if (vp->v_type != VREG && vp->v_type != VBLK)
			return 0;
		break;
	}

	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_SEQUENTIAL:
		/*
		 * We ignore offset and size.  Must lock the file to
		 * do this, as f_advice is sub-word sized.
		 */
		mutex_enter(&fp->f_lock);
		fp->f_advice = (u_char)advice;
		mutex_exit(&fp->f_lock);
		error = 0;
		break;

	case POSIX_FADV_WILLNEED:
		error = uvm_readahead(&vp->v_uobj, offset, endoffset - offset);
		break;

	case POSIX_FADV_DONTNEED:
		/*
		 * Align the region to page boundaries as VOP_PUTPAGES expects
		 * by shrinking it.  We shrink instead of expand because we
		 * do not want to deactivate cache outside of the requested
		 * region.  It means that if the specified region is smaller
		 * than PAGE_SIZE, we do nothing.
		 */
		if (offset <= trunc_page(OFF_MAX) &&
		    round_page(offset) < trunc_page(endoffset)) {
			rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
			error = VOP_PUTPAGES(vp,
			    round_page(offset), trunc_page(endoffset),
			    PGO_DEACTIVATE | PGO_CLEANIT);
		} else {
			error = 0;
		}
		break;

	case POSIX_FADV_NOREUSE:
		/* Not implemented yet. */
		error = 0;
		break;
	default:
		error = EINVAL;
		break;
	}

	return error;
}
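
/*
 * Worked example (compiled out, hypothetical values, PAGE_SIZE assumed to
 * be 4096) of the POSIX_FADV_DONTNEED shrinking above: the region is
 * rounded inward so only whole pages inside the request are deactivated.
 */
#if 0
static void
example_fadvise_shrink(void)
{
	/* Request [1000, 9000): only the full page [4096, 8192) qualifies. */
	off_t lo = round_page((off_t)1000);	/* 4096 */
	off_t hi = trunc_page((off_t)9000);	/* 8192 */

	/*
	 * A sub-page request such as [1000, 3000) yields lo (4096) >=
	 * hi (trunc_page(3000) == 0), so nothing would be deactivated.
	 */
	(void)lo;
	(void)hi;
}
#endif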

static int
vn_truncate(file_t *fp, off_t length)
{
	struct vattr vattr;
	struct vnode *vp;
	int error = 0;

	if (length < 0)
		return EINVAL;

	if ((fp->f_flag & FWRITE) == 0)
		return EINVAL;
	vp = fp->f_vnode;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR)
		error = EISDIR;
	else if ((error = vn_writechk(vp)) == 0) {
		vattr_null(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
	}
	VOP_UNLOCK(vp);

	return error;
}


/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
vn_lock(struct vnode *vp, int flags)
{
	struct lwp *l;
	int error;

	KASSERT(vrefcnt(vp) > 0);
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY|
	    LK_UPGRADE|LK_DOWNGRADE)) == 0);
	KASSERT((flags & LK_NOWAIT) != 0 || !mutex_owned(vp->v_interlock));

#ifdef DIAGNOSTIC
	if (wapbl_vphaswapbl(vp))
		WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp));
#endif

	/* Get a more useful report for lockstat. */
	l = curlwp;
	KASSERT(l->l_rwcallsite == 0);
	l->l_rwcallsite = (uintptr_t)__builtin_return_address(0);

	error = VOP_LOCK(vp, flags);

	l->l_rwcallsite = 0;

	switch (flags & (LK_RETRY | LK_NOWAIT)) {
	case 0:
		KASSERT(error == 0 || error == ENOENT);
		break;
	case LK_RETRY:
		KASSERT(error == 0);
		break;
	case LK_NOWAIT:
		KASSERT(error == 0 || error == EBUSY || error == ENOENT);
		break;
	case LK_RETRY | LK_NOWAIT:
		KASSERT(error == 0 || error == EBUSY);
		break;
	}

	return error;
}

/*
 * File table vnode close routine.
 */
static int
vn_closefile(file_t *fp)
{

	return vn_close(fp->f_vnode, fp->f_flag, fp->f_cred);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing a "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t *buflen, void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = *buflen;
	aiov.iov_base = bf;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL,
	    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp);

	if (error == 0)
		*buflen = *buflen - auio.uio_resid;

	return error;
}
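
/*
 * Example (illustrative sketch, compiled out): fetching a small extended
 * attribute through vn_extattr_get() above.  The helper name, attribute
 * name, and buffer size are hypothetical; EXTATTR_NAMESPACE_USER is
 * assumed to come from <sys/extattr.h>.  buflen goes in as the buffer
 * size and comes back as the number of bytes actually copied.
 */
#if 0
static int
example_get_extattr(struct vnode *vp)
{
	char buf[64];
	size_t buflen = sizeof(buf);
	int error;

	/* vp is referenced and unlocked, so IO_NODELOCKED is not set. */
	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_USER,
	    "example.attr", &buflen, buf, curlwp);
	return error;
}
#endif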

/*
 * XXX Failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t buflen, const void *bf, struct lwp *l)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_len = buflen;
	aiov.iov_base = __UNCONST(bf);		/* XXXUNCONST kills const */

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	UIO_SETUP_SYSSPACE(&auio);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return error;
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct lwp *l)
{
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NOCRED);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NOCRED);

	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp);
	}

	return error;
}

int
vn_fifo_bypass(void *v)
{
	struct vop_generic_args *ap = v;

	return VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, v);
}

/*
 * Open block device by device number
 */
int
vn_bdev_open(dev_t dev, struct vnode **vpp, struct lwp *l)
{
	int error;

	if ((error = bdevvp(dev, vpp)) != 0)
		return error;

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
	if ((error = VOP_OPEN(*vpp, FREAD | FWRITE, l->l_cred)) != 0) {
		vput(*vpp);
		return error;
	}
	mutex_enter((*vpp)->v_interlock);
	(*vpp)->v_writecount++;
	mutex_exit((*vpp)->v_interlock);
	VOP_UNLOCK(*vpp);

	return 0;
}

/*
 * Lookup the provided name in the filesystem.  If the file exists,
 * is a valid block device, and isn't being used by anyone else,
 * set *vpp to the file's vnode.
 */
int
vn_bdev_openpath(struct pathbuf *pb, struct vnode **vpp, struct lwp *l)
{
	struct vnode *vp;
	dev_t dev;
	enum vtype vt;
	int error;

	error = vn_open(NULL, pb, 0, FREAD | FWRITE, 0, &vp, NULL, NULL);
	if (error != 0)
		return error;

	dev = vp->v_rdev;
	vt = vp->v_type;

	VOP_UNLOCK(vp);
	(void) vn_close(vp, FREAD | FWRITE, l->l_cred);

	if (vt != VBLK)
		return ENOTBLK;

	return vn_bdev_open(dev, vpp, l);
}

static long
vn_knote_to_interest(const struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
		/*
		 * Writing to the file or changing its attributes can
		 * set the file size, which impacts the readability
		 * filter.
		 *
		 * (No need to set NOTE_EXTEND here; it's only ever
		 * sent with other hints; see vnode_if.c.)
		 */
		return NOTE_WRITE | NOTE_ATTRIB;

	case EVFILT_VNODE:
		return kn->kn_sfflags;

	case EVFILT_WRITE:
	default:
		return 0;
	}
}

void
vn_knote_attach(struct vnode *vp, struct knote *kn)
{
	struct vnode_klist *vk = vp->v_klist;
	long interest = 0;

	/*
	 * In the case of layered / stacked file systems, knotes
	 * should only ever be associated with the base vnode.
	 */
	KASSERT(kn->kn_hook == vp);
	KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);

	/*
	 * We maintain a bitmask of the kevents that there is interest in,
	 * to minimize the impact of having watchers.  It's silly to have
	 * to traverse vn_klist every time a read or write happens simply
	 * because there is someone interested in knowing when the file
	 * is deleted, for example.
	 */

	mutex_enter(vp->v_interlock);
	SLIST_INSERT_HEAD(&vk->vk_klist, kn, kn_selnext);
	SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
		interest |= vn_knote_to_interest(kn);
	}
	vk->vk_interest = interest;
	mutex_exit(vp->v_interlock);
}

void
vn_knote_detach(struct vnode *vp, struct knote *kn)
{
	struct vnode_klist *vk = vp->v_klist;
	long interest = 0;

	/* See above. */
	KASSERT(kn->kn_hook == vp);
	KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);

	/*
	 * We special case removing the head of the list, because:
	 *
	 * 1. It's extremely likely that we're detaching the only
	 *    knote.
	 *
	 * 2. We're already traversing the whole list, so we don't
	 *    want to use the generic SLIST_REMOVE() which would
	 *    traverse it *again*.
	 */

	mutex_enter(vp->v_interlock);
	if (__predict_true(kn == SLIST_FIRST(&vk->vk_klist))) {
		SLIST_REMOVE_HEAD(&vk->vk_klist, kn_selnext);
		SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
			interest |= vn_knote_to_interest(kn);
		}
		vk->vk_interest = interest;
	} else {
		struct knote *thiskn, *nextkn, *prevkn = NULL;

		SLIST_FOREACH_SAFE(thiskn, &vk->vk_klist, kn_selnext, nextkn) {
			if (thiskn == kn) {
				KASSERT(kn != NULL);
				KASSERT(prevkn != NULL);
				SLIST_REMOVE_AFTER(prevkn, kn_selnext);
				kn = NULL;
			} else {
				interest |= vn_knote_to_interest(thiskn);
				prevkn = thiskn;
			}
		}
		vk->vk_interest = interest;
	}
	mutex_exit(vp->v_interlock);
}
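
/*
 * Example (illustrative sketch, compiled out): how a driver or pseudo-device
 * might use vn_bdev_openpath() above to turn a pathname such as "/dev/wd0a"
 * into an opened block-device vnode.  The helper name, path handling, and
 * error handling are hypothetical.
 */
#if 0
static int
example_open_bdev(const char *path, struct vnode **vpp)
{
	struct pathbuf *pb;
	int error;

	pb = pathbuf_create(path);
	if (pb == NULL)
		return ENOMEM;
	error = vn_bdev_openpath(pb, vpp, curlwp);
	pathbuf_destroy(pb);
	/* On success, *vpp is an unlocked, opened block device vnode. */
	return error;
}
#endif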