vfs_vnops.c revision 276649
/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2013, 2014 The FreeBSD Foundation
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_vnops.c 276649 2015-01-04 00:49:45Z kib $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>
#include <sys/unistd.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

static fo_rdwr_t	vn_read;
static fo_rdwr_t	vn_write;
static fo_rdwr_t	vn_io_fault;
static fo_truncate_t	vn_truncate;
static fo_ioctl_t	vn_ioctl;
static fo_poll_t	vn_poll;
static fo_kqfilter_t	vn_kqfilter;
static fo_stat_t	vn_statfile;
static fo_close_t	vn_closefile;

struct fileops vnops = {
	.fo_read = vn_io_fault,
	.fo_write = vn_io_fault,
	.fo_truncate = vn_truncate,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_chmod = vn_chmod,
	.fo_chown = vn_chown,
	.fo_sendfile = vn_sendfile,
	.fo_seek = vn_seek,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

static const int io_hold_cnt = 16;
static int vn_io_fault_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
    &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
static u_long vn_io_faults_cnt;
SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
    &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");

/*
 * Returns true if vn_io_fault mode of handling the i/o request should
 * be used.
 */
static bool
do_vn_io_fault(struct vnode *vp, struct uio *uio)
{
	struct mount *mp;

	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
	    (mp = vp->v_mount) != NULL &&
	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
}

/*
 * Structure used to pass arguments to vn_io_fault1(), to do either
 * file- or vnode-based I/O calls.
 */
struct vn_io_fault_args {
	enum {
		VN_IO_FAULT_FOP,
		VN_IO_FAULT_VOP
	} kind;
	struct ucred *cred;
	int flags;
	union {
		struct fop_args_tag {
			struct file *fp;
			fo_rdwr_t *doio;
		} fop_args;
		struct vop_args_tag {
			struct vnode *vp;
		} vop_args;
	} args;
};

static int vn_io_fault1(struct vnode *vp, struct uio *uio,
    struct vn_io_fault_args *args, struct thread *td);

int
vn_open(ndp, flagp, cmode, fp)
	struct nameidata *ndp;
	int *flagp, cmode;
	struct file *fp;
{
	struct thread *td = ndp->ni_cnd.cn_thread;

	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
}
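
/*
 * A minimal sketch of in-kernel open-by-path through the wrapper above;
 * the path, flags and cleanup are illustrative only:
 *
 *	struct nameidata nd;
 *	int flags, error;
 *
 *	flags = FREAD;
 *	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/some/path", td);
 *	error = vn_open(&nd, &flags, 0, NULL);
 *	if (error == 0) {
 *		NDFREE(&nd, NDF_ONLY_PNBUF);
 *		... use nd.ni_vp, which is returned locked and referenced ...
 *		VOP_UNLOCK(nd.ni_vp, 0);
 *		vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
 *	}
 *
 * As the comment below notes, vn_open() does not free the nameidata on
 * success; the caller releases the path buffer and the vnode itself.
 */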

/*
 * Common code for vnode open operations via a name lookup.
 * Look up the vnode and invoke VOP_CREATE if needed.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
    struct ucred *cred, struct file *fp)
{
	struct vnode *vp;
	struct mount *mp;
	struct thread *td = ndp->ni_cnd.cn_thread;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode, error;

restart:
	fmode = *flagp;
	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		/*
		 * Set NOCACHE to avoid flushing the cache when
		 * rolling in many files at once.
		 */
		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE;
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
		bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		if (ndp->ni_vp == NULL) {
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				vput(ndp->ni_dvp);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH)) != 0)
					return (error);
				goto restart;
			}
			if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
				ndp->ni_cnd.cn_flags |= MAKEENTRY;
#ifdef MAC
			error = mac_vnode_check_create(cred, ndp->ni_dvp,
			    &ndp->ni_cnd, vap);
			if (error == 0)
#endif
				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
				    &ndp->ni_cnd, vap);
			vput(ndp->ni_dvp);
			vn_finished_write(mp);
			if (error) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				return (error);
			}
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
		} else {
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags = ISOPEN |
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
		if (!(fmode & FWRITE))
			ndp->ni_cnd.cn_flags |= LOCKSHARED;
		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
		if ((error = namei(ndp)) != 0)
			return (error);
		vp = ndp->ni_vp;
	}
	error = vn_open_vnode(vp, fmode, cred, td, fp);
	if (error)
		goto bad;
	*flagp = fmode;
	return (0);
bad:
	NDFREE(ndp, NDF_ONLY_PNBUF);
	vput(vp);
	*flagp = fmode;
	ndp->ni_vp = NULL;
	return (error);
}

/*
 * Common code for vnode open operations once a vnode is located.
 * Check permissions, and call the VOP_OPEN routine.
 */
int
vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
    struct thread *td, struct file *fp)
{
	struct mount *mp;
	accmode_t accmode;
	struct flock lf;
	int error, have_flock, lock_flags, type;

	if (vp->v_type == VLNK)
		return (EMLINK);
	if (vp->v_type == VSOCK)
		return (EOPNOTSUPP);
	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
		return (ENOTDIR);
	accmode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR)
			return (EISDIR);
		accmode |= VWRITE;
	}
	if (fmode & FREAD)
		accmode |= VREAD;
	if (fmode & FEXEC)
		accmode |= VEXEC;
	if ((fmode & O_APPEND) && (fmode & FWRITE))
		accmode |= VAPPEND;
#ifdef MAC
	error = mac_vnode_check_open(cred, vp, accmode);
	if (error)
		return (error);
#endif
	if ((fmode & O_CREAT) == 0) {
		if (accmode & VWRITE) {
			error = vn_writechk(vp);
			if (error)
				return (error);
		}
		if (accmode) {
			error = VOP_ACCESS(vp, accmode, cred, td);
			if (error)
				return (error);
		}
	}
	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
		vn_lock(vp, LK_UPGRADE | LK_RETRY);
	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
		return (error);

	if (fmode & (O_EXLOCK | O_SHLOCK)) {
		KASSERT(fp != NULL, ("open with flock requires fp"));
		lock_flags = VOP_ISLOCKED(vp);
		VOP_UNLOCK(vp, 0);
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (fmode & O_EXLOCK)
			lf.l_type = F_WRLCK;
		else
			lf.l_type = F_RDLCK;
		type = F_FLOCK;
		if ((fmode & FNONBLOCK) == 0)
			type |= F_WAIT;
		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
		have_flock = (error == 0);
		vn_lock(vp, lock_flags | LK_RETRY);
		if (error == 0 && vp->v_iflag & VI_DOOMED)
			error = ENOENT;
		/*
		 * Another thread might have used this vnode as an
		 * executable while the vnode lock was dropped.
		 * Ensure the vnode is still able to be opened for
		 * writing after the lock has been obtained.
		 */
		if (error == 0 && accmode & VWRITE)
			error = vn_writechk(vp);
		if (error) {
			VOP_UNLOCK(vp, 0);
			if (have_flock) {
				lf.l_whence = SEEK_SET;
				lf.l_start = 0;
				lf.l_len = 0;
				lf.l_type = F_UNLCK;
				(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf,
				    F_FLOCK);
			}
			vn_start_write(vp, &mp, V_WAIT);
			vn_lock(vp, lock_flags | LK_RETRY);
			(void)VOP_CLOSE(vp, fmode, cred, td);
			vn_finished_write(mp);
			/* Prevent second close from fdrop()->vn_close(). */
			if (fp != NULL)
				fp->f_ops = &badfileops;
			return (error);
		}
		fp->f_flag |= FHASLOCK;
	}
	if (fmode & FWRITE) {
		VOP_ADD_WRITECOUNT(vp, 1);
		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
		    __func__, vp, vp->v_writecount);
	}
	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
	return (0);
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(vp)
	register struct vnode *vp;
{

	ASSERT_VOP_LOCKED(vp, "vn_writechk");
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (VOP_IS_TEXT(vp))
		return (ETXTBSY);

	return (0);
}

/*
 * Vnode close call
 */
int
vn_close(vp, flags, file_cred, td)
	register struct vnode *vp;
	int flags;
	struct ucred *file_cred;
	struct thread *td;
{
	struct mount *mp;
	int error, lock_flags;

	if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
	    MNT_EXTENDED_SHARED(vp->v_mount))
		lock_flags = LK_SHARED;
	else
		lock_flags = LK_EXCLUSIVE;

	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, lock_flags | LK_RETRY);
	if (flags & FWRITE) {
		VNASSERT(vp->v_writecount > 0, vp,
		    ("vn_close: negative writecount"));
		VOP_ADD_WRITECOUNT(vp, -1);
		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
		    __func__, vp, vp->v_writecount);
	}
	error = VOP_CLOSE(vp, flags, file_cred, td);
	vput(vp);
	vn_finished_write(mp);
	return (error);
}

/*
 * Heuristic to detect sequential operation.
 */
static int
sequential_heuristic(struct uio *uio, struct file *fp)
{

	ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
	if (fp->f_flag & FRDAHEAD)
		return (fp->f_seqcount << IO_SEQSHIFT);

	/*
	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
	 * that the first I/O is normally considered to be slightly
	 * sequential.  Seeking to offset 0 doesn't change sequentiality
	 * unless previous seeks have reduced f_seqcount to 0, in which
	 * case offset 0 is not special.
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		/*
		 * f_seqcount is in units of fixed-size blocks so that it
		 * depends mainly on the amount of sequential I/O and not
		 * much on the number of sequential I/O's.  The fixed size
		 * of 16384 is hard-coded here since it is (not quite) just
		 * a magic size that works well here.  This size is more
		 * closely related to the best I/O size for real disks than
		 * to any block size used by software.
		 */
		fp->f_seqcount += howmany(uio->uio_resid, 16384);
		if (fp->f_seqcount > IO_SEQMAX)
			fp->f_seqcount = IO_SEQMAX;
		return (fp->f_seqcount << IO_SEQSHIFT);
	}

	/* Not sequential.  Quickly draw-down sequentiality. */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return (0);
}
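
/*
 * For example, a strictly sequential reader issuing 64kB read(2) calls
 * grows f_seqcount by howmany(65536, 16384) == 4 per call until it
 * saturates at IO_SEQMAX; the value handed back to the caller is that
 * count shifted into the IO_SEQSHIFT bits of ioflag, which filesystems
 * may use to scale read-ahead (see the ioflag handling in vn_read() and
 * vn_write() below).
 */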

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, struct ucred *active_cred,
    struct ucred *file_cred, ssize_t *aresid, struct thread *td)
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct ucred *cred;
	void *rl_cookie;
	struct vn_io_fault_args args;
	int error, lock_flags;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	error = 0;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((ioflg & IO_RANGELOCKED) == 0) {
			if (rw == UIO_READ) {
				rl_cookie = vn_rangelock_rlock(vp, offset,
				    offset + len);
			} else {
				rl_cookie = vn_rangelock_wlock(vp, offset,
				    offset + len);
			}
		} else
			rl_cookie = NULL;
		mp = NULL;
		if (rw == UIO_WRITE) {
			if (vp->v_type != VCHR &&
			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
			    != 0)
				goto out;
			if (MNT_SHARED_WRITES(mp) ||
			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
				lock_flags = LK_SHARED;
			else
				lock_flags = LK_EXCLUSIVE;
		} else
			lock_flags = LK_SHARED;
		vn_lock(vp, lock_flags | LK_RETRY);
	} else
		rl_cookie = NULL;

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
#ifdef MAC
	if ((ioflg & IO_NOMACCHECK) == 0) {
		if (rw == UIO_READ)
			error = mac_vnode_check_read(active_cred, file_cred,
			    vp);
		else
			error = mac_vnode_check_write(active_cred, file_cred,
			    vp);
	}
#endif
	if (error == 0) {
		if (file_cred != NULL)
			cred = file_cred;
		else
			cred = active_cred;
		if (do_vn_io_fault(vp, &auio)) {
			args.kind = VN_IO_FAULT_VOP;
			args.cred = cred;
			args.flags = ioflg;
			args.args.vop_args.vp = vp;
			error = vn_io_fault1(vp, &auio, &args, td);
		} else if (rw == UIO_READ) {
			error = VOP_READ(vp, &auio, ioflg, cred);
		} else /* if (rw == UIO_WRITE) */ {
			error = VOP_WRITE(vp, &auio, ioflg, cred);
		}
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		VOP_UNLOCK(vp, 0);
		if (mp != NULL)
			vn_finished_write(mp);
	}
 out:
	if (rl_cookie != NULL)
		vn_rangelock_unlock(vp, rl_cookie);
	return (error);
}
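
/*
 * Typical in-kernel use of vn_rdwr() is a single bounded transfer on an
 * already-referenced vnode.  A minimal sketch, in which the buffer,
 * length and credentials are placeholders:
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), (off_t)0,
 *	    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td);
 *
 * With IO_NODELOCKED the caller must already hold the vnode lock;
 * without it, vn_rdwr() range-locks and vnode-locks the vnode itself,
 * as done above.
 */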

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	void *base;
	size_t len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	size_t *aresid;
	struct thread *td;
{
	int error = 0;
	ssize_t iaresid;

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		iaresid = 0;
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
		    ioflg, active_cred, file_cred, &iaresid, td);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base = (char *)base + chunk;
		kern_yield(PRI_USER);
	} while (len);
	if (aresid)
		*aresid = len + iaresid;
	return (error);
}
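
/*
 * For example, with the usual MAXBSIZE of 65536, a 200000-byte transfer
 * starting at offset 1000 is issued as chunks of 64536, 65536, 65536 and
 * 4392 bytes: only the first and last chunks can be partial blocks, so
 * the filesystem sees full, aligned blocks in between, and for regular-
 * file writes bwillwrite() is checked before each chunk to avoid
 * saturating the buffer cache.
 */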

off_t
foffset_lock(struct file *fp, int flags)
{
	struct mtx *mtxp;
	off_t res;

	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));

#if OFF_MAX <= LONG_MAX
	/*
	 * Caller only wants the current f_offset value.  Assume that
	 * the long and shorter integer types reads are atomic.
	 */
	if ((flags & FOF_NOLOCK) != 0)
		return (fp->f_offset);
#endif

	/*
	 * According to McKusick the vn lock was protecting f_offset here.
	 * It is now protected by the FOFFSET_LOCKED flag.
	 */
	mtxp = mtx_pool_find(mtxpool_sleep, fp);
	mtx_lock(mtxp);
	if ((flags & FOF_NOLOCK) == 0) {
		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
			msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
			    "vofflock", 0);
		}
		fp->f_vnread_flags |= FOFFSET_LOCKED;
	}
	res = fp->f_offset;
	mtx_unlock(mtxp);
	return (res);
}

void
foffset_unlock(struct file *fp, off_t val, int flags)
{
	struct mtx *mtxp;

	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));

#if OFF_MAX <= LONG_MAX
	if ((flags & FOF_NOLOCK) != 0) {
		if ((flags & FOF_NOUPDATE) == 0)
			fp->f_offset = val;
		if ((flags & FOF_NEXTOFF) != 0)
			fp->f_nextoff = val;
		return;
	}
#endif

	mtxp = mtx_pool_find(mtxpool_sleep, fp);
	mtx_lock(mtxp);
	if ((flags & FOF_NOUPDATE) == 0)
		fp->f_offset = val;
	if ((flags & FOF_NEXTOFF) != 0)
		fp->f_nextoff = val;
	if ((flags & FOF_NOLOCK) == 0) {
		KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
		    ("Lost FOFFSET_LOCKED"));
		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
			wakeup(&fp->f_vnread_flags);
		fp->f_vnread_flags = 0;
	}
	mtx_unlock(mtxp);
}

void
foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
{

	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = foffset_lock(fp, flags);
}

void
foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
{

	if ((flags & FOF_OFFSET) == 0)
		foffset_unlock(fp, uio->uio_offset, flags);
}

static int
get_advice(struct file *fp, struct uio *uio)
{
	struct mtx *mtxp;
	int ret;

	ret = POSIX_FADV_NORMAL;
	if (fp->f_advice == NULL)
		return (ret);

	mtxp = mtx_pool_find(mtxpool_sleep, fp);
	mtx_lock(mtxp);
	if (uio->uio_offset >= fp->f_advice->fa_start &&
	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
		ret = fp->f_advice->fa_advice;
	mtx_unlock(mtxp);
	return (ret);
}

/*
 * File table vnode read routine.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	int flags;
	struct thread *td;
{
	struct vnode *vp;
	struct mtx *mtxp;
	int error, ioflag;
	int advice;
	off_t offset, start, end;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
	vp = fp->f_vnode;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	advice = get_advice(fp, uio);
	vn_lock(vp, LK_SHARED | LK_RETRY);

	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_NOREUSE:
		ioflag |= sequential_heuristic(uio, fp);
		break;
	case POSIX_FADV_RANDOM:
		/* Disable read-ahead for random I/O. */
		break;
	}
	offset = uio->uio_offset;

#ifdef MAC
	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
	    offset != uio->uio_offset) {
		/*
		 * Use POSIX_FADV_DONTNEED to flush clean pages and
		 * buffers for the backing file after a
		 * POSIX_FADV_NOREUSE read(2).  To optimize the common
		 * case of using POSIX_FADV_NOREUSE with sequential
		 * access, track the previous implicit DONTNEED
		 * request and grow this request to include the
		 * current read(2) in addition to the previous
		 * DONTNEED.  With purely sequential access this will
		 * cause the DONTNEED requests to continuously grow to
		 * cover all of the previously read regions of the
		 * file.  This allows filesystem blocks that are
		 * accessed by multiple calls to read(2) to be flushed
		 * once the last read(2) finishes.
		 */
		start = offset;
		end = uio->uio_offset - 1;
		mtxp = mtx_pool_find(mtxpool_sleep, fp);
		mtx_lock(mtxp);
		if (fp->f_advice != NULL &&
		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
				start = fp->f_advice->fa_prevstart;
			else if (fp->f_advice->fa_prevstart != 0 &&
			    fp->f_advice->fa_prevstart == end + 1)
				end = fp->f_advice->fa_prevend;
			fp->f_advice->fa_prevstart = start;
			fp->f_advice->fa_prevend = end;
		}
		mtx_unlock(mtxp);
		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
	}
	return (error);
}
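
/*
 * The f_advice region consulted by get_advice() is established by
 * posix_fadvise(2).  For instance, a process that announces one-pass
 * access to a file with
 *
 *	posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
 *
 * causes the NOREUSE handling in vn_read() above and vn_write() below
 * to issue the implicit POSIX_FADV_DONTNEED ranges described in the
 * surrounding comments.
 */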
/*
 * File table vnode write routine.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	int flags;
	struct thread *td;
{
	struct vnode *vp;
	struct mount *mp;
	struct mtx *mtxp;
	int error, ioflag, lock_flags;
	int advice;
	off_t offset, start, end;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
	vp = fp->f_vnode;
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		goto unlock;

	advice = get_advice(fp, uio);

	if (MNT_SHARED_WRITES(mp) ||
	    (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
		lock_flags = LK_SHARED;
	} else {
		lock_flags = LK_EXCLUSIVE;
	}

	vn_lock(vp, lock_flags | LK_RETRY);
	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_NOREUSE:
		ioflag |= sequential_heuristic(uio, fp);
		break;
	case POSIX_FADV_RANDOM:
		/* XXX: Is this correct? */
		break;
	}
	offset = uio->uio_offset;

#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	if (vp->v_type != VCHR)
		vn_finished_write(mp);
	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
	    offset != uio->uio_offset) {
		/*
		 * Use POSIX_FADV_DONTNEED to flush clean pages and
		 * buffers for the backing file after a
		 * POSIX_FADV_NOREUSE write(2).  To optimize the
		 * common case of using POSIX_FADV_NOREUSE with
		 * sequential access, track the previous implicit
		 * DONTNEED request and grow this request to include
		 * the current write(2) in addition to the previous
		 * DONTNEED.  With purely sequential access this will
		 * cause the DONTNEED requests to continuously grow to
		 * cover all of the previously written regions of the
		 * file.
		 *
		 * Note that the blocks just written are almost
		 * certainly still dirty, so this only works when
		 * VOP_ADVISE() calls from subsequent writes push out
		 * the data written by this write(2) once the backing
		 * buffers are clean.  However, as compared to forcing
		 * IO_DIRECT, this gives much saner behavior.  Write
		 * clustering is still allowed, and clean pages are
		 * merely moved to the cache page queue rather than
		 * outright thrown away.  This means a subsequent
		 * read(2) can still avoid hitting the disk if the
		 * pages have not been reclaimed.
		 *
		 * This does make POSIX_FADV_NOREUSE largely useless
		 * with non-sequential access.  However, sequential
		 * access is the more common use case and the flag is
		 * merely advisory.
		 */
		start = offset;
		end = uio->uio_offset - 1;
		mtxp = mtx_pool_find(mtxpool_sleep, fp);
		mtx_lock(mtxp);
		if (fp->f_advice != NULL &&
		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
				start = fp->f_advice->fa_prevstart;
			else if (fp->f_advice->fa_prevstart != 0 &&
			    fp->f_advice->fa_prevstart == end + 1)
				end = fp->f_advice->fa_prevend;
			fp->f_advice->fa_prevstart = start;
			fp->f_advice->fa_prevend = end;
		}
		mtx_unlock(mtxp);
		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
	}

unlock:
	return (error);
}

/*
 * vn_io_fault() is a wrapper around vn_read() and vn_write() to
 * prevent the following deadlock:
 *
 * Assume that thread A reads from vnode vp1 into userspace
 * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
 * currently not resident, then the system ends up with the call chain
 *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
 *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
 * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
 * If, at the same time, thread B reads from vnode vp2 into buffer buf2
 * backed by the pages of vnode vp1, and some page in buf2 is not
 * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
 *
 * To prevent the lock order reversal and deadlock, vn_io_fault() does
 * not allow page faults to happen during VOP_READ() or VOP_WRITE().
 * Instead, it first tries to do the whole range i/o with pagefaults
 * disabled.  If all pages in the i/o buffer are resident and mapped,
 * the VOP will succeed (ignoring genuine filesystem errors).
 * Otherwise, we get back EFAULT, and vn_io_fault() falls back to doing
 * the i/o in chunks, with all pages in the chunk prefaulted and held
 * using vm_fault_quick_hold_pages().
 *
 * Filesystems using this deadlock avoidance scheme should use the
 * array of the held pages from uio, saved in curthread->td_ma,
 * instead of doing uiomove().  A helper function
 * vn_io_fault_uiomove() converts a uiomove request into
 * uiomove_fromphys() over the td_ma array.
 *
 * Since vnode locks do not cover the whole i/o anymore, rangelocks
 * make the current i/o request atomic with respect to other i/os and
 * truncations.
 */

/*
 * Decode vn_io_fault_args and perform the corresponding i/o.
 */
static int
vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
    struct thread *td)
{

	switch (args->kind) {
	case VN_IO_FAULT_FOP:
		return ((args->args.fop_args.doio)(args->args.fop_args.fp,
		    uio, args->cred, args->flags, td));
	case VN_IO_FAULT_VOP:
		if (uio->uio_rw == UIO_READ) {
			return (VOP_READ(args->args.vop_args.vp, uio,
			    args->flags, args->cred));
		} else if (uio->uio_rw == UIO_WRITE) {
			return (VOP_WRITE(args->args.vop_args.vp, uio,
			    args->flags, args->cred));
		}
		break;
	}
	panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind,
	    uio->uio_rw);
}

/*
 * Common code for vn_io_fault(), agnostic to the kind of i/o request.
 * Uses vn_io_fault_doio() to make the call to an actual i/o function.
 * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
 * into args and call vn_io_fault1() to handle faults during the user
 * mode buffer accesses.
 */
static int
vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
    struct thread *td)
{
	vm_page_t ma[io_hold_cnt + 2];
	struct uio *uio_clone, short_uio;
	struct iovec short_iovec[1];
	vm_page_t *prev_td_ma;
	vm_prot_t prot;
	vm_offset_t addr, end;
	size_t len, resid;
	ssize_t adv;
	int error, cnt, save, saveheld, prev_td_ma_cnt;

	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;

	/*
	 * UFS follows the IO_UNIT directive and rolls back both
	 * uio_offset and uio_resid if an error is encountered during the
	 * operation.  But, since the iovec may already be advanced,
	 * uio is still in an inconsistent state.
	 *
	 * Cache a copy of the original uio, which is advanced to the redo
	 * point using UIO_NOCOPY below.
	 */
	uio_clone = cloneuio(uio);
	resid = uio->uio_resid;

	short_uio.uio_segflg = UIO_USERSPACE;
	short_uio.uio_rw = uio->uio_rw;
	short_uio.uio_td = uio->uio_td;

	save = vm_fault_disable_pagefaults();
	error = vn_io_fault_doio(args, uio, td);
	if (error != EFAULT)
		goto out;

	atomic_add_long(&vn_io_faults_cnt, 1);
	uio_clone->uio_segflg = UIO_NOCOPY;
	uiomove(NULL, resid - uio->uio_resid, uio_clone);
	uio_clone->uio_segflg = uio->uio_segflg;

	saveheld = curthread_pflags_set(TDP_UIOHELD);
	prev_td_ma = td->td_ma;
	prev_td_ma_cnt = td->td_ma_cnt;

	while (uio_clone->uio_resid != 0) {
		len = uio_clone->uio_iov->iov_len;
		if (len == 0) {
			KASSERT(uio_clone->uio_iovcnt >= 1,
			    ("iovcnt underflow"));
			uio_clone->uio_iov++;
			uio_clone->uio_iovcnt--;
			continue;
		}
		if (len > io_hold_cnt * PAGE_SIZE)
			len = io_hold_cnt * PAGE_SIZE;
		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
		end = round_page(addr + len);
		if (end < addr) {
			error = EFAULT;
			break;
		}
		cnt = atop(end - trunc_page(addr));
		/*
		 * A perfectly misaligned address and length could cause
		 * both the start and the end of the chunk to use a partial
		 * page.  +2 accounts for such a situation.
		 */
		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
		    addr, len, prot, ma, io_hold_cnt + 2);
		if (cnt == -1) {
			error = EFAULT;
			break;
		}
		short_uio.uio_iov = &short_iovec[0];
		short_iovec[0].iov_base = (void *)addr;
		short_uio.uio_iovcnt = 1;
		short_uio.uio_resid = short_iovec[0].iov_len = len;
		short_uio.uio_offset = uio_clone->uio_offset;
		td->td_ma = ma;
		td->td_ma_cnt = cnt;

		error = vn_io_fault_doio(args, &short_uio, td);
		vm_page_unhold_pages(ma, cnt);
		adv = len - short_uio.uio_resid;

		uio_clone->uio_iov->iov_base =
		    (char *)uio_clone->uio_iov->iov_base + adv;
		uio_clone->uio_iov->iov_len -= adv;
		uio_clone->uio_resid -= adv;
		uio_clone->uio_offset += adv;

		uio->uio_resid -= adv;
		uio->uio_offset += adv;

		if (error != 0 || adv == 0)
			break;
	}
	td->td_ma = prev_td_ma;
	td->td_ma_cnt = prev_td_ma_cnt;
	curthread_pflags_restore(saveheld);
out:
	vm_fault_enable_pagefaults(save);
	free(uio_clone, M_IOV);
	return (error);
}

static int
vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	fo_rdwr_t *doio;
	struct vnode *vp;
	void *rl_cookie;
	struct vn_io_fault_args args;
	int error;

	doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
	vp = fp->f_vnode;
	foffset_lock_uio(fp, uio, flags);
	if (do_vn_io_fault(vp, uio)) {
		args.kind = VN_IO_FAULT_FOP;
		args.args.fop_args.fp = fp;
		args.args.fop_args.doio = doio;
		args.cred = active_cred;
		args.flags = flags | FOF_OFFSET;
		if (uio->uio_rw == UIO_READ) {
			rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
			    uio->uio_offset + uio->uio_resid);
		} else if ((fp->f_flag & O_APPEND) != 0 ||
		    (flags & FOF_OFFSET) == 0) {
			/* For appenders, punt and lock the whole range. */
			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
		} else {
			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
			    uio->uio_offset + uio->uio_resid);
		}
		error = vn_io_fault1(vp, uio, &args, td);
		vn_rangelock_unlock(vp, rl_cookie);
	} else {
		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
	}
	foffset_unlock_uio(fp, uio, flags);
	return (error);
}

/*
 * Helper function to perform the requested uiomove operation using
 * the held pages for the uio->uio_iov[0].iov_base buffer instead of
 * copyin/copyout.  Access to the pages with uiomove_fromphys()
 * instead of iov_base prevents page faults that could occur due to
 * pmap_collect() invalidating the mapping created by
 * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
 * object cleanup revoking the write access from page mappings.
 *
 * Filesystems that specify MNTK_NO_IOPF shall use vn_io_fault_uiomove()
 * instead of plain uiomove().
 */
int
vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
{
	struct uio transp_uio;
	struct iovec transp_iov[1];
	struct thread *td;
	size_t adv;
	int error, pgadv;

	td = curthread;
	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
	    uio->uio_segflg != UIO_USERSPACE)
		return (uiomove(data, xfersize, uio));

	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
	transp_iov[0].iov_base = data;
	transp_uio.uio_iov = &transp_iov[0];
	transp_uio.uio_iovcnt = 1;
	if (xfersize > uio->uio_resid)
		xfersize = uio->uio_resid;
	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
	transp_uio.uio_offset = 0;
	transp_uio.uio_segflg = UIO_SYSSPACE;
	/*
	 * Since transp_iov points to data, and td_ma page array
	 * corresponds to original uio->uio_iov, we need to invert the
	 * direction of the i/o operation as passed to
	 * uiomove_fromphys().
	 */
	switch (uio->uio_rw) {
	case UIO_WRITE:
		transp_uio.uio_rw = UIO_READ;
		break;
	case UIO_READ:
		transp_uio.uio_rw = UIO_WRITE;
		break;
	}
	transp_uio.uio_td = uio->uio_td;
	error = uiomove_fromphys(td->td_ma,
	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
	    xfersize, &transp_uio);
	adv = xfersize - transp_uio.uio_resid;
	pgadv =
	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
	td->td_ma += pgadv;
	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
	    pgadv));
	td->td_ma_cnt -= pgadv;
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
	uio->uio_iov->iov_len -= adv;
	uio->uio_resid -= adv;
	uio->uio_offset += adv;
	return (error);
}

int
vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
    struct uio *uio)
{
	struct thread *td;
	vm_offset_t iov_base;
	int cnt, pgadv;

	td = curthread;
	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
	    uio->uio_segflg != UIO_USERSPACE)
		return (uiomove_fromphys(ma, offset, xfersize, uio));

	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
	cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
	iov_base = (vm_offset_t)uio->uio_iov->iov_base;
	switch (uio->uio_rw) {
	case UIO_WRITE:
		pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
		    offset, cnt);
		break;
	case UIO_READ:
		pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
		    cnt);
		break;
	}
	pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
	td->td_ma += pgadv;
	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
	    pgadv));
	td->td_ma_cnt -= pgadv;
	uio->uio_iov->iov_base = (char *)(iov_base + cnt);
	uio->uio_iov->iov_len -= cnt;
	uio->uio_resid -= cnt;
	uio->uio_offset += cnt;
	return (0);
}
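
/*
 * A minimal sketch of how a filesystem that sets MNTK_NO_IOPF would use
 * the helpers above in its VOP_READ()/VOP_WRITE() buffer loop; the
 * buffer and length names are illustrative:
 *
 *	error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
 *	    (int)xfersize, uio);
 *
 * When the i/o was restarted by vn_io_fault1() the copy is served from
 * the held pages in td_ma; otherwise the call degenerates to a plain
 * uiomove().
 */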

/*
 * File table truncate routine.
 */
static int
vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct vattr vattr;
	struct mount *mp;
	struct vnode *vp;
	void *rl_cookie;
	int error;

	vp = fp->f_vnode;

	/*
	 * Lock the whole range for truncation.  Otherwise split i/o
	 * might happen partly before and partly after the truncation.
	 */
	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
	if (error)
		goto out1;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR) {
		error = EISDIR;
		goto out;
	}
#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error)
		goto out;
#endif
	error = vn_writechk(vp);
	if (error == 0) {
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
	}
out:
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
out1:
	vn_rangelock_unlock(vp, rl_cookie);
	return (error);
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(fp, sb, active_cred, td)
	struct file *fp;
	struct stat *sb;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	int error;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
	VOP_UNLOCK(vp, 0);

	return (error);
}

/*
 * Stat a vnode; implementation for the stat syscall
 */
int
vn_stat(vp, sb, active_cred, file_cred, td)
	struct vnode *vp;
	register struct stat *sb;
	struct ucred *active_cred;
	struct ucred *file_cred;
	struct thread *td;
{
	struct vattr vattr;
	register struct vattr *vap;
	int error;
	u_short mode;

#ifdef MAC
	error = mac_vnode_check_stat(active_cred, file_cred, vp);
	if (error)
		return (error);
#endif

	vap = &vattr;

	/*
	 * Initialize defaults for new and unusual fields, so that file
	 * systems which don't support these fields don't need to know
	 * about them.
	 */
	vap->va_birthtime.tv_sec = -1;
	vap->va_birthtime.tv_nsec = 0;
	vap->va_fsid = VNOVAL;
	vap->va_rdev = NODEV;

	error = VOP_GETATTR(vp, vap, active_cred);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	bzero(sb, sizeof *sb);

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	};
	sb->st_mode = mode;
	sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = vap->va_rdev;
	if (vap->va_size > OFF_MAX)
		return (EOVERFLOW);
	sb->st_size = vap->va_size;
	sb->st_atim = vap->va_atime;
	sb->st_mtim = vap->va_mtime;
	sb->st_ctim = vap->va_ctime;
	sb->st_birthtim = vap->va_birthtime;

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 * "a filesystem-specific preferred I/O block size for this
	 * object.  In some filesystem types, this may vary from file
	 * to file"
	 * Use minimum/default of PAGE_SIZE (e.g. for VCHR).
	 */

	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);

	sb->st_flags = vap->va_flags;
	if (priv_check(td, PRIV_VFS_GENERATION))
		sb->st_gen = 0;
	else
		sb->st_gen = vap->va_gen;

	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
	return (0);
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(fp, com, data, active_cred, td)
	struct file *fp;
	u_long com;
	void *data;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vattr vattr;
	struct vnode *vp;
	int error;

	vp = fp->f_vnode;
	switch (vp->v_type) {
	case VDIR:
	case VREG:
		switch (com) {
		case FIONREAD:
			vn_lock(vp, LK_SHARED | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, active_cred);
			VOP_UNLOCK(vp, 0);
			if (error == 0)
				*(int *)data = vattr.va_size - fp->f_offset;
			return (error);
		case FIONBIO:
		case FIOASYNC:
			return (0);
		default:
			return (VOP_IOCTL(vp, com, data, fp->f_flag,
			    active_cred, td));
		}
	default:
		return (ENOTTY);
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp;
	int error;

	vp = fp->f_vnode;
#ifdef MAC
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
	VOP_UNLOCK(vp, 0);
	if (!error)
#endif

		error = VOP_POLL(vp, events, fp->f_cred, td);
	return (error);
}

/*
 * Acquire the requested lock and then check for validity.  LK_RETRY
 * permits vn_lock to return doomed vnodes.
 */
int
_vn_lock(struct vnode *vp, int flags, char *file, int line)
{
	int error;

	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
	    ("vn_lock called with no locktype."));
	do {
#ifdef DEBUG_VFS_LOCKS
		KASSERT(vp->v_holdcnt != 0,
		    ("vn_lock %p: zero hold count", vp));
#endif
		error = VOP_LOCK1(vp, flags, file, line);
		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
		KASSERT((flags & LK_RETRY) == 0 || error == 0,
		    ("LK_RETRY set with incompatible flags (0x%x) or an error occurred (%d)",
		    flags, error));
		/*
		 * Callers specify LK_RETRY if they wish to get dead vnodes.
		 * If RETRY is not set, we return ENOENT instead.
		 */
		if (error == 0 && vp->v_iflag & VI_DOOMED &&
		    (flags & LK_RETRY) == 0) {
			VOP_UNLOCK(vp, 0);
			error = ENOENT;
			break;
		}
	} while (flags & LK_RETRY && error != 0);
	return (error);
}

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct vnode *vp;
	struct flock lf;
	int error;

	vp = fp->f_vnode;
	fp->f_ops = &badfileops;

	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK)
		vref(vp);

	error = vn_close(vp, fp->f_flag, fp->f_cred, td);

	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
		vrele(vp);
	}
	return (error);
}

/*
 * Preparing to start a filesystem write operation.  If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed.  If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
static int
vn_start_write_locked(struct mount *mp, int flags)
{
	int error, mflags;

	mtx_assert(MNT_MTX(mp), MA_OWNED);
	error = 0;

	/*
	 * Check on status of suspension.
	 */
	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
	    mp->mnt_susp_owner != curthread) {
		mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
		    (flags & PCATCH) : 0) | (PUSER - 1);
		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
			if (flags & V_NOWAIT) {
				error = EWOULDBLOCK;
				goto unlock;
			}
			error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
			    "suspfs", 0);
			if (error)
				goto unlock;
		}
	}
	if (flags & V_XSLEEP)
		goto unlock;
	mp->mnt_writeopcount++;
unlock:
	if (error != 0 || (flags & V_XSLEEP) != 0)
		MNT_REL(mp);
	MNT_IUNLOCK(mp);
	return (error);
}

int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

	error = 0;
	/*
	 * If a vnode is provided, get and return the mount point
	 * to which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);

	/*
	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
	 * a vfs_ref().
	 * As long as a vnode is not provided we need to acquire a
	 * refcount for the provided mountpoint too, in order to
	 * emulate a vfs_ref().
	 */
	MNT_ILOCK(mp);
	if (vp == NULL)
		MNT_REF(mp);

	return (vn_start_write_locked(mp, flags));
}
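
/*
 * The usual bracket around a sequence of filesystem modifications is
 * therefore (sketched; error handling omitted):
 *
 *	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	... issue VOPs that modify the file ...
 *	VOP_UNLOCK(vp, 0);
 *	vn_finished_write(mp);
 *
 * as done by vn_rdwr(), vn_truncate() and vn_extattr_set() elsewhere in
 * this file.  vn_start_write() blocks (or fails with EWOULDBLOCK for
 * V_NOWAIT) while a suspension requested by vfs_write_suspend() is in
 * progress.
 */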

/*
 * Secondary suspension.  Used by operations such as vop_inactive
 * routines that are needed by the higher level functions.  These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero).  At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_start_secondary_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

 retry:
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if ((mp = *mpp) == NULL)
		return (0);

	/*
	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
	 * a vfs_ref().
	 * As long as a vnode is not provided we need to acquire a
	 * refcount for the provided mountpoint too, in order to
	 * emulate a vfs_ref().
	 */
	MNT_ILOCK(mp);
	if (vp == NULL)
		MNT_REF(mp);
	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
		mp->mnt_secondary_writes++;
		mp->mnt_secondary_accwrites++;
		MNT_IUNLOCK(mp);
		return (0);
	}
	if (flags & V_NOWAIT) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		return (EWOULDBLOCK);
	}
	/*
	 * Wait for the suspension to finish.
	 */
	error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
	    ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
	    "suspfs", 0);
	vfs_rel(mp);
	if (error == 0)
		goto retry;
	return (error);
}

/*
 * Filesystem write operation has completed.  If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	MNT_REL(mp);
	mp->mnt_writeopcount--;
	if (mp->mnt_writeopcount < 0)
		panic("vn_finished_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_writeopcount <= 0)
		wakeup(&mp->mnt_writeopcount);
	MNT_IUNLOCK(mp);
}

/*
 * Filesystem secondary write operation has completed.  If we are
 * suspending and this operation is the last one, notify the suspender
 * that the suspension is now in effect.
 */
void
vn_finished_secondary_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	MNT_REL(mp);
	mp->mnt_secondary_writes--;
	if (mp->mnt_secondary_writes < 0)
		panic("vn_finished_secondary_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_secondary_writes <= 0)
		wakeup(&mp->mnt_secondary_writes);
	MNT_IUNLOCK(mp);
}

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(struct mount *mp, int flags)
{
	int error;

	MNT_ILOCK(mp);
	if (mp->mnt_susp_owner == curthread) {
		MNT_IUNLOCK(mp);
		return (EALREADY);
	}
	while (mp->mnt_kern_flag & MNTK_SUSPEND)
		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);

	/*
	 * Unmount holds a write reference on the mount point.  If we
	 * own busy reference and drain for writers, we deadlock with
	 * the reference draining in the unmount path.  Callers of
	 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
	 * vfs_busy() reference is owned and caller is not in the
	 * unmount context.
	 */
	if ((flags & VS_SKIP_UNMOUNT) != 0 &&
	    (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
		MNT_IUNLOCK(mp);
		return (EBUSY);
	}

	mp->mnt_kern_flag |= MNTK_SUSPEND;
	mp->mnt_susp_owner = curthread;
	if (mp->mnt_writeopcount > 0)
		(void) msleep(&mp->mnt_writeopcount,
		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
	else
		MNT_IUNLOCK(mp);
	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
		vfs_write_resume(mp, 0);
	return (error);
}

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(struct mount *mp, int flags)
{

	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
		    MNTK_SUSPENDED);
		mp->mnt_susp_owner = NULL;
		wakeup(&mp->mnt_writeopcount);
		wakeup(&mp->mnt_flag);
		curthread->td_pflags &= ~TDP_IGNSUSP;
		if ((flags & VR_START_WRITE) != 0) {
			MNT_REF(mp);
			mp->mnt_writeopcount++;
		}
		MNT_IUNLOCK(mp);
		if ((flags & VR_NO_SUSPCLR) == 0)
			VFS_SUSP_CLEAN(mp);
	} else if ((flags & VR_START_WRITE) != 0) {
		MNT_REF(mp);
		vn_start_write_locked(mp, 0);
	} else {
		MNT_IUNLOCK(mp);
	}
}

/*
 * Helper loop around vfs_write_suspend() for filesystem unmount VFS
 * methods.
 */
int
vfs_write_suspend_umnt(struct mount *mp)
{
	int error;

	KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
	    ("vfs_write_suspend_umnt: recursed"));

	/* dounmount() already called vn_start_write(). */
	for (;;) {
		vn_finished_write(mp);
		error = vfs_write_suspend(mp, 0);
		if (error != 0) {
			vn_start_write(NULL, &mp, V_WAIT);
			return (error);
		}
		MNT_ILOCK(mp);
		if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
			break;
		MNT_IUNLOCK(mp);
		vn_start_write(NULL, &mp, V_WAIT);
	}
	mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
	wakeup(&mp->mnt_flag);
	MNT_IUNLOCK(mp);
	curthread->td_pflags |= TDP_IGNSUSP;
	return (0);
}

/*
 * Implement kqueues for files by translating them into vnode operations.
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{

	return (VOP_KQFILTER(fp->f_vnode, kn));
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing as "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int *buflen, char *buf, struct thread *td)
{
	struct uio auio;
	struct iovec iov;
	int error;

	iov.iov_len = *buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_SHARED | LK_RETRY);

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute retrieval as kernel */
	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
	    td);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp, 0);

	if (error == 0) {
		*buflen = *buflen - auio.uio_resid;
	}

	return (error);
}
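
/*
 * For example, a filesystem-internal consumer might fetch a
 * system-namespace attribute into a stack buffer (the attribute name
 * here is illustrative only):
 *
 *	buflen = sizeof(buf);
 *	error = vn_extattr_get(vp, IO_NODELOCKED,
 *	    EXTATTR_NAMESPACE_SYSTEM, "example.attr", &buflen, buf, td);
 *
 * With IO_NODELOCKED the vnode lock must already be held.  On success
 * buflen holds the number of bytes actually copied; a missing attribute
 * is typically reported as ENOATTR.
 */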

/*
 * XXX failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int buflen, char *buf, struct thread *td)
{
	struct uio auio;
	struct iovec iov;
	struct mount *mp;
	int error;

	iov.iov_len = buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute setting as kernel */
	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}

	return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct thread *td)
{
	struct mount *mp;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute removal as kernel */
	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}

	return (error);
}

static int
vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
    struct vnode **rvp)
{

	return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
}

int
vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
{

	return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
	    lkflags, rvp));
}

int
vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
    int lkflags, struct vnode **rvp)
{
	struct mount *mp;
	int ltype, error;

	ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
	mp = vp->v_mount;
	ltype = VOP_ISLOCKED(vp);
	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
	    ("vn_vget_ino: vp not locked"));
	error = vfs_busy(mp, MBF_NOWAIT);
	if (error != 0) {
		vfs_ref(mp);
		VOP_UNLOCK(vp, 0);
		error = vfs_busy(mp, 0);
		vn_lock(vp, ltype | LK_RETRY);
		vfs_rel(mp);
		if (error != 0)
			return (ENOENT);
		if (vp->v_iflag & VI_DOOMED) {
			vfs_unbusy(mp);
			return (ENOENT);
		}
	}
	VOP_UNLOCK(vp, 0);
	error = alloc(mp, alloc_arg, lkflags, rvp);
	vfs_unbusy(mp);
	if (*rvp != vp)
		vn_lock(vp, ltype | LK_RETRY);
	if (vp->v_iflag & VI_DOOMED) {
		if (error == 0) {
			if (*rvp == vp)
				vunref(vp);
			else
				vput(*rvp);
		}
		error = ENOENT;
	}
	return (error);
}

int
vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
    const struct thread *td)
{

	if (vp->v_type != VREG || td == NULL)
		return (0);
	PROC_LOCK(td->td_proc);
	if ((uoff_t)uio->uio_offset + uio->uio_resid >
	    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
		kern_psignal(td->td_proc, SIGXFSZ);
SIGXFSZ); 2072 PROC_UNLOCK(td->td_proc); 2073 return (EFBIG); 2074 } 2075 PROC_UNLOCK(td->td_proc); 2076 return (0); 2077} 2078 2079int 2080vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, 2081 struct thread *td) 2082{ 2083 struct vnode *vp; 2084 2085 vp = fp->f_vnode; 2086#ifdef AUDIT 2087 vn_lock(vp, LK_SHARED | LK_RETRY); 2088 AUDIT_ARG_VNODE1(vp); 2089 VOP_UNLOCK(vp, 0); 2090#endif 2091 return (setfmode(td, active_cred, vp, mode)); 2092} 2093 2094int 2095vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 2096 struct thread *td) 2097{ 2098 struct vnode *vp; 2099 2100 vp = fp->f_vnode; 2101#ifdef AUDIT 2102 vn_lock(vp, LK_SHARED | LK_RETRY); 2103 AUDIT_ARG_VNODE1(vp); 2104 VOP_UNLOCK(vp, 0); 2105#endif 2106 return (setfown(td, active_cred, vp, uid, gid)); 2107} 2108 2109void 2110vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) 2111{ 2112 vm_object_t object; 2113 2114 if ((object = vp->v_object) == NULL) 2115 return; 2116 VM_OBJECT_WLOCK(object); 2117 vm_object_page_remove(object, start, end, 0); 2118 VM_OBJECT_WUNLOCK(object); 2119} 2120 2121int 2122vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) 2123{ 2124 struct vattr va; 2125 daddr_t bn, bnp; 2126 uint64_t bsize; 2127 off_t noff; 2128 int error; 2129 2130 KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, 2131 ("Wrong command %lu", cmd)); 2132 2133 if (vn_lock(vp, LK_SHARED) != 0) 2134 return (EBADF); 2135 if (vp->v_type != VREG) { 2136 error = ENOTTY; 2137 goto unlock; 2138 } 2139 error = VOP_GETATTR(vp, &va, cred); 2140 if (error != 0) 2141 goto unlock; 2142 noff = *off; 2143 if (noff >= va.va_size) { 2144 error = ENXIO; 2145 goto unlock; 2146 } 2147 bsize = vp->v_mount->mnt_stat.f_iosize; 2148 for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) { 2149 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); 2150 if (error == EOPNOTSUPP) { 2151 error = ENOTTY; 2152 goto unlock; 2153 } 2154 if ((bnp == -1 && cmd == FIOSEEKHOLE) || 2155 (bnp != -1 && cmd == FIOSEEKDATA)) { 2156 noff = bn * bsize; 2157 if (noff < *off) 2158 noff = *off; 2159 goto unlock; 2160 } 2161 } 2162 if (noff > va.va_size) 2163 noff = va.va_size; 2164 /* noff == va.va_size. There is an implicit hole at the end of file. */ 2165 if (cmd == FIOSEEKDATA) 2166 error = ENXIO; 2167unlock: 2168 VOP_UNLOCK(vp, 0); 2169 if (error == 0) 2170 *off = noff; 2171 return (error); 2172} 2173 2174int 2175vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) 2176{ 2177 struct ucred *cred; 2178 struct vnode *vp; 2179 struct vattr vattr; 2180 off_t foffset, size; 2181 int error, noneg; 2182 2183 cred = td->td_ucred; 2184 vp = fp->f_vnode; 2185 foffset = foffset_lock(fp, 0); 2186 noneg = (vp->v_type != VCHR); 2187 error = 0; 2188 switch (whence) { 2189 case L_INCR: 2190 if (noneg && 2191 (foffset < 0 || 2192 (offset > 0 && foffset > OFF_MAX - offset))) { 2193 error = EOVERFLOW; 2194 break; 2195 } 2196 offset += foffset; 2197 break; 2198 case L_XTND: 2199 vn_lock(vp, LK_SHARED | LK_RETRY); 2200 error = VOP_GETATTR(vp, &vattr, cred); 2201 VOP_UNLOCK(vp, 0); 2202 if (error) 2203 break; 2204 2205 /* 2206 * If the file references a disk device, then fetch 2207 * the media size and use that to determine the ending 2208 * offset. 
2209 */ 2210 if (vattr.va_size == 0 && vp->v_type == VCHR && 2211 fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) 2212 vattr.va_size = size; 2213 if (noneg && 2214 (vattr.va_size > OFF_MAX || 2215 (offset > 0 && vattr.va_size > OFF_MAX - offset))) { 2216 error = EOVERFLOW; 2217 break; 2218 } 2219 offset += vattr.va_size; 2220 break; 2221 case L_SET: 2222 break; 2223 case SEEK_DATA: 2224 error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); 2225 break; 2226 case SEEK_HOLE: 2227 error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); 2228 break; 2229 default: 2230 error = EINVAL; 2231 } 2232 if (error == 0 && noneg && offset < 0) 2233 error = EINVAL; 2234 if (error != 0) 2235 goto drop; 2236 VFS_KNOTE_UNLOCKED(vp, 0); 2237 *(off_t *)(td->td_retval) = offset; 2238drop: 2239 foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); 2240 return (error); 2241} 2242 2243int 2244vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, 2245 struct thread *td) 2246{ 2247 int error; 2248 2249 /* 2250 * Grant permission if the caller is the owner of the file, or 2251 * the super-user, or has ACL_WRITE_ATTRIBUTES permission on 2252 * on the file. If the time pointer is null, then write 2253 * permission on the file is also sufficient. 2254 * 2255 * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: 2256 * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES 2257 * will be allowed to set the times [..] to the current 2258 * server time. 2259 */ 2260 error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); 2261 if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) 2262 error = VOP_ACCESS(vp, VWRITE, cred, td); 2263 return (error); 2264} 2265
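
/*
 * Illustrative sketch (hypothetical, not compiled): a typical consumer of
 * vn_utimes_perm() is a file system's VOP_SETATTR implementation, which
 * checks the caller's permission before honoring a timestamp update.  The
 * examplefs_setattr_times() name and the UPDATE_TIMES() helper below are
 * placeholders for file-system specific code.
 */
#if 0
static int
examplefs_setattr_times(struct vnode *vp, struct vattr *vap,
    struct ucred *cred, struct thread *td)
{
	int error;

	/* Nothing to do unless a timestamp was supplied. */
	if (vap->va_atime.tv_sec == VNOVAL && vap->va_mtime.tv_sec == VNOVAL)
		return (0);
	/*
	 * Owner, super-user or ACL_WRITE_ATTRIBUTES may set explicit times;
	 * plain write permission suffices for utimes(path, NULL), which is
	 * indicated by VA_UTIMES_NULL in va_vaflags.
	 */
	error = vn_utimes_perm(vp, vap, cred, td);
	if (error != 0)
		return (error);
	UPDATE_TIMES(vp, vap);		/* file-system specific update */
	return (0);
}
#endif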