/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Portions Copyright 2007 Jeremy Teo */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/atomic.h>
#include <sys/namei.h>
#include <sys/mman.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_vfsops.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/buf.h>
#include <sys/sched.h>
#include <sys/acl.h>
#include <sys/extattr.h>

#ifdef __NetBSD__
#include <miscfs/genfs/genfs.h>
#endif

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done, avoiding races, using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
 *	can return EIO from the calling function.
 *
 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT().  This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory.  Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 *
 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
 *	This is critical because we don't want to block while holding locks.
 *	Note, in particular, that if a lock is sometimes acquired before
 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
 *	use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again.
 *
 *  (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has replayed.
 *
 *  (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, seq, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */
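
/*
 * A concrete (and purely illustrative) rendering of the skeleton above;
 * zfs_example_op(), zfs_do_stuff() and the single rwlock are hypothetical
 * stand-ins, not functions in this file.  The point is the shape of the
 * non-blocking assign and its retry path:
 *
 *	static int
 *	zfs_example_op(znode_t *zp, krwlock_t *rwl)
 *	{
 *		zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 *		dmu_tx_t *tx;
 *		int error;
 *
 *		ZFS_ENTER(zfsvfs);		// EIO if unmounted
 *		ZFS_VERIFY_ZP(zp);
 *	top:
 *		rw_enter(rwl, RW_WRITER);	// rule (3): locks before assign
 *		tx = dmu_tx_create(zfsvfs->z_os);
 *		dmu_tx_hold_bonus(tx, zp->z_id);
 *		error = dmu_tx_assign(tx, TXG_NOWAIT);	// rule (4)
 *		if (error) {
 *			rw_exit(rwl);		// drop locks before waiting
 *			if (error == ERESTART) {
 *				dmu_tx_wait(tx);
 *				dmu_tx_abort(tx);
 *				goto top;
 *			}
 *			dmu_tx_abort(tx);
 *			ZFS_EXIT(zfsvfs);
 *			return (error);
 *		}
 *		error = zfs_do_stuff(zp, tx);	// hypothetical worker
 *		dmu_tx_commit(tx);		// rule (6): commit either way
 *		rw_exit(rwl);
 *		ZFS_EXIT(zfsvfs);
 *		return (error);
 *	}
 */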

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);

	if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		return (EPERM);
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
	    zp->z_phys->zp_size > 0)
		if (fs_vscan(*vpp, cr, 0) != 0)
			return (EACCES);

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	dprintf("zfs_close called\n");
	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
	    zp->z_phys->zp_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

#ifdef PORT_NETBSD
/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_phys->zp_size;
	if (noff >= file_sz) {
		return (ENXIO);
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* end of file? */
	if ((error == ESRCH) || (noff > file_sz)) {
		/*
		 * Handle the virtual hole at the end of file.
		 */
		if (hole) {
			*off = file_sz;
			return (0);
		}
		return (ENXIO);
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}
#endif /* PORT_NETBSD */
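
/*
 * For illustration only (not part of this file): a userland caller drives
 * the in/out "off" contract of _FIO_SEEK_HOLE / _FIO_SEEK_DATA through
 * ioctl(2).  Sketch, assuming fd is an open file on ZFS:
 *
 *	offset_t off = 0;		// in: where to start looking
 *
 *	if (ioctl(fd, _FIO_SEEK_DATA, &off) == 0)
 *		printf("first data at %lld\n", (long long)off);  // out: result
 *	else if (errno == ENXIO)
 *		printf("no data at or after offset 0\n");
 *
 * ENXIO is also what a hole search reports past EOF, unless the "virtual
 * hole" at end of file applies, in which case off is set to the file size.
 */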

static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	case _FIOGDIO:
	case _FIOSDIO:
		return (0);
#ifdef PORT_NETBSD	/* XXX NetBSD Do we support holes in files ? */
	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (EFAULT);

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (EFAULT);
		return (0);
#endif
	}

	return (ENOTTY);
}

#ifdef PORT_NETBSD
/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
{
	int64_t	off;

	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t nbytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_WRITE);
			(void) dmu_read(os, oid, start+off, nbytes, va+off,
			    DMU_READ_PREFETCH);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		}
		len -= nbytes;
		off = 0;
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_object_t obj;
	vm_page_t m;
	struct sf_buf *sf;
	int64_t start, off;
	caddr_t va;
	int len = nbytes;
	int error = 0;
	uint64_t dirbytes;

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	dirbytes = 0;
	VM_OBJECT_LOCK(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		uint64_t bytes = MIN(PAGESIZE - off, len);

again:
		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
				goto again;
			vm_page_busy(m);
			VM_OBJECT_UNLOCK(obj);
			if (dirbytes > 0) {
				error = dmu_read_uio(os, zp->z_id, uio,
				    dirbytes);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				error = uiomove(va + off, bytes, UIO_READ, uio);
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * The code below is here to make sendfile(2) work
			 * correctly with ZFS.  As pointed out by ups@
			 * sendfile(2) should be changed to use VOP_GETPAGES(),
			 * but it pessimizes performance of sendfile/UFS, that's
			 * why I handle this special case in ZFS code.
			 */
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
				goto again;
			vm_page_busy(m);
			VM_OBJECT_UNLOCK(obj);
			if (dirbytes > 0) {
				error = dmu_read_uio(os, zp->z_id, uio,
				    dirbytes);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				error = dmu_read(os, zp->z_id, start + off,
				    bytes, (void *)(va + off));
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
			if (error == 0)
				uio->uio_resid -= bytes;
		} else {
			dirbytes += bytes;
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	VM_OBJECT_UNLOCK(obj);
	if (error == 0 && dirbytes > 0)
		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
	return (error);
}
#endif /* PORT_NETBSD */
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
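
/*
 * The read loop in zfs_read() below issues at most zfs_read_chunk_size
 * bytes per dmu_read_uio() call, trimming the first chunk so that later
 * chunks start on a chunk-size boundary:
 *
 *	nbytes = MIN(n, zfs_read_chunk_size -
 *	    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 *
 * P2PHASE(off, chunk) is off % chunk for a power-of-2 chunk size.  Worked
 * example with the default 1MB chunk: a 3MB read starting at offset 1.5MB
 * is issued as 512KB (up to the 2MB boundary), then 1MB, 1MB, and 512KB.
 */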

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	ssize_t		n, nbytes;
	int		error;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	dprintf("zfs_read called\n");

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	os = zfsvfs->z_os;

	if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (EACCES);
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC)
		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_phys->zp_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
#ifdef PORT_SOLARIS
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(zp->z_dbuf, blksz),
				    0, blksz);
			}
		}
	}
#endif
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		/* NetBSD: mappedread() is compiled out (see PORT_NETBSD above) */
//		if (vn_has_cached_data(vp))
//			error = mappedread(vp, nbytes, uio);
//		else
		error = dmu_read_uio(os, zp->z_id, uio, nbytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = EIO;
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified.
 * Any error will exit this routine as this is only a best
 * attempt to get the pages resident.  This is a copy of ufs_trans_touch().
 */
static void
zfs_prefault_write(ssize_t n, struct uio *uio)
{
	struct iovec *iov;
	ulong_t cnt, incr;
	caddr_t p;

	if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
		return;

	iov = uio->uio_iov;

	while (n) {
		cnt = MIN(iov->iov_len, n);
		if (cnt == 0) {
			/* empty iov entry */
			iov++;
			continue;
		}
		n -= cnt;
		/*
		 * touch each page in this segment.
		 */
		p = iov->iov_base;
		while (cnt) {
			if (fubyte(p) == -1)
				return;
			incr = MIN(cnt, PAGESIZE);
			p += incr;
			cnt -= incr;
		}
		/*
		 * touch the last byte in case it straddles a page.
		 */
		p--;
		if (fubyte(p) == -1)
			return;
		iov++;
	}
}
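
/*
 * Why the extra fubyte() on the last byte above: the loop probes each
 * segment at PAGESIZE strides from iov_base, so with 4KB pages a segment
 * starting at user address 0x1ff8 with cnt = 0x20 is probed only at
 * 0x1ff8, yet its final byte (0x2017) lies on the following page.
 * Backing up to the last byte faults that page in as well.
 */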

/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- IO_APPEND flag set if in append mode.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = MAXOFFSET_T;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	uint64_t	pflags;
	int		error;
	arc_buf_t	*abuf;
	iovec_t		*aiov;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;

	dprintf("zfs_write called\n");

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If immutable or not appending then return EPERM
	 */
	pflags = zp->z_phys->zp_flags;
	if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_phys->zp_size))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_phys->zp_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	zfs_prefault_write(n, uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & IO_APPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_phys->zp_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_phys->zp_size);

	end_size = MAX(zp->z_phys->zp_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
again:
		if (zfs_usergroup_overquota(zfsvfs,
		    B_FALSE, zp->z_phys->zp_uid) ||
		    zfs_usergroup_overquota(zfsvfs,
		    B_TRUE, zp->z_phys->zp_gid)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = EDQUOT;
			break;
		}

		if (xuio && abuf == NULL) {
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_phys->zp_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		error = dmu_tx_assign(tx, TXG_NOWAIT);
		if (error) {
			if (error == ERESTART) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto again;
			}
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio,
			    nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
			}
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}
#ifdef PORT_SOLARIS
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff,
			    tx_bytes, zfsvfs->z_os, zp->z_id);
		}
#endif
		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    ((zp->z_phys->zp_mode & S_ISUID) != 0 &&
		    zp->z_phys->zp_uid == 0)) != 0) {
			zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
		}
		mutex_exit(&zp->z_acl_lock);

		/*
		 * Update time stamp.  NOTE: This marks the bonus buffer as
		 * dirty, so we don't have to do it again for zp_size.
		 */
		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
			    uio->uio_loffset);
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (FSYNC | FDSYNC))
		zil_commit(zilog, zp->z_last_itx, zp->z_id);

	ZFS_EXIT(zfsvfs);

	return (0);
}
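
/*
 * Partial-write semantics above, by example: if a 2MB write hits EDQUOT
 * after its first 1MB chunk has already committed, uio_resid is below
 * start_resid, so zfs_write() returns 0 and the caller sees a short
 * write of 1MB.  Only when no chunk at all made it (or during ZIL
 * replay) is the error itself returned:
 *
 *	if (zfsvfs->z_replay || uio->uio_resid == start_resid)
 *		return (error);		// nothing written, report failure
 *	...
 *	return (0);			// short write; resid shows progress
 */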

void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (ENOENT);
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (ENOENT);
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_phys->zp_size) {
			error = ENOENT;
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure that no one can
		 * change the data while it's being written out and its
		 * checksum is being calculated.  We need to re-check the
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_phys->zp_size)
			error = ENOENT;
#ifdef DEBUG
		if (zil_fault_io) {
			error = EIO;
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db);

		if (error == 0) {
			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= zp->z_blksz);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}
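
/*
 * Indirect-write locking above, by example: with lr_offset = 0x21000 and
 * a (power-of-2) block size of 0x20000, P2PHASE() gives blkoff = 0x1000,
 * so the range lock covers the whole block [0x20000, 0x40000).  If
 * another thread grew the block while we slept on the lock, z_blksz no
 * longer matches the locked size and the loop re-locks with the new one.
 */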

/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * If vnode is for a device return a specfs vnode instead.
 */
static int
specvp_check(vnode_t **vpp, cred_t *cr)
{
	int error = 0;

	if (IS_DEVVP(*vpp)) {
		struct vnode *svp;

		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (svp == NULL)
			error = ENOSYS;
		*vpp = svp;
	}
	return (error);
}


/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 *	IN:	dvp	- vnode of directory to search.
 *		nm	- name of entry to lookup.
 *		pnp	- full pathname to lookup [UNUSED].
 *		flags	- LOOKUP_XATTR set if looking for an attribute.
 *		rdir	- root directory vnode [UNUSED].
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		direntflags - directory lookup flags
 *		realpnp	- returned pathname.
 *
 *	OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int *direntflags = NULL;
	void *realpnp = NULL;
	int error = 0;

	/* fast path */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (dvp->v_type != VDIR) {
			return (ENOTDIR);
		} else if (zdp->z_dbuf == NULL) {
			return (EIO);
		}

		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*vpp = dvp;
				VN_HOLD(*vpp);
				return (0);
			}
			return (error);
		} else {
			vnode_t *tvp = dnlc_lookup(dvp, nm);

			if (tvp) {
				error = zfs_fastaccesschk_execute(zdp, cr);
				if (error) {
					VN_RELE(tvp);
					return (error);
				}
				if (tvp == DNLC_NO_VNODE) {
					VN_RELE(tvp);
					return (ENOENT);
				} else {
					*vpp = tvp;
					return (specvp_check(vpp, cr));
				}
			}
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;
	dprintf("zfs_lookup called %s\n", nm);
	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}
#endif

		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */
		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (ENOTDIR);
	}

	/*
	 * Check accessibility of directory.
	 */
	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Before tediously performing a linear scan of the directory,
	 * check the name cache to see if the directory/name pair
	 * we are looking for is known already.
	 */
	if ((error = cache_lookup(dvp, vpp, cnp)) >= 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0)
		error = specvp_check(vpp, cr);

	ZFS_EXIT(zfsvfs);

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				error = EJUSTRETURN;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			break;
		}
	}

	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
		int ltype = 0;

		if (cnp->cn_flags & ISDOTDOT) {
			ltype = VOP_ISLOCKED(dvp);
			VOP_UNLOCK(dvp);
		}
		error = vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
		if (cnp->cn_flags & ISDOTDOT)
			vn_lock(dvp, ltype | LK_RETRY);
		if (error != 0) {
			VN_RELE(*vpp);
			*vpp = NULL;
			return (error);
		}
	}

	/*
	 * Insert name into cache if appropriate.
	 */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		return (error);
	}
	switch (error) {
	case 0:
		cache_enter(dvp, *vpp, cnp);
		break;
	case ENOENT:
		if (nameiop != CREATE)
			cache_enter(dvp, *vpp, cnp);
		break;
	default:
		break;
	}
	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 *	IN:	dvp	- vnode of directory to put new file entry in.
 *		name	- name of new file entry.
 *		vap	- attributes of new file.
 *		excl	- flag indicating exclusive or non-exclusive mode.
 *		mode	- mode to open file with.
 *		cr	- credentials of caller.
 *		flag	- large file flag [UNUSED].
 *		ct	- caller context
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created or trunc'd entry.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	void		*vsecp = NULL;
	int		flag = 0;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;

	dprintf("zfs_create called\n");
	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
top:
	*vpp = NULL;

	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (strcmp(name, "..") == 0)
				error = EISDIR;
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
	if (zp == NULL) {
		uint64_t txtype;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */
		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			error = EINVAL;
			goto out;
		}

		if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
		    &acl_ids)) != 0)
			goto out;
		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
			zfs_acl_ids_free(&acl_ids);
			error = EDQUOT;
			goto out;
		}

		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_bonus(tx, dzp->z_id);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, SPA_MAXBLOCKSIZE);
		}
		error = dmu_tx_assign(tx, TXG_NOWAIT);
		if (error) {
			zfs_acl_ids_free(&acl_ids);
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		(void) zfs_link_create(dl, zp, tx, ZNEW);

		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = EEXIST;
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = EISDIR;
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				vnevent_create(ZTOV(zp), NULL);
			}
		}
	}
out:
	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		*vpp = ZTOV(zp);
		error = specvp_check(vpp, cr);
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 *	IN:	dvp	- vnode of directory to remove entry from.
 *		name	- name of entry to remove.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp = NULL;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;

	dprintf("zfs_remove called\n");

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = EPERM;
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	/*
	 * NetBSD: the in-line delete path below is disabled for now, so the
	 * znode always goes through the unlinked set (see the KASSERTs below).
	 */
	may_delete_now = FALSE;

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (may_delete_now) {
		toobig =
		    zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
		/* XXX - do we need this if we are deleting? */
		dmu_tx_hold_bonus(tx, xattr_obj);
	}

	/* are there any additional acls */
	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
	    may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (0 && unlinked) {
		KASSERT(0);	/* NetBSD: must not happen now */
		VI_LOCK(vp);
		delete_now = may_delete_now && !toobig &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    zp->z_phys->zp_xattr == xattr_obj &&
		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
		VI_UNLOCK(vp);
	}

	if (delete_now) {
		KASSERT(0);	/* NetBSD: must not happen now */
		if (zp->z_phys->zp_xattr) {
			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
			ASSERT3U(error, ==, 0);
			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
			dmu_buf_will_dirty(xzp->z_dbuf, tx);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_phys->zp_links = 0;
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);
			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
		}
		mutex_enter(&zp->z_lock);
		VI_LOCK(vp);
		vp->v_count--;
		ASSERT3U(vp->v_count, ==, 0);
		VI_UNLOCK(vp);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
	} else if (unlinked) {
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);

	if (!delete_now) {
		VN_RELE(vp);
	} else if (xzp) {
		/* this rele is delayed to prevent nesting transactions */
		VN_RELE(ZTOV(xzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 *	IN:	dvp	- vnode of directory to add subdir to.
 *		dirname	- name of new directory.
 *		vap	- attributes of new directory.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created directory.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
    caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * First make sure the new directory doesn't exist.
	 */
top:
	*vpp = NULL;

	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
	    &acl_ids)) != 0) {
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (EDQUOT);
	}

	/*
	 * Add a new entry to the directory.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, SPA_MAXBLOCKSIZE);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);
	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 *	IN:	dvp	- vnode of directory to remove from.
 *		name	- name of directory to be removed.
 *		cwd	- vnode of current working directory.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zflg = ZEXISTS;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	if (vp == cwd) {
		error = EINVAL;
		goto out;
	}

	vnevent_rmdir(vp, dvp, name, ct);

	/*
	 * Grab a lock on the directory to make sure that no one is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Purge cache entries, while still holding locks. */
	cache_purge(dvp);
	cache_purge(vp);

	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
out:
	zfs_dirent_unlock(dl);

	VN_RELE(vp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 *	IN:	vp	- vnode of directory to read.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *		eofp	- set to true if end-of-file detected.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap are always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
    int *ncookies, u_long **cookies)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;
	uint8_t		type;
	int		ncooks;
	u_long		*cooks = NULL;
	int		flags = 0;

	dprintf("zfs_readdir called\n");

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		memset(outbuf, 0, bufsize);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 */
		ncooks = uio->uio_resid / _DIRENT_MINSIZE(odp);
//		sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
		cooks = kmem_alloc(ncooks * sizeof(u_long), KM_SLEEP);
		ncooks_alloc = ncooks;

		memset(cooks, 0, ncooks * sizeof(u_long));
		*cookies = cooks;
		*ncookies = ncooks;
	}

	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
#ifdef TODO
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);
#else
	check_sysattrs = 0;
#endif

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_phys->zp_parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = ENXIO;
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);

			if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
#else
				panic("%s:%u: TODO", __func__, __LINE__);
#endif
			}
		}

		if (flags & V_RDDIR_ACCFILTER) {
			/*
			 * If we have no access at all, don't include
			 * this entry in the returned information.
			 */
			znode_t	*ezp;
			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
				goto skip_entry;
			if (!zfs_has_access(ezp, cr)) {
				VN_RELE(ZTOV(ezp));
				goto skip_entry;
			}
			VN_RELE(ZTOV(ezp));
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = _DIRENT_RECLEN(odp, strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
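			 * If not even a single entry fit, the caller's
			 * buffer is too small to make any progress, so
			 * fail with EINVAL rather than returning an
			 * empty result.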
			 */
			if (!outcount) {
				error = EINVAL;
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			odp->d_namlen = strlen(zap.za_name);
			(void) strlcpy(odp->d_name, zap.za_name,
			    odp->d_namlen + 1);
			odp->d_type = type;
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		KASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0);

skip_entry:
		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0);
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace) && uio->uio_iovcnt == 1) {
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	if (error != 0 && cookies != NULL) {
		/*
		 * Free the full allocation; ncooks has been decremented
		 * above, so it no longer matches the allocated size.
		 */
		kmem_free(*cookies, ncooks_alloc * sizeof(u_long));
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}

ulong_t zfs_fsync_sync_cnt = 4;

static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	error = 0;

	dprintf("zfs_fsync called vp %p -- zfsvfs %p\n", vp, zfsvfs);
	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	/*
	 * NetBSD: if the sync is from reclaim or from ioflush,
	 * push dirty atime now.  No need to lock: in the reclaim
	 * case, everything is single threaded and for ioflush this
	 * is a lazy writeback.
	 *
	 * XXXNETBSD: in the ioflush case, we don't want to push anything
	 * to disk immediately.  We just want to queue the update so it
	 * will happen "soon".  Check that this is actually the case,
	 * otherwise zfs will perform poorly.
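	 *
	 * Note that dmu_tx_assign() is called with TXG_WAIT below even
	 * though the Big Rules prefer TXG_NOWAIT: no range or dirent
	 * locks are held at this point, so blocking here cannot cause
	 * the lock-ordering deadlock that the rules guard against.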
	 */
	if (zp->z_atime_dirty && zp->z_unlinked == 0 &&
	    (syncflag & (FSYNC_RECLAIM | FSYNC_LAZY)) != 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_bonus(tx, zp->z_id);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			dmu_buf_will_dirty(zp->z_dbuf, tx);
			mutex_enter(&zp->z_lock);
			zp->z_atime_dirty = 0;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}
	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (0);
}


/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 * IN:	vp	- vnode of file.
 *	vap	- va_mask identifies requested attributes.
 *		  If AT_XVATTR set, then optional attrs are requested
 *	flags	- ATTR_NOACLCHECK (CIFS server context)
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vap	- attribute values.
 *
 * RETURN:	0 (always succeeds)
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_phys_t *pzp;
	int	error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t links;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	dprintf("zfs_getattr called\n");

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	pzp = zp->z_phys;

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
	    (pzp->zp_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */
	mutex_enter(&zp->z_lock);
	vap->va_type = IFTOVT(pzp->zp_mode);
	vap->va_mode = pzp->zp_mode & ~S_IFMT;
	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
	vap->va_nodeid = zp->z_id;
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = pzp->zp_links + 1;
	else
		links = pzp->zp_links;
	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
	vap->va_size = pzp->zp_size;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0];
//	vap->va_fsid = 0;
	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
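	 * (The caller marks each optional attribute it wants with
	 * XVA_SET_REQ(); every attribute actually filled in below is
	 * acknowledged with XVA_SET_RTN() so the caller can tell which
	 * returned values are valid.)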
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((pzp->zp_flags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((pzp->zp_flags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((pzp->zp_flags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((pzp->zp_flags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((pzp->zp_flags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG &&
		    (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			/*
			 * Only VREG files have anti-virus scanstamps, so we
			 * won't conflict with symlinks in the bonus buffer.
			 */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			if (len <= doi.doi_bonus_size) {
				/*
				 * pzp points to the start of the
				 * znode_phys_t. pzp + 1 points to the
				 * first byte after the znode_phys_t.
				 */
				(void) memcpy(xoap->xoa_av_scanstamp,
				    pzp + 1,
				    sizeof (xoap->xoa_av_scanstamp));
				XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
			ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
			XVA_SET_RTN(xvap, XAT_CREATETIME);
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			xoap->xoa_reparse =
			    ((pzp->zp_flags & ZFS_REPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_REPARSE);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);

	mutex_exit(&zp->z_lock);

	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 * IN:	vp	- vnode of file to be modified.
 *	vap	- new attribute values.
 *		  If AT_XVATTR set, then optional attrs are being set
 *	flags	- ATTR_UTIME set if non-default time values provided.
 *		- ATTR_NOACLCHECK (CIFS context only).
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime updated, mtime updated if size changed.
 */
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	znode_phys_t	*pzp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	vattr_t		oldva;
	xvattr_t	tmpxvattr;
	uint_t		mask = vap->va_mask;
	uint_t		saved_mask;
	int		trim_mask = 0;
	uint64_t	new_mode;
	uint64_t	new_uid, new_gid;
	znode_t		*attrzp;
	int		need_policy = FALSE;
	int		err;
	zfs_fuid_info_t *fuidp = NULL;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t	*xoap;
	zfs_acl_t	*aclp = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	boolean_t fuid_dirtied = B_FALSE;

	dprintf("zfs_setattr called\n");

	if (mask == 0)
		return (0);

	if (mask & AT_NOSET)
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	pzp = zp->z_phys;
	zilog = zfsvfs->z_log;

	/*
	 * Make sure that if we have ephemeral uid/gid or xvattr specified
	 * that file system is at proper version level.
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
	    (mask & AT_XVATTR))) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	if (mask & AT_SIZE && vp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (EISDIR);
	}

	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * If this is an xvattr_t, then get a pointer to the structure of
	 * optional attributes.  If this is NULL, then we have a vattr_t.
	 */
	xoap = xva_getxoptattr(xvap);

	xva_init(&tmpxvattr);

	/*
	 * Immutable files can only alter immutable bit and atime.
	 */
	if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	/*
	 * Verify that the timestamps don't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32-bit syscalls can't
	 * handle times past January 2038.  This check should be removed
	 * once large timestamps are fully supported.
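	 * (A signed 32-bit time_t overflows at 0x7fffffff seconds past
	 * the epoch, i.e. on 2038-01-19.)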
	 */
	if (mask & (AT_ATIME | AT_MTIME)) {
		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
			ZFS_EXIT(zfsvfs);
			return (EOVERFLOW);
		}
	}

top:
	attrzp = NULL;

	/* Can this be moved to before the top label? */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (EROFS);
	}

	/*
	 * First validate permissions.
	 */
	if (mask & AT_SIZE) {
		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		/*
		 * XXX - Note, we are not providing any open
		 * mode flags here (like FNDELAY), so we may
		 * block if there are locks present... this
		 * should be addressed in openat().
		 */
		/* XXX - would it be OK to generate a log record here? */
		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	if (mask & (AT_ATIME|AT_MTIME) ||
	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
	    XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
		    skipaclchk, cr);

	if (mask & (AT_UID|AT_GID)) {
		int	idmask = (mask & (AT_UID|AT_GID));
		int	take_owner;
		int	take_group;

		/*
		 * NOTE: even if a new mode is being set,
		 * we may clear S_ISUID/S_ISGID bits.
		 */

		if (!(mask & AT_MODE))
			vap->va_mode = pzp->zp_mode;

		/*
		 * Take ownership or chgrp to group we are a member of.
		 */

		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
		take_group = (mask & AT_GID) &&
		    zfs_groupmember(zfsvfs, vap->va_gid, cr);

		/*
		 * If both AT_UID and AT_GID are set then take_owner and
		 * take_group must both be set in order to allow taking
		 * ownership.
		 *
		 * Otherwise, send the check through
		 * secpolicy_vnode_setattr().
		 */

		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
		    ((idmask == AT_UID) && take_owner) ||
		    ((idmask == AT_GID) && take_group)) {
			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
			    skipaclchk, cr) == 0) {
				/*
				 * Remove setuid/setgid for non-privileged
				 * users.
				 */
				secpolicy_setid_clear(vap, cr);
				trim_mask = (mask & (AT_UID|AT_GID));
			} else {
				need_policy = TRUE;
			}
		} else {
			need_policy = TRUE;
		}
	}

	mutex_enter(&zp->z_lock);
	oldva.va_mode = pzp->zp_mode;
	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
	if (mask & AT_XVATTR) {
		/*
		 * Update xvattr mask to include only those attributes
		 * that are actually changing.
		 *
		 * The bits will be restored prior to actually setting
		 * the attributes so the caller thinks they were set.
		 */
		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			if (xoap->xoa_appendonly !=
			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			if (xoap->xoa_nounlink !=
			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			if (xoap->xoa_immutable !=
			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			if (xoap->xoa_nodump !=
			    ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_NODUMP);
				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			if (xoap->xoa_av_modified !=
			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			if ((vp->v_type != VREG &&
			    xoap->xoa_av_quarantined) ||
			    xoap->xoa_av_quarantined !=
			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			mutex_exit(&zp->z_lock);
			ZFS_EXIT(zfsvfs);
			return (EPERM);
		}

		if (need_policy == FALSE &&
		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
			need_policy = TRUE;
		}
	}

	mutex_exit(&zp->z_lock);

	if (mask & AT_MODE) {
		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
			err = secpolicy_setid_setsticky_clear(vp, vap,
			    &oldva, cr);
			if (err) {
				ZFS_EXIT(zfsvfs);
				return (err);
			}
			trim_mask |= AT_MODE;
		} else {
			need_policy = TRUE;
		}
	}

	if (need_policy) {
		/*
		 * If trim_mask is set then take ownership
		 * has been granted or write_acl is present and user
		 * has the ability to modify mode.  In that case remove
		 * UID|GID and/or MODE from mask so that
		 * secpolicy_vnode_setattr() doesn't revoke it.
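		 * (The trimmed bits are restored from saved_mask once
		 * the policy check passes, so the caller still sees
		 * them applied.)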
		 */

		if (trim_mask) {
			saved_mask = vap->va_mask;
			vap->va_mask &= ~trim_mask;
		}
		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}

		if (trim_mask)
			vap->va_mask |= saved_mask;
	}
	/*
	 * secpolicy_vnode_setattr() or the take-ownership path may have
	 * changed va_mask.
	 */
	mask = vap->va_mask;

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);

	if (mask & AT_MODE) {
		uint64_t pmode = pzp->zp_mode;

		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
			goto out;
		if (pzp->zp_acl.z_acl_extern_obj) {
			/* Are we upgrading ACL from old V0 format to new V1? */
			if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
			    pzp->zp_acl.z_acl_version ==
			    ZFS_ACL_VERSION_INITIAL) {
				dmu_tx_hold_free(tx,
				    pzp->zp_acl.z_acl_extern_obj, 0,
				    DMU_OBJECT_END);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
				    0, aclp->z_acl_bytes);
			} else {
				dmu_tx_hold_write(tx,
				    pzp->zp_acl.z_acl_extern_obj, 0,
				    aclp->z_acl_bytes);
			}
		} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, aclp->z_acl_bytes);
		}
	}

	if (mask & (AT_UID | AT_GID)) {
		if (pzp->zp_xattr) {
			err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
			if (err)
				goto out;
			dmu_tx_hold_bonus(tx, attrzp->z_id);
		}
		if (mask & AT_UID) {
			new_uid = zfs_fuid_create(zfsvfs,
			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
			if (new_uid != pzp->zp_uid &&
			    zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
				err = EDQUOT;
				goto out;
			}
		}

		if (mask & AT_GID) {
			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
			    cr, ZFS_GROUP, &fuidp);
			if (new_gid != pzp->zp_gid &&
			    zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
				err = EDQUOT;
				goto out;
			}
		}
		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied) {
			if (zfsvfs->z_fuid_obj == 0) {
				dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
				    FUID_SIZE_ESTIMATE(zfsvfs));
				dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
				    FALSE, NULL);
			} else {
				dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
				dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
				    FUID_SIZE_ESTIMATE(zfsvfs));
			}
		}
	}

	err = dmu_tx_assign(tx, TXG_NOWAIT);
	if (err) {
		if (err == ERESTART)
			dmu_tx_wait(tx);
		goto out;
	}

	dmu_buf_will_dirty(zp->z_dbuf, tx);

	/*
	 * Set each attribute requested.
	 * We group settings according to the locks they need to acquire.
	 *
	 * Note: you cannot set ctime directly, although it will be
	 * updated as a side-effect of calling this function.
	 */

	mutex_enter(&zp->z_lock);

	if (mask & AT_MODE) {
		mutex_enter(&zp->z_acl_lock);
		zp->z_phys->zp_mode = new_mode;
		err = zfs_aclset_common(zp, aclp, cr, tx);
		ASSERT3U(err, ==, 0);
		zp->z_acl_cached = aclp;
		aclp = NULL;
		mutex_exit(&zp->z_acl_lock);
	}

	if (attrzp)
		mutex_enter(&attrzp->z_lock);

	if (mask & AT_UID) {
		pzp->zp_uid = new_uid;
		if (attrzp)
			attrzp->z_phys->zp_uid = new_uid;
	}

	if (mask & AT_GID) {
		pzp->zp_gid = new_gid;
		if (attrzp)
			attrzp->z_phys->zp_gid = new_gid;
	}

	if (attrzp)
		mutex_exit(&attrzp->z_lock);

	if (mask & AT_ATIME)
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);

	if (mask & AT_MTIME)
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);

	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
	if (mask & AT_SIZE)
		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
	else if (mask != 0)
		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
	/*
	 * Do this after setting timestamps to prevent timestamp
	 * update from toggling bit.
	 */

	if (xoap && (mask & AT_XVATTR)) {

		/*
		 * Restore the trimmed-off mask bits so that return
		 * masks can be set for the caller.
		 */

		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
			XVA_SET_REQ(xvap, XAT_APPENDONLY);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
			XVA_SET_REQ(xvap, XAT_NOUNLINK);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
			XVA_SET_REQ(xvap, XAT_NODUMP);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			ASSERT(vp->v_type == VREG);

			/* Grow the bonus buffer if necessary. */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			if (len > doi.doi_bonus_size)
				VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
		}
		zfs_xvattr_set(zp, xvap);
	}

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	if (mask != 0)
		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

	mutex_exit(&zp->z_lock);

out:
	if (attrzp)
		VN_RELE(ZTOV(attrzp));

	if (aclp)
		zfs_acl_free(aclp);

	if (fuidp) {
		zfs_fuid_info_free(fuidp);
		fuidp = NULL;
	}

	if (err)
		dmu_tx_abort(tx);
	else
		dmu_tx_commit(tx);

	if (err == ERESTART)
		goto top;

	ZFS_EXIT(zfsvfs);
	return (err);
}

typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;

/*
 * Drop locks and release vnodes that were held by zfs_rename_lock().
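 * The list is LIFO (each entry is pushed on the head), so the locks
 * are dropped in the reverse of their acquisition order.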
 */
static void
zfs_rename_unlock(zfs_zlock_t **zlpp)
{
	zfs_zlock_t *zl;

	while ((zl = *zlpp) != NULL) {
		if (zl->zl_znode != NULL)
			VN_RELE(ZTOV(zl->zl_znode));
		rw_exit(zl->zl_rwlock);
		*zlpp = zl->zl_next;
		kmem_free(zl, sizeof (*zl));
	}
}

/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	*oidp = &zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = &zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (*oidp == szp->z_id)		/* We're a descendant of szp */
			return (EINVAL);

		if (*oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
			if (error)
				return (error);
			zl->zl_znode = zp;
		}
		oidp = &zp->z_phys->zp_parent;
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 * IN:	sdvp	- Source directory containing the "old entry".
 *	snm	- Old entry name.
 *	tdvp	- Target directory to contain the "new entry".
 *	tnm	- New entry name.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*
 * XXX NetBSD: There is a significant problem with dirent locking during
 * a rename of files that are in the same dir.  zfs_dirent_lock is then
 * called twice on the same lock, which panics a LOCKDEBUG kernel.
 * Locking twice is not needed.  The proper solution is to add a new
 * flag to zfs_dirent_lock that disables the rw_enter in it.  Renaming
 * files within the same dir is considered broken on LOCKDEBUG kernels
 * on NetBSD for now.
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr;
	int		error = 0;
	int		zflg = 0;
	int		samedir = 0;

	tdl = NULL;
	sdl = NULL;

	dprintf("zfs_rename called\n");

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(sdzp);
	zilog = zfsvfs->z_log;

	/*
	 * Make sure we have the real vp for the target directory.
	 */
	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
		tdvp = realvp;

	if (tdvp->v_vfsp != sdvp->v_vfsp) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}

	tdzp = VTOZ(tdvp);
	ZFS_VERIFY_ZP(tdzp);
	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

top:
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/out of an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		/*
		 * First compare the two name arguments without
		 * considering any case folding.
		 */
		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
		ASSERT(error == 0 || !zfsvfs->z_utf8);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
		/*
		 * If the file system is case-folding, then we may
		 * have some more checking to do.  A case-folding file
		 * system is either supporting mixed case sensitivity
		 * access or is completely case-insensitive.  Note
		 * that the file system is always case preserving.
		 *
		 * In mixed sensitivity mode case sensitive behavior
		 * is the default.  FIGNORECASE must be used to
		 * explicitly request case insensitive behavior.
		 *
		 * If the source and target names provided differ only
		 * by case (e.g., a request to rename 'tim' to 'Tim'),
		 * we will treat this as a special case in the
		 * case-insensitive mode: as long as the source name
		 * is an exact match, we will allow this to proceed as
		 * a name-change request.
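		 * (The ZCIEXACT flag set below enforces this: even on a
		 * case-insensitive lookup, the source name must match
		 * the existing entry byte-for-byte.)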
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
		    flags & FIGNORECASE)) &&
		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
		    &error) == 0) {
			/*
			 * case preserving rename request, require exact
			 * name matches
			 */
			zflg |= ZCIEXACT;
			zflg &= ~ZCILOOK;
		}
	}

	/*
	 * If the source and destination directories are the same, we should
	 * grab the z_name_lock of that directory only once.
	 */
	if (sdzp == tdzp) {
		zflg |= ZHAVELOCK;
		rw_enter(&sdzp->z_name_lock, RW_READER);
	}

	if (cmp < 0) {

		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
		    ZEXISTS | zflg, NULL, NULL);
		if ((serr == 0) && (sdzp == tdzp)) {
			/*
			 * If renaming within the one directory we must
			 * be careful not to recursively acquire locks.
			 */
			zflg |= ZHAVELOCK;
		}
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
	} else {
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, zflg, NULL, NULL);

		if ((terr == 0) && (sdzp == tdzp)) {
			/*
			 * If renaming within the one directory we must
			 * be careful not to recursively acquire locks.
			 */
			zflg |= ZHAVELOCK;
		}
		serr = zfs_dirent_lock(&sdl,
		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
		    NULL, NULL);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		if (strcmp(snm, "..") == 0)
			serr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		if (sdl != NULL)
			zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		if (strcmp(tnm, "..") == 0)
			terr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = ENOTDIR;
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
	if (tzp)
		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	/*
	 * Notify the target directory if it is not the same
	 * as the source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp)
		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
	if (tzp)
		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		if (zl != NULL)
			zfs_rename_unlock(&zl);

		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

	if (error == 0) {
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;

			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			ASSERT(error == 0);

			zfs_log_rename(zilog, tx,
			    TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
			    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);

			/* Update path information for the target vnode */
			vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
		}
		if (error == 0) {
			/* Purge cache entries, while still holding locks. */
			cache_purge(sdvp);
			cache_purge(tdvp);
		}
	}

	dmu_tx_commit(tx);
out:
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	if (sdzp == tdzp)
		rw_exit(&sdzp->z_name_lock);


	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 * IN:	dvp	- Directory to contain new symbolic link.
 *	name	- Name of new symlink entry.
 *	vap	- Attributes of new entry.
 *	link	- Target path of new symlink.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
    cred_t *cr, int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	int		len = strlen(link);
	int		error;
	int		zflg = ZNEW;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (ENAMETOOLONG);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (EDQUOT);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_bonus(tx, dzp->z_id);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	dmu_buf_will_dirty(dzp->z_dbuf, tx);

	/*
	 * Create a new object for the symlink.
	 * Put the link content into bonus buffer if it will fit;
	 * otherwise, store it just like any other file data.
	 */
	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids);
		if (len != 0)
			bcopy(link, zp->z_phys + 1, len);
	} else {
		dmu_buf_t *dbp;

		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);
		/*
		 * Nothing can access the znode yet so no locking needed
		 * for growing the znode's blocksize.
		 */
		zfs_grow_blocksize(zp, len, tx);

		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
		    zp->z_id, 0, FTAG, &dbp));
		dmu_buf_will_dirty(dbp, tx);

		ASSERT3U(len, <=, dbp->db_size);
		bcopy(link, dbp->db_data, len);
		dmu_buf_rele(dbp, FTAG);
	}
	zp->z_phys->zp_size = len;

	/*
	 * Insert the new object into the directory.
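	 * (zfs_link_create() also updates the directory's entry count
	 * and timestamps as a side effect; zfs_log_symlink() below
	 * records the operation so ZIL replay can recreate the link.)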
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);
	if (error == 0) {
		uint64_t txtype = TX_SYMLINK;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
		*vpp = ZTOV(zp);
	}

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 * IN:	vp	- vnode of symbolic link.
 *	uio	- structure supplying buffer for the link path.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	uio	- structure containing the link path.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	size_t		bufsz;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	bufsz = (size_t)zp->z_phys->zp_size;
	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
		error = uiomove(zp->z_phys + 1,
		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
	} else {
		dmu_buf_t *dbp;
		error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		error = uiomove(dbp->db_data,
		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
		dmu_buf_rele(dbp, FTAG);
	}

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 * IN:	tdvp	- Directory to contain new entry.
 *	svp	- vnode of new entry.
 *	name	- name of new entry.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	tdvp - ctime|mtime updated
 *	 svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	vnode_t		*realvp;
	int		error;
	int		zf = ZNEW;
	uid_t		owner;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (VOP_REALVP(svp, &realvp, ct) == 0)
		svp = realvp;

	if (svp->v_vfsp != tdvp->v_vfsp) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}
	szp = VTOZ(svp);
	ZFS_VERIFY_ZP(szp);

	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

top:
	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) &&
	    secpolicy_basic_link(cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, szp->z_id);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dl, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
	}

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*ARGSUSED*/

/* CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); */
/* CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); */

/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	/*
	 * XXX This should be reviewed; maybe the OpenSolaris version of
	 * zfs_fid can be used for NetBSD.
	 */
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	uint32_t	gen;
	uint64_t	object = zp->z_id;
	zfid_short_t	*zfid;
	int		size, i;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	gen = (uint32_t)zp->z_gen;

	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
	fidp->fid_len = size;

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t	*zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Copy the portion of the file indicated from pages into the file.
 * The pages are stored in a page list attached to the file's vnode.
 *
 * IN:	vp	- vnode of file to push page data to.
 *	off	- position in file to put data.
 *	len	- amount of data to write.
 *	flags	- flags to control the operation.
 *	cr	- credentials of caller.
 *	ct	- caller context.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated
 */
/*ARGSUSED*/
#ifdef PORT_SOLARIS
static int
zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	page_t		*pp;
	size_t		io_len;
	u_offset_t	io_off;
	uint_t		blksz;
	rl_t		*rl;
	int		error = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * Align this request to the file block size in case we kluster.
	 * XXX - this can result in pretty aggressive locking, which can
	 * impact simultaneous read/write access.  One option might be
	 * to break up long requests (len == 0) into block-by-block
	 * operations to get narrower locking.
	 */
	blksz = zp->z_blksz;
	if (ISP2(blksz))
		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
	else
		io_off = 0;
	if (len > 0 && ISP2(blksz))
		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
	else
		io_len = 0;

	if (io_len == 0) {
		/*
		 * Search the entire vp list for pages >= io_off.
		 */
		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
		goto out;
	}
	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);

	if (off > zp->z_phys->zp_size) {
		/* past end of file */
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off);

	for (off = io_off; io_off < off + len; io_off += io_len) {
		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
			pp = page_lookup(vp, io_off,
			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
		} else {
			pp = page_lookup_nowait(vp, io_off,
			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
		}

		if (pp != NULL && pvn_getdirty(pp, flags)) {
			int err;

			/*
			 * Found a dirty page to push
			 */
			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
			if (err)
				error = err;
		} else {
			io_len = PAGESIZE;
		}
	}
out:
	zfs_range_unlock(rl);
	if ((flags & B_ASYNC) == 0)
		zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_dbuf == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		if (vn_has_cached_data(vp)) {
			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
			    B_INVAL, cr);
		}

		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		ASSERT(vp->v_count == 1);
		vp->v_count = 0;
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		zfs_znode_free(zp);
		return;
	}

	/*
	 * Attempt to push any data in the page cache.  If this fails
	 * we will get kicked out later in zfs_zinactive().
	 */
	if (vn_has_cached_data(vp)) {
		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
		    cr);
	}

	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_bonus(tx, zp->z_id);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			dmu_buf_will_dirty(zp->z_dbuf, tx);
			mutex_enter(&zp->z_lock);
			zp->z_atime_dirty = 0;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}

	zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
#endif /* PORT_SOLARIS */

/*
 * Bounds-check the seek operation.
 *
 * IN:	vp	- vnode seeking within
 *	ooff	- old file offset
 *	noffp	- pointer to new file offset
 *	ct	- caller context
 *
 * RETURN:	0 if success
 *		EINVAL if new offset invalid
 */
/* ARGSUSED */
static int
zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
    caller_context_t *ct)
{
	if (vp->v_type == VDIR)
		return (0);
	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
}

#ifdef PORT_SOLARIS
/*
 * Pre-filter the generic locking function to trap attempts to place
 * a mandatory lock on a memory mapped file.
 */
static int
zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * We are following the UFS semantics with respect to mapcnt
	 * here: If we see that the file is mapped already, then we will
	 * return an error, but we don't worry about races between this
	 * function and zfs_map().
	 */
	if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) {
		ZFS_EXIT(zfsvfs);
		return (EAGAIN);
	}
	ZFS_EXIT(zfsvfs);
	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
}


/*
 * If we can't find a page in the cache, we will create a new page
 * and fill it with file data.  For efficiency, we may try to fill
 * multiple pages at once (klustering) to fill up the supplied page
 * list.  Note that the pages to be filled are held with an exclusive
 * lock to prevent access by other threads while they are being filled.
 */
static int
zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
{
	znode_t *zp = VTOZ(vp);
	page_t *pp, *cur_pp;
	objset_t *os = zp->z_zfsvfs->z_os;
	u_offset_t io_off, total;
	size_t io_len;
	int err;

	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
		/*
		 * We only have a single page, don't bother klustering
		 */
		io_off = off;
		io_len = PAGESIZE;
		pp = page_create_va(vp, io_off, io_len,
		    PG_EXCL | PG_WAIT, seg, addr);
	} else {
		/*
		 * Try to find enough pages to fill the page list
		 */
		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
		    &io_len, off, plsz, 0);
	}
	if (pp == NULL) {
		/*
		 * The page already exists, nothing to do here.
		 */
		*pl = NULL;
		return (0);
	}

	/*
	 * Fill the pages in the kluster.
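	 * Each page is mapped, filled from the DMU one PAGESIZE read
	 * at a time, and unmapped again; on any error the entire
	 * kluster is discarded.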
4321 */ 4322 cur_pp = pp; 4323 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { 4324 caddr_t va; 4325 4326 ASSERT3U(io_off, ==, cur_pp->p_offset); 4327 va = zfs_map_page(cur_pp, S_WRITE); 4328 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, 4329 DMU_READ_PREFETCH); 4330 zfs_unmap_page(cur_pp, va); 4331 if (err) { 4332 /* On error, toss the entire kluster */ 4333 pvn_read_done(pp, B_ERROR); 4334 /* convert checksum errors into IO errors */ 4335 if (err == ECKSUM) 4336 err = EIO; 4337 return (err); 4338 } 4339 cur_pp = cur_pp->p_next; 4340 } 4341 4342 /* 4343 * Fill in the page list array from the kluster starting 4344 * from the desired offset `off'. 4345 * NOTE: the page list will always be null terminated. 4346 */ 4347 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 4348 ASSERT(pl == NULL || (*pl)->p_offset == off); 4349 4350 return (0); 4351} 4352 4353/* 4354 * Return pointers to the pages for the file region [off, off + len] 4355 * in the pl array. If plsz is greater than len, this function may 4356 * also return page pointers from after the specified region 4357 * (i.e. the region [off, off + plsz]). These additional pages are 4358 * only returned if they are already in the cache, or were created as 4359 * part of a klustered read. 4360 * 4361 * IN: vp - vnode of file to get data from. 4362 * off - position in file to get data from. 4363 * len - amount of data to retrieve. 4364 * plsz - length of provided page list. 4365 * seg - segment to obtain pages for. 4366 * addr - virtual address of fault. 4367 * rw - mode of created pages. 4368 * cr - credentials of caller. 4369 * ct - caller context. 4370 * 4371 * OUT: protp - protection mode of created pages. 4372 * pl - list of pages created. 4373 * 4374 * RETURN: 0 if success 4375 * error code if failure 4376 * 4377 * Timestamps: 4378 * vp - atime updated 4379 */ 4380/* ARGSUSED */ 4381static int 4382zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 4383 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 4384 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 4385{ 4386 znode_t *zp = VTOZ(vp); 4387 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4388 page_t **pl0 = pl; 4389 int err = 0; 4390 4391 /* we do our own caching, faultahead is unnecessary */ 4392 if (pl == NULL) 4393 return (0); 4394 else if (len > plsz) 4395 len = plsz; 4396 else 4397 len = P2ROUNDUP(len, PAGESIZE); 4398 ASSERT(plsz >= len); 4399 4400 ZFS_ENTER(zfsvfs); 4401 ZFS_VERIFY_ZP(zp); 4402 4403 if (protp) 4404 *protp = PROT_ALL; 4405 4406 /* 4407 * Loop through the requested range [off, off + len) looking 4408 * for pages. If we don't find a page, we will need to create 4409 * a new page and fill it with data from the file. 4410 */ 4411 while (len > 0) { 4412 if (*pl = page_lookup(vp, off, SE_SHARED)) 4413 *(pl+1) = NULL; 4414 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) 4415 goto out; 4416 while (*pl) { 4417 ASSERT3U((*pl)->p_offset, ==, off); 4418 off += PAGESIZE; 4419 addr += PAGESIZE; 4420 if (len > 0) { 4421 ASSERT3U(len, >=, PAGESIZE); 4422 len -= PAGESIZE; 4423 } 4424 ASSERT3U(plsz, >=, PAGESIZE); 4425 plsz -= PAGESIZE; 4426 pl++; 4427 } 4428 } 4429 4430 /* 4431 * Fill out the page array with any pages already in the cache. 4432 */ 4433 while (plsz > 0 && 4434 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { 4435 off += PAGESIZE; 4436 plsz -= PAGESIZE; 4437 } 4438out: 4439 if (err) { 4440 /* 4441 * Release any pages we have previously locked. 
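 * (They were either looked up SE_SHARED above or created exclusively
 * locked by zfs_fillpage().)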
4442 */
4443 while (pl > pl0)
4444 page_unlock(*--pl);
4445 } else {
4446 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4447 }
4448
4449 *pl = NULL;
4450
4451 ZFS_EXIT(zfsvfs);
4452 return (err);
4453}
4454
4455/*
4456 * Request a memory map for a section of a file. This code interacts
4457 * with common code and the VM system as follows:
4458 *
4459 * common code calls mmap(), which ends up in smmap_common()
4460 *
4461 * this calls VOP_MAP(), which takes you into (say) zfs
4462 *
4463 * zfs_map() calls as_map(), passing segvn_create() as the callback
4464 *
4465 * segvn_create() creates the new segment and calls VOP_ADDMAP()
4466 *
4467 * zfs_addmap() updates z_mapcnt
4468 */
4469/*ARGSUSED*/
4470static int
4471zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4472 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4473 caller_context_t *ct)
4474{
4475 znode_t *zp = VTOZ(vp);
4476 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4477 segvn_crargs_t vn_a;
4478 int error;
4479
4480 ZFS_ENTER(zfsvfs);
4481 ZFS_VERIFY_ZP(zp);
4482
4483 if ((prot & PROT_WRITE) &&
4484 (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY |
4485 ZFS_APPENDONLY))) {
4486 ZFS_EXIT(zfsvfs);
4487 return (EPERM);
4488 }
4489
4490 if ((prot & (PROT_READ | PROT_EXEC)) &&
4491 (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) {
4492 ZFS_EXIT(zfsvfs);
4493 return (EACCES);
4494 }
4495
4496 if (vp->v_flag & VNOMAP) {
4497 ZFS_EXIT(zfsvfs);
4498 return (ENOSYS);
4499 }
4500
4501 if (off < 0 || len > MAXOFFSET_T - off) {
4502 ZFS_EXIT(zfsvfs);
4503 return (ENXIO);
4504 }
4505
4506 if (vp->v_type != VREG) {
4507 ZFS_EXIT(zfsvfs);
4508 return (ENODEV);
4509 }
4510
4511 /*
4512 * If file is locked, disallow mapping.
4513 */
4514 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) {
4515 ZFS_EXIT(zfsvfs);
4516 return (EAGAIN);
4517 }
4518
4519 as_rangelock(as);
4520 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4521 if (error != 0) {
4522 as_rangeunlock(as);
4523 ZFS_EXIT(zfsvfs);
4524 return (error);
4525 }
4526
4527 vn_a.vp = vp;
4528 vn_a.offset = (u_offset_t)off;
4529 vn_a.type = flags & MAP_TYPE;
4530 vn_a.prot = prot;
4531 vn_a.maxprot = maxprot;
4532 vn_a.cred = cr;
4533 vn_a.amp = NULL;
4534 vn_a.flags = flags & ~MAP_TYPE;
4535 vn_a.szc = 0;
4536 vn_a.lgrp_mem_policy_flags = 0;
4537
4538 error = as_map(as, *addrp, len, segvn_create, &vn_a);
4539
4540 as_rangeunlock(as);
4541 ZFS_EXIT(zfsvfs);
4542 return (error);
4543}
4544
4545/* ARGSUSED */
4546static int
4547zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4548 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4549 caller_context_t *ct)
4550{
4551 uint64_t pages = btopr(len);
4552
4553 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4554 return (0);
4555}
4556
4557/*
4558 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4559 * more accurate mtime for the associated file. Since we don't have a way of
4560 * detecting when the data was actually modified, we have to resort to
4561 * heuristics. If an explicit msync() is done, then we mark the mtime when the
4562 * last page is pushed. The problem occurs when the msync() call is omitted,
4563 * which is by far the most common case:
4564 *
4565 * open()
4566 * mmap()
4567 * <modify memory>
4568 * munmap()
4569 * close()
4570 * <time lapse>
4571 * putpage() via fsflush
4572 *
4573 * If we wait until fsflush comes along, we can have a modification time that
4574 * is some arbitrary point in the future.
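(Consider a file written once through a mapping and then left alone: its
 * recorded mtime would be whenever fsflush happened to run.)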
In order to prevent this in the 4575 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is 4576 * torn down. 4577 */ 4578/* ARGSUSED */ 4579static int 4580zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4581 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 4582 caller_context_t *ct) 4583{ 4584 uint64_t pages = btopr(len); 4585 4586 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); 4587 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); 4588 4589 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && 4590 vn_has_cached_data(vp)) 4591 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); 4592 4593 return (0); 4594} 4595 4596/* 4597 * Free or allocate space in a file. Currently, this function only 4598 * supports the `F_FREESP' command. However, this command is somewhat 4599 * misnamed, as its functionality includes the ability to allocate as 4600 * well as free space. 4601 * 4602 * IN: vp - vnode of file to free data in. 4603 * cmd - action to take (only F_FREESP supported). 4604 * bfp - section of file to free/alloc. 4605 * flag - current file open mode flags. 4606 * offset - current file offset. 4607 * cr - credentials of caller [UNUSED]. 4608 * ct - caller context. 4609 * 4610 * RETURN: 0 if success 4611 * error code if failure 4612 * 4613 * Timestamps: 4614 * vp - ctime|mtime updated 4615 */ 4616/* ARGSUSED */ 4617static int 4618zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, 4619 offset_t offset, cred_t *cr, caller_context_t *ct) 4620{ 4621 znode_t *zp = VTOZ(vp); 4622 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4623 uint64_t off, len; 4624 int error; 4625 4626 ZFS_ENTER(zfsvfs); 4627 ZFS_VERIFY_ZP(zp); 4628 4629 if (cmd != F_FREESP) { 4630 ZFS_EXIT(zfsvfs); 4631 return (EINVAL); 4632 } 4633 4634 if (error = convoff(vp, bfp, 0, offset)) { 4635 ZFS_EXIT(zfsvfs); 4636 return (error); 4637 } 4638 4639 if (bfp->l_len < 0) { 4640 ZFS_EXIT(zfsvfs); 4641 return (EINVAL); 4642 } 4643 4644 off = bfp->l_start; 4645 len = bfp->l_len; /* 0 means from off to end of file */ 4646 4647 error = zfs_freesp(zp, off, len, flag, TRUE); 4648 4649 ZFS_EXIT(zfsvfs); 4650 return (error); 4651} 4652 4653/*ARGSUSED*/ 4654static int 4655zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 4656{ 4657 znode_t *zp = VTOZ(vp); 4658 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4659 uint32_t gen; 4660 uint64_t object = zp->z_id; 4661 zfid_short_t *zfid; 4662 int size, i; 4663 4664 ZFS_ENTER(zfsvfs); 4665 ZFS_VERIFY_ZP(zp); 4666 gen = (uint32_t)zp->z_gen; 4667 4668 size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; 4669 if (fidp->fid_len < size) { 4670 fidp->fid_len = size; 4671 ZFS_EXIT(zfsvfs); 4672 return (ENOSPC); 4673 } 4674 4675 zfid = (zfid_short_t *)fidp; 4676 4677 zfid->zf_len = size; 4678 4679 for (i = 0; i < sizeof (zfid->zf_object); i++) 4680 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4681 4682 /* Must have a non-zero generation number to distinguish from .zfs */ 4683 if (gen == 0) 4684 gen = 1; 4685 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4686 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4687 4688 if (size == LONG_FID_LEN) { 4689 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 4690 zfid_long_t *zlfid; 4691 4692 zlfid = (zfid_long_t *)fidp; 4693 4694 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 4695 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 4696 4697 /* XXX - this should be the generation number for the objset */ 4698 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 4699 zlfid->zf_setgen[i] = 0; 4700 } 4701 4702 ZFS_EXIT(zfsvfs); 4703 return (0); 4704} 4705#endif /* PORT_SOLARIS */ 4706 4707static int 4708zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 4709 caller_context_t *ct) 4710{ 4711 znode_t *zp, *xzp; 4712 zfsvfs_t *zfsvfs; 4713 zfs_dirlock_t *dl; 4714 int error; 4715 4716 switch (cmd) { 4717 case _PC_LINK_MAX: 4718 *valp = INT_MAX; 4719 return (0); 4720 4721 case _PC_FILESIZEBITS: 4722 *valp = 64; 4723 return (0); 4724 4725#if 0 4726 case _PC_XATTR_EXISTS: 4727 zp = VTOZ(vp); 4728 zfsvfs = zp->z_zfsvfs; 4729 ZFS_ENTER(zfsvfs); 4730 ZFS_VERIFY_ZP(zp); 4731 *valp = 0; 4732 error = zfs_dirent_lock(&dl, zp, "", &xzp, 4733 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); 4734 if (error == 0) { 4735 zfs_dirent_unlock(dl); 4736 if (!zfs_dirempty(xzp)) 4737 *valp = 1; 4738 VN_RELE(ZTOV(xzp)); 4739 } else if (error == ENOENT) { 4740 /* 4741 * If there aren't extended attributes, it's the 4742 * same as having zero of them. 
4743 */
4744 error = 0;
4745 }
4746 ZFS_EXIT(zfsvfs);
4747 return (error);
4748#endif
4749 case _PC_SATTR_ENABLED:
4750 case _PC_SATTR_EXISTS:
4751 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4752 (vp->v_type == VREG || vp->v_type == VDIR);
4753 return (0);
4754
4755 case _PC_ACCESS_FILTERING:
4756 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4757 vp->v_type == VDIR;
4758 return (0);
4759
4760 case _PC_ACL_ENABLED:
4761 *valp = _ACL_ACE_ENABLED;
4762 return (0);
4763
4764 case _PC_MIN_HOLE_SIZE:
4765 *valp = (int)SPA_MINBLOCKSIZE;
4766 return (0);
4767
4768 case _PC_TIMESTAMP_RESOLUTION:
4769 /* nanosecond timestamp resolution */
4770 *valp = 1L;
4771 return (0);
4772
4773 default:
4774 return (EOPNOTSUPP);
4775 }
4776}
4777
4778static int
4779zfs_netbsd_open(void *v)
4780{
4781 struct vop_open_args *ap = v;
4782 vnode_t *vp = ap->a_vp;
4783 znode_t *zp = VTOZ(vp);
4784 int error;
4785
4786 error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4787
4788 return (error);
4789}
4790
4791static int
4792zfs_netbsd_close(void *v)
4793{
4794 struct vop_close_args *ap = v;
4795
4796 return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
4797}
4798
4799static int
4800zfs_netbsd_ioctl(void *v)
4801{
4802 struct vop_ioctl_args *ap = v;
4803
4804 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4805 ap->a_fflag, ap->a_cred, NULL, NULL));
4806}
4807
4808
4809static int
4810zfs_netbsd_read(void *v)
4811{
4812 struct vop_read_args *ap = v;
4813
4814 return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
4815}
4816
4817static int
4818zfs_netbsd_write(void *v)
4819{
4820 struct vop_write_args *ap = v;
4821
4822 return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
4823}
4824
4825static int
4826zfs_netbsd_access(void *v)
4827{
4828 struct vop_access_args *ap = v;
4829
4830 /*
4831 * ZFS itself only knows about VREAD, VWRITE and VEXEC; the rest
4832 * we have to handle by calling vaccess().
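 * vaccess() resolves those checks against the mode, uid and gid cached
 * in the znode's phys structure.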
4833 */ 4834 if ((ap->a_mode & ~(VREAD|VWRITE|VEXEC)) != 0) { 4835 vnode_t *vp = ap->a_vp; 4836 znode_t *zp = VTOZ(vp); 4837 znode_phys_t *zphys = zp->z_phys; 4838 4839 return (vaccess(vp->v_type, zphys->zp_mode, zphys->zp_uid, 4840 zphys->zp_gid, ap->a_mode, ap->a_cred)); 4841 } 4842 4843 return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred, NULL)); 4844} 4845 4846static int 4847zfs_netbsd_lookup(void *v) 4848{ 4849 struct vop_lookup_args *ap = v; 4850 struct componentname *cnp = ap->a_cnp; 4851 char nm[NAME_MAX + 1]; 4852 int err; 4853 4854 ASSERT(cnp->cn_namelen < sizeof(nm)); 4855 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); 4856 4857 err = zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, 4858 cnp->cn_cred, 0); 4859 4860 return err; 4861} 4862 4863static int 4864zfs_netbsd_create(void *v) 4865{ 4866 struct vop_create_args *ap = v; 4867 struct componentname *cnp = ap->a_cnp; 4868 vattr_t *vap = ap->a_vap; 4869 int mode; 4870 4871 vattr_init_mask(vap); 4872 mode = vap->va_mode & ALLPERMS; 4873 4874 return (zfs_create(ap->a_dvp, (char *)cnp->cn_nameptr, vap, !EXCL, mode, 4875 ap->a_vpp, cnp->cn_cred)); 4876} 4877 4878static int 4879zfs_netbsd_remove(void *v) 4880{ 4881 struct vop_remove_args *ap = v; 4882 4883 return (zfs_remove(ap->a_dvp, (char *)ap->a_cnp->cn_nameptr, 4884 ap->a_cnp->cn_cred, NULL, 0)); 4885} 4886 4887static int 4888zfs_netbsd_mkdir(void *v) 4889{ 4890 struct vop_mkdir_args *ap = v; 4891 vattr_t *vap = ap->a_vap; 4892 4893 vattr_init_mask(vap); 4894 4895 return (zfs_mkdir(ap->a_dvp, (char *)ap->a_cnp->cn_nameptr, vap, ap->a_vpp, 4896 ap->a_cnp->cn_cred, NULL, 0, NULL)); 4897} 4898 4899static int 4900zfs_netbsd_rmdir(void *v) 4901{ 4902 struct vop_rmdir_args *ap = v; 4903 struct componentname *cnp = ap->a_cnp; 4904 4905 return (zfs_rmdir(ap->a_dvp, (char *)cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0)); 4906} 4907 4908static int 4909zfs_netbsd_readdir(void *v) 4910{ 4911 struct vop_readdir_args *ap = v; 4912 4913 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, 4914 ap->a_ncookies, (u_long **)ap->a_cookies)); 4915} 4916 4917static int 4918zfs_netbsd_fsync(void *v) 4919{ 4920 struct vop_fsync_args *ap = v; 4921 4922 return (zfs_fsync(ap->a_vp, ap->a_flags, ap->a_cred, NULL)); 4923} 4924 4925static int 4926zfs_netbsd_getattr(void *v) 4927{ 4928 struct vop_getattr_args *ap = v; 4929 vattr_t *vap = ap->a_vap; 4930 xvattr_t xvap; 4931 u_long fflags = 0; 4932 int error; 4933 4934 xva_init(&xvap); 4935 xvap.xva_vattr = *vap; 4936 xvap.xva_vattr.va_mask |= AT_XVATTR; 4937 4938 /* Convert chflags into ZFS-type flags. */ 4939 /* XXX: what about SF_SETTABLE?. */ 4940 XVA_SET_REQ(&xvap, XAT_IMMUTABLE); 4941 XVA_SET_REQ(&xvap, XAT_APPENDONLY); 4942 XVA_SET_REQ(&xvap, XAT_NOUNLINK); 4943 XVA_SET_REQ(&xvap, XAT_NODUMP); 4944 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); 4945 if (error != 0) 4946 return (error); 4947 4948 /* Convert ZFS xattr into chflags. 
*/
4949#define FLAG_CHECK(fflag, xflag, xfield) do { \
4950 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
4951 fflags |= (fflag); \
4952} while (0)
4953 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
4954 xvap.xva_xoptattrs.xoa_immutable);
4955 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
4956 xvap.xva_xoptattrs.xoa_appendonly);
4957 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
4958 xvap.xva_xoptattrs.xoa_nounlink);
4959 FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
4960 xvap.xva_xoptattrs.xoa_nodump);
4961#undef FLAG_CHECK
4962 *vap = xvap.xva_vattr;
4963 vap->va_flags = fflags;
4964 return (0);
4965}
4966
4967static int
4968zfs_netbsd_setattr(void *v)
4969{
4970 struct vop_setattr_args *ap = v;
4971 vnode_t *vp = ap->a_vp;
4972 vattr_t *vap = ap->a_vap;
4973 cred_t *cred = ap->a_cred;
4974 xvattr_t xvap;
4975 u_long fflags;
4976 uint64_t zflags;
4977
4978 vattr_init_mask(vap);
4979 vap->va_mask &= ~AT_NOSET;
4980
4981 xva_init(&xvap);
4982 xvap.xva_vattr = *vap;
4983
4984 zflags = VTOZ(vp)->z_phys->zp_flags;
4985
4986 if (vap->va_flags != VNOVAL) {
4987 int error;
4988
4989 fflags = vap->va_flags;
4990 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
4991 return (EOPNOTSUPP);
4992 /*
4993 * Callers may only modify the file flags on objects they
4994 * have VADMIN rights for (approximated with VWRITE here,
 * since NetBSD has no VADMIN access mode).
4995 */
4996 if ((error = VOP_ACCESS(vp, VWRITE, cred)) != 0)
4997 return (error);
4998 /*
4999 * Unprivileged processes are not permitted to unset system
5000 * flags, or modify flags if any system flags are set.
5001 * Privileged non-jail processes may not modify system flags
5002 * if securelevel > 0 and any existing system flags are set.
5003 * Privileged jail processes behave like privileged non-jail
5004 * processes if the security.jail.chflags_allowed sysctl is
5005 * non-zero; otherwise, they behave like unprivileged
5006 * processes.
5007 */
5008 if (kauth_authorize_system(cred, KAUTH_SYSTEM_CHSYSFLAGS, 0,
5009 NULL, NULL, NULL) != 0) {
5010
5011 if (zflags &
5012 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5013 return (EPERM);
5014 }
5015 if (fflags &
5016 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5017 return (EPERM);
5018 }
5019 }
5020
5021#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
5022 if (((fflags & (fflag)) && !(zflags & (zflag))) || \
5023 ((zflags & (zflag)) && !(fflags & (fflag)))) { \
5024 XVA_SET_REQ(&xvap, (xflag)); \
5025 (xfield) = ((fflags & (fflag)) != 0); \
5026 } \
5027} while (0)
5028 /* Convert chflags into ZFS-type flags. */
5029 /* XXX: what about SF_SETTABLE?
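Note that FLAG_CHANGE() requests an attribute update only when the new flag differs from the current znode flag, so a no-op chflags() changes nothing.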
*/
5030 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5031 xvap.xva_xoptattrs.xoa_immutable);
5032 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5033 xvap.xva_xoptattrs.xoa_appendonly);
5034 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5035 xvap.xva_xoptattrs.xoa_nounlink);
5036 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5037 xvap.xva_xoptattrs.xoa_nodump);
5038#undef FLAG_CHANGE
5039 }
5040 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
5041}
5042
5043static int
5044zfs_netbsd_rename(void *v)
5045{
5046 struct vop_rename_args /* {
5047 struct vnode *a_fdvp;
5048 struct vnode *a_fvp;
5049 struct componentname *a_fcnp;
5050 struct vnode *a_tdvp;
5051 struct vnode *a_tvp;
5052 struct componentname *a_tcnp;
5053 } */ *ap = v;
5054 vnode_t *fdvp = ap->a_fdvp;
5055 vnode_t *fvp = ap->a_fvp;
5056 vnode_t *tdvp = ap->a_tdvp;
5057 vnode_t *tvp = ap->a_tvp;
5058 int error;
5059
5060 error = zfs_rename(fdvp, (char *)ap->a_fcnp->cn_nameptr, tdvp,
5061 (char *)ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
5062
5063 if (tdvp == tvp)
5064 VN_RELE(tdvp);
5065 else
5066 VN_URELE(tdvp);
5067 if (tvp)
5068 VN_URELE(tvp);
5069 VN_RELE(fdvp);
5070 VN_RELE(fvp);
5071
5072 return (error);
5073}
5074
5075static int
5076zfs_netbsd_symlink(void *v)
5077{
5078 struct vop_symlink_args *ap = v;
5079 struct componentname *cnp = ap->a_cnp;
5080 vattr_t *vap = ap->a_vap;
5081
5082 vap->va_type = VLNK; /* NetBSD: the syscall only sets va_mode. */
5083 vattr_init_mask(vap);
5084
5085 return (zfs_symlink(ap->a_dvp, ap->a_vpp, (char *)cnp->cn_nameptr, vap,
5086 ap->a_target, cnp->cn_cred, 0));
5087}
5088
5089#ifdef PORT_SOLARIS
5090/*
5091 * Tunables; both must be a power of 2.
5092 *
5093 * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
5094 * zcr_blksz_max: if set to less than the file block size, allow loaning out of
5095 * an arcbuf for a partial block read
5096 */
5097int zcr_blksz_min = (1 << 10); /* 1K */
5098int zcr_blksz_max = (1 << 17); /* 128K */
5099
5100/*ARGSUSED*/
5101static int
5102zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5103 caller_context_t *ct)
5104{
5105 znode_t *zp = VTOZ(vp);
5106 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5107 int max_blksz = zfsvfs->z_max_blksz;
5108 uio_t *uio = &xuio->xu_uio;
5109 ssize_t size = uio->uio_resid;
5110 offset_t offset = uio->uio_loffset;
5111 int blksz;
5112 int fullblk, i;
5113 arc_buf_t *abuf;
5114 ssize_t maxsize;
5115 int preamble, postamble;
5116
5117 if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5118 return (EINVAL);
5119
5120 ZFS_ENTER(zfsvfs);
5121 ZFS_VERIFY_ZP(zp);
5122 switch (ioflag) {
5123 case UIO_WRITE:
5124 /*
5125 * Loan out an arc_buf for write if write size is bigger than
5126 * max_blksz, and the file's block size is also max_blksz.
5127 */
5128 blksz = max_blksz;
5129 if (size < blksz || zp->z_blksz != blksz) {
5130 ZFS_EXIT(zfsvfs);
5131 return (EINVAL);
5132 }
5133 /*
5134 * Caller requests buffers for write before knowing where the
5135 * write offset might be (e.g. NFS TCP write).
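 * Once the offset is known, a request decomposes into an unaligned
 * preamble, whole blocks, and a postamble; e.g. with blksz 128K,
 * offset 192K and size 288K we loan three arc_bufs: a 64K preamble,
 * one full 128K block, and a 96K postamble.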
5136 */ 5137 if (offset == -1) { 5138 preamble = 0; 5139 } else { 5140 preamble = P2PHASE(offset, blksz); 5141 if (preamble) { 5142 preamble = blksz - preamble; 5143 size -= preamble; 5144 } 5145 } 5146 5147 postamble = P2PHASE(size, blksz); 5148 size -= postamble; 5149 5150 fullblk = size / blksz; 5151 (void) dmu_xuio_init(xuio, 5152 (preamble != 0) + fullblk + (postamble != 0)); 5153 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble, 5154 int, postamble, int, 5155 (preamble != 0) + fullblk + (postamble != 0)); 5156 5157 /* 5158 * Have to fix iov base/len for partial buffers. They 5159 * currently represent full arc_buf's. 5160 */ 5161 if (preamble) { 5162 /* data begins in the middle of the arc_buf */ 5163 abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); 5164 ASSERT(abuf); 5165 (void) dmu_xuio_add(xuio, abuf, 5166 blksz - preamble, preamble); 5167 } 5168 5169 for (i = 0; i < fullblk; i++) { 5170 abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); 5171 ASSERT(abuf); 5172 (void) dmu_xuio_add(xuio, abuf, 0, blksz); 5173 } 5174 5175 if (postamble) { 5176 /* data ends in the middle of the arc_buf */ 5177 abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); 5178 ASSERT(abuf); 5179 (void) dmu_xuio_add(xuio, abuf, 0, postamble); 5180 } 5181 break; 5182 case UIO_READ: 5183 /* 5184 * Loan out an arc_buf for read if the read size is larger than 5185 * the current file block size. Block alignment is not 5186 * considered. Partial arc_buf will be loaned out for read. 5187 */ 5188 blksz = zp->z_blksz; 5189 if (blksz < zcr_blksz_min) 5190 blksz = zcr_blksz_min; 5191 if (blksz > zcr_blksz_max) 5192 blksz = zcr_blksz_max; 5193 /* avoid potential complexity of dealing with it */ 5194 if (blksz > max_blksz) { 5195 ZFS_EXIT(zfsvfs); 5196 return (EINVAL); 5197 } 5198 5199 maxsize = zp->z_phys->zp_size - uio->uio_loffset; 5200 if (size > maxsize) 5201 size = maxsize; 5202 5203 if (size < blksz || vn_has_cached_data(vp)) { 5204 ZFS_EXIT(zfsvfs); 5205 return (EINVAL); 5206 } 5207 break; 5208 default: 5209 ZFS_EXIT(zfsvfs); 5210 return (EINVAL); 5211 } 5212 5213 uio->uio_extflg = UIO_XUIO; 5214 XUIO_XUZC_RW(xuio) = ioflag; 5215 ZFS_EXIT(zfsvfs); 5216 return (0); 5217} 5218 5219/*ARGSUSED*/ 5220static int 5221zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct) 5222{ 5223 int i; 5224 arc_buf_t *abuf; 5225 int ioflag = XUIO_XUZC_RW(xuio); 5226 5227 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); 5228 5229 i = dmu_xuio_cnt(xuio); 5230 while (i-- > 0) { 5231 abuf = dmu_xuio_arcbuf(xuio, i); 5232 /* 5233 * if abuf == NULL, it must be a write buffer 5234 * that has been returned in zfs_write(). 5235 */ 5236 if (abuf) 5237 dmu_return_arcbuf(abuf); 5238 ASSERT(abuf || ioflag == UIO_WRITE); 5239 } 5240 5241 dmu_xuio_fini(xuio); 5242 return (0); 5243} 5244 5245/* 5246 * Predeclare these here so that the compiler assumes that 5247 * this is an "old style" function declaration that does 5248 * not include arguments => we won't get type mismatch errors 5249 * in the initializations that follow. 
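 * (An empty parameter list in a declaration leaves the argument types
 * unspecified in C89, so storing these functions in vnodeop slots does
 * not provoke prototype-mismatch errors.)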
5250 */ 5251static int zfs_inval(); 5252static int zfs_isdir(); 5253#endif 5254 5255static int 5256zfs_netbsd_readlink(void *v) 5257{ 5258 struct vop_readlink_args *ap = v; 5259 5260 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); 5261} 5262 5263static int 5264zfs_netbsd_link(void *v) 5265{ 5266 struct vop_link_args *ap = v; 5267 struct componentname *cnp = ap->a_cnp; 5268 5269 return (zfs_link(ap->a_dvp, ap->a_vp, (char *)cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); 5270} 5271 5272static int 5273zfs_netbsd_inactive(void *v) 5274{ 5275 struct vop_inactive_args *ap = v; 5276 vnode_t *vp = ap->a_vp; 5277 znode_t *zp = VTOZ(vp); 5278 5279 /* 5280 * NetBSD: nothing to do here, other than indicate if the 5281 * vnode should be reclaimed. No need to lock, if we race 5282 * vrele() will call us again. 5283 */ 5284 *ap->a_recycle = (zp->z_unlinked != 0); 5285 VOP_UNLOCK(vp); 5286 return (0); 5287} 5288 5289/* 5290 * Destroy znode from taskq thread without ZFS_OBJ_MUTEX held. 5291 */ 5292static void 5293zfs_reclaim_deferred(void *arg) 5294{ 5295 znode_t *zp = arg; 5296 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5297 uint64_t z_id = zp->z_id; 5298 5299 /* 5300 * Don't allow a zfs_zget() while were trying to release this znode 5301 */ 5302 ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); 5303 5304 /* Don't need to call ZFS_OBJ_HOLD_EXIT zfs_inactive did thatfor us. */ 5305 zfs_zinactive(zp); 5306 5307} 5308 5309static int 5310zfs_netbsd_reclaim(void *v) 5311{ 5312 struct vop_reclaim_args *ap = v; 5313 vnode_t *vp = ap->a_vp; 5314 znode_t *zp = VTOZ(vp); 5315 zfsvfs_t *zfsvfs; 5316 int locked; 5317 5318 locked = 0; 5319 5320 ASSERT(zp != NULL); 5321 KASSERT(!vn_has_cached_data(vp)); 5322 5323 zfsvfs = zp->z_zfsvfs; 5324 5325 mutex_enter(&zp->z_lock); 5326 ASSERT(zp->z_phys); 5327 5328// dprintf("destroying znode %p -- vnode %p -- zp->z_buf = %p\n", zp, ZTOV(zp), zp->z_dbuf); 5329// rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 5330 genfs_node_destroy(vp); 5331 cache_purge(vp); 5332 5333 if (zp->z_dbuf == NULL) { 5334 /* 5335 * The fs has been unmounted, or we did a 5336 * suspend/resume and this file no longer exists. 5337 */ 5338 rw_exit(&zfsvfs->z_teardown_inactive_lock); 5339 mutex_exit(&zp->z_lock); 5340 zfs_znode_free(zp); 5341 return (0); 5342 } 5343 mutex_exit(&zp->z_lock); 5344 5345 mutex_enter(&zp->z_lock); 5346 if (!zp->z_unlinked) { 5347 /* 5348 * XXX Hack because ZFS_OBJ_MUTEX is held we can't call zfs_zinactive 5349 * now. I need to defer zfs_zinactive to another thread which doesn't hold this mutex. 5350 */ 5351 locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 : 5352 ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id); 5353 if (locked == 0) { 5354 /* 5355 * Lock can't be obtained due to deadlock possibility, 5356 * so defer znode destruction. 5357 */ 5358 taskq_dispatch(system_taskq, zfs_reclaim_deferred, zp, 0); 5359 } else { 5360 zfs_znode_dmu_fini(zp); 5361 /* Our LWP is holding ZFS_OBJ_HELD mutex but it was locked before 5362 zfs_zinactive was called therefore we can't release it. */ 5363 if (locked == 1) 5364 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); 5365 zfs_znode_free(zp); 5366 } 5367 } else 5368 mutex_exit(&zp->z_lock); 5369 5370 ZTOV(zp) = NULL; 5371 vp->v_data = NULL; /* v_data must be NULL for a cleaned vnode. 
*/ 5372 5373 return (0); 5374} 5375 5376static int 5377zfs_netbsd_fid(void *v) 5378{ 5379 struct vop_fid_args *ap = v; 5380 5381 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); 5382} 5383 5384static int 5385zfs_netbsd_pathconf(void *v) 5386{ 5387 struct vop_pathconf_args *ap = v; 5388 ulong_t val; 5389 int error; 5390 5391 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->l_cred, NULL); 5392 if (error == 0) 5393 *ap->a_retval = val; 5394 else if (error == EOPNOTSUPP) { 5395 switch (ap->a_name) { 5396 case _PC_NAME_MAX: 5397 *ap->a_retval = NAME_MAX; 5398 return (0); 5399 case _PC_PATH_MAX: 5400 *ap->a_retval = PATH_MAX; 5401 return (0); 5402 case _PC_LINK_MAX: 5403 *ap->a_retval = LINK_MAX; 5404 return (0); 5405 case _PC_MAX_CANON: 5406 *ap->a_retval = MAX_CANON; 5407 return (0); 5408 case _PC_MAX_INPUT: 5409 *ap->a_retval = MAX_INPUT; 5410 return (0); 5411 case _PC_PIPE_BUF: 5412 *ap->a_retval = PIPE_BUF; 5413 return (0); 5414 case _PC_CHOWN_RESTRICTED: 5415 *ap->a_retval = 1; 5416 return (0); 5417 case _PC_VDISABLE: 5418 *ap->a_retval = _POSIX_VDISABLE; 5419 return (0); 5420 default: 5421 return (EINVAL); 5422 } 5423 /* NOTREACHED */ 5424 } 5425 return (error); 5426} 5427 5428int 5429zfs_netbsd_lock(void *v) 5430{ 5431 struct vop_lock_args *ap = v; 5432 5433 return 0; 5434} 5435 5436int 5437zfs_netbsd_unlock(void *v) 5438{ 5439 5440 return 0; 5441} 5442/* 5443int 5444zfs_netbsd_getpages(void *v) 5445{ 5446 struct vnode *vp = ((struct vop_getpages_args *)v)->a_vp; 5447 voff_t offset = ((struct vop_getpages_args *)v)->a_offset; 5448 struct vm_page **m = ((struct vop_getpages_args *)v)->a_m; 5449 int *count = ((struct vop_getpages_args *)v)->a_count; 5450 int centeridx = ((struct vop_getpages_args *)v)->a_centeridx; 5451 vm_prot_t access_type = ((struct vop_getpages_args *)v)->a_access_type; 5452 int advice = ((struct vop_getpages_args *)v)->a_advice; 5453 int flags = ((struct vop_getpages_args *)v)->a_flags; 5454 5455 int error; 5456 5457 error = 0; 5458 5459 KASSERT(!vn_has_cached_data(vp)); 5460 mutex_exit(&vp->v_interlock); 5461 5462 return error; 5463} 5464*/ 5465 5466int 5467zfs_netbsd_putpages(void *v) 5468{ 5469 struct vnode *vp = ((struct vop_putpages_args *)v)->a_vp; 5470 voff_t offlo = ((struct vop_putpages_args *)v)->a_offlo; 5471 voff_t offhi = ((struct vop_putpages_args *)v)->a_offhi; 5472 int flags = ((struct vop_putpages_args *)v)->a_flags; 5473 znode_t *zp = VTOZ(vp); 5474 5475 int error; 5476 5477 dprintf("putpages entry %p -- zfsvfs %p\n", vp, zp->z_zfsvfs); 5478 error = genfs_putpages(v); 5479 dprintf("putpages exit %p -- zfsvfs %p\n", vp, zp->z_zfsvfs); 5480 5481 return error; 5482} 5483 5484#define zfs_netbsd_seek genfs_seek 5485#define zfs_netbsd_mmap genfs_mmap 5486#define zfs_netbsd_getpages genfs_compat_getpages 5487//#define zfs_netbsd_putpages genfs_putpages 5488#define zfs_netbsd_islocked genfs_islocked 5489 5490int (**zfs_vnodeop_p)(void *); 5491const struct vnodeopv_entry_desc zfs_vnodeop_entries[] = { 5492 { &vop_default_desc, vn_default_error }, 5493 { &vop_lookup_desc, zfs_netbsd_lookup }, 5494 { &vop_create_desc, zfs_netbsd_create }, 5495 { &vop_open_desc, zfs_netbsd_open }, 5496 { &vop_close_desc, zfs_netbsd_close }, 5497 { &vop_access_desc, zfs_netbsd_access }, 5498 { &vop_getattr_desc, zfs_netbsd_getattr }, 5499 { &vop_setattr_desc, zfs_netbsd_setattr }, 5500 { &vop_read_desc, zfs_netbsd_read }, 5501 { &vop_write_desc, zfs_netbsd_write }, 5502 { &vop_ioctl_desc, zfs_netbsd_ioctl }, 5503 { &vop_fsync_desc, zfs_netbsd_fsync }, 5504 { 
&vop_remove_desc, zfs_netbsd_remove }, 5505 { &vop_link_desc, zfs_netbsd_link }, 5506 { &vop_lock_desc, zfs_netbsd_lock }, 5507 { &vop_unlock_desc, zfs_netbsd_unlock }, 5508 { &vop_rename_desc, zfs_netbsd_rename }, 5509 { &vop_mkdir_desc, zfs_netbsd_mkdir }, 5510 { &vop_rmdir_desc, zfs_netbsd_rmdir }, 5511 { &vop_symlink_desc, zfs_netbsd_symlink }, 5512 { &vop_readdir_desc, zfs_netbsd_readdir }, 5513 { &vop_readlink_desc, zfs_netbsd_readlink }, 5514 { &vop_inactive_desc, zfs_netbsd_inactive }, 5515 { &vop_reclaim_desc, zfs_netbsd_reclaim }, 5516 { &vop_pathconf_desc, zfs_netbsd_pathconf }, 5517 { &vop_seek_desc, zfs_netbsd_seek }, 5518 { &vop_getpages_desc, zfs_netbsd_getpages }, 5519 { &vop_putpages_desc, zfs_netbsd_putpages }, 5520 { &vop_mmap_desc, zfs_netbsd_mmap }, 5521 { &vop_islocked_desc, zfs_netbsd_islocked }, 5522#ifdef notyet 5523 { &vop_advlock_desc, zfs_netbsd_advlock }, 5524 { &vop_fcntl_desc, zfs_netbsd_fcntl }, 5525 { &vop_bmap_desc, zfs_netbsd_bmap }, 5526 { &vop_strategy_desc, zfs_netbsd_strategy }, 5527 { &vop_print_desc, zfs_netbsd_print }, 5528 { &vop_bwrite_desc, zfs_netbsd_bwrite }, 5529#endif 5530 { NULL, NULL } 5531}; 5532 5533const struct vnodeopv_desc zfs_vnodeop_opv_desc = 5534 { &zfs_vnodeop_p, zfs_vnodeop_entries }; 5535
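/*
 * For reference, a sketch of how a table like this is typically hooked
 * up on NetBSD; the array name is illustrative and the real hookup
 * lives in the port's vfsops code:
 *
 *	static const struct vnodeopv_desc * const zfs_vnodeopv_descs[] = {
 *		&zfs_vnodeop_opv_desc,
 *		NULL,
 *	};
 *
 * The filesystem's struct vfsops points vfs_opv_descs at such an array,
 * and vfs_attach() calls vfs_opv_init() to allocate and populate
 * zfs_vnodeop_p from the zfs_vnodeop_entries above.
 */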