1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 * Copyright 2017 Nexenta Systems, Inc. 
27 */ 28 29/* Portions Copyright 2007 Jeremy Teo */ 30/* Portions Copyright 2010 Robert Milkowski */ 31 32#include <sys/types.h> 33#include <sys/param.h> 34#include <sys/time.h> 35#include <sys/systm.h> 36#include <sys/sysmacros.h> 37#include <sys/resource.h> 38#include <sys/vfs.h> 39#include <sys/vm.h> 40#include <sys/vnode.h> 41#include <sys/file.h> 42#include <sys/stat.h> 43#include <sys/kmem.h> 44#include <sys/taskq.h> 45#include <sys/uio.h> 46#include <sys/atomic.h> 47#include <sys/namei.h> 48#include <sys/mman.h> 49#include <sys/cmn_err.h> 50#include <sys/errno.h> 51#include <sys/unistd.h> 52#include <sys/zfs_dir.h> 53#include <sys/zfs_ioctl.h> 54#include <sys/fs/zfs.h> 55#include <sys/dmu.h> 56#include <sys/dmu_objset.h> 57#include <sys/spa.h> 58#include <sys/txg.h> 59#include <sys/dbuf.h> 60#include <sys/zap.h> 61#include <sys/sa.h> 62#include <sys/dirent.h> 63#include <sys/policy.h> 64#include <sys/sunddi.h> 65#include <sys/filio.h> 66#include <sys/sid.h> 67#include <sys/zfs_ctldir.h> 68#include <sys/zfs_fuid.h> 69#include <sys/zfs_sa.h> 70#include <sys/zfs_rlock.h> 71#include <sys/extdirent.h> 72#include <sys/kidmap.h> 73#include <sys/bio.h> 74#include <sys/buf.h> 75#include <sys/sched.h> 76#include <sys/acl.h> 77#include <sys/vmmeter.h> 78#include <vm/vm_param.h> 79#include <sys/zil.h> 80 81/* 82 * Programming rules. 83 * 84 * Each vnode op performs some logical unit of work. To do this, the ZPL must 85 * properly lock its in-core state, create a DMU transaction, do the work, 86 * record this work in the intent log (ZIL), commit the DMU transaction, 87 * and wait for the intent log to commit if it is a synchronous operation. 88 * Moreover, the vnode ops must work in both normal and log replay context. 89 * The ordering of events is important to avoid deadlocks and references 90 * to freed memory. The example below illustrates the following Big Rules: 91 * 92 * (1) A check must be made in each zfs thread for a mounted file system. 
93 * This is done avoiding races using ZFS_ENTER(zfsvfs). 94 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 95 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 96 * can return EIO from the calling function. 97 * 98 * (2) VN_RELE() should always be the last thing except for zil_commit() 99 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 100 * First, if it's the last reference, the vnode/znode 101 * can be freed, so the zp may point to freed memory. Second, the last 102 * reference will call zfs_zinactive(), which may induce a lot of work -- 103 * pushing cached pages (which acquires range locks) and syncing out 104 * cached atime changes. Third, zfs_zinactive() may require a new tx, 105 * which could deadlock the system if you were already holding one. 106 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 107 * 108 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 109 * as they can span dmu_tx_assign() calls. 110 * 111 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 112 * dmu_tx_assign(). This is critical because we don't want to block 113 * while holding locks. 114 * 115 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This 116 * reduces lock contention and CPU usage when we must wait (note that if 117 * throughput is constrained by the storage, nearly every transaction 118 * must wait). 119 * 120 * Note, in particular, that if a lock is sometimes acquired before 121 * the tx assigns, and sometimes after (e.g. z_lock), then failing 122 * to use a non-blocking assign can deadlock the system. The scenario: 123 * 124 * Thread A has grabbed a lock before calling dmu_tx_assign(). 125 * Thread B is in an already-assigned tx, and blocks for this lock. 126 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 127 * forever, because the previous txg can't quiesce until B's tx commits. 
128 * 129 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 130 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 131 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, 132 * to indicate that this operation has already called dmu_tx_wait(). 133 * This will ensure that we don't retry forever, waiting a short bit 134 * each time. 135 * 136 * (5) If the operation succeeded, generate the intent log entry for it 137 * before dropping locks. This ensures that the ordering of events 138 * in the intent log matches the order in which they actually occurred. 139 * During ZIL replay the zfs_log_* functions will update the sequence 140 * number to indicate the zil transaction has replayed. 141 * 142 * (6) At the end of each vnode op, the DMU tx must always commit, 143 * regardless of whether there were any errors. 144 * 145 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 146 * to ensure that synchronous semantics are provided when necessary. 147 * 148 * In general, this is how things should be ordered in each vnode op: 149 * 150 * ZFS_ENTER(zfsvfs); // exit if unmounted 151 * top: 152 * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) 153 * rw_enter(...); // grab any other locks you need 154 * tx = dmu_tx_create(...); // get DMU tx 155 * dmu_tx_hold_*(); // hold each object you might modify 156 * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/*
 * Open a file.  Rejects non-append writes to append-only files, runs the
 * anti-virus scanner on eligible regular files, and counts synchronous
 * (FSYNC/FDSYNC) opens in the znode.
 */
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* An append-only file may only be opened for appending writes. */
	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Virus scan on open: only regular, non-empty files outside the
	 * .zfs control directory that are not already quarantined.
	 */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Close a file.  Releases file/share locks held by the calling process,
 * decrements the synchronous-open count, and optionally re-scans the
 * file for viruses on close.
 */
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 * NOTE(review): done before ZFS_ENTER — presumably so lock state is
	 * cleaned up even when the filesystem check would fail; confirm.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	/* Same eligibility rules as the open-time scan in zfs_open(). */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off;	/* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	/* Seeking at or past EOF always fails with ENXIO. */
	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* ESRCH from the DMU means "no such hole/data" -> ENXIO to callers. */
	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	/* Never move the offset backwards. */
	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/*
 * Vnode ioctl handler: hole/data seeking, legacy bfu no-ops, and (on
 * illumos) the filled-block-count query.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		/* On FreeBSD the ioctl argument is already in kernel space. */
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}

/*
 * Look up the resident page at "start" in the vnode's VM object and
 * shared-busy it for a write-back update; clears the dirty bits for the
 * DEV_BSIZE-aligned subrange about to be overwritten.  Returns NULL if
 * the page is absent or not fully valid.  Caller holds the object
 * write-locked; this may drop and re-acquire it while sleeping on a
 * busy page.
 */
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	 * aligned boundaries, if the range is not aligned.  As a result a
	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
	 * the whole page would be considered clean despite having some dirty
	 * data.  For this reason we should shrink the range to DEV_BSIZE
	 * aligned boundaries before calling vm_page_clear_dirty.
	 */
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp != NULL) {
			/* Partially valid pages are treated as absent. */
			ASSERT(!pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_object_pip_add(obj, 1);
			pmap_remove_write(pp);
			/* nbytes can be 0 after shrinking to DEV_BSIZE. */
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}

/* Undo page_busy(): drop the shared busy and the paging-in-progress count. */
static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}

/*
 * Hold (wire for access) the fully-valid resident page at "start", or
 * return NULL if it is absent or invalid.  Caller holds the object
 * write-locked; this may drop and re-acquire it while sleeping on a
 * busy page.
 */
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}

/* Undo page_hold(). */
static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
510 */ 511static void 512update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, 513 int segflg, dmu_tx_t *tx) 514{ 515 vm_object_t obj; 516 struct sf_buf *sf; 517 caddr_t va; 518 int off; 519 520 ASSERT(segflg != UIO_NOCOPY); 521 ASSERT(vp->v_mount != NULL); 522 obj = vp->v_object; 523 ASSERT(obj != NULL); 524 525 off = start & PAGEOFFSET; 526 zfs_vmobject_wlock(obj); 527 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 528 vm_page_t pp; 529 int nbytes = imin(PAGESIZE - off, len); 530 531 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { 532 zfs_vmobject_wunlock(obj); 533 534 va = zfs_map_page(pp, &sf); 535 (void) dmu_read(os, oid, start+off, nbytes, 536 va+off, DMU_READ_PREFETCH);; 537 zfs_unmap_page(sf); 538 539 zfs_vmobject_wlock(obj); 540 page_unbusy(pp); 541 } 542 len -= nbytes; 543 off = 0; 544 } 545 vm_object_pip_wakeupn(obj, 0); 546 zfs_vmobject_wunlock(obj); 547} 548 549/* 550 * Read with UIO_NOCOPY flag means that sendfile(2) requests 551 * ZFS to populate a range of page cache pages with data. 552 * 553 * NOTE: this function could be optimized to pre-allocate 554 * all pages in advance, drain exclusive busy on all of them, 555 * map them into contiguous KVA region and populate them 556 * in one single dmu_read() call. 
557 */ 558static int 559mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) 560{ 561 znode_t *zp = VTOZ(vp); 562 objset_t *os = zp->z_zfsvfs->z_os; 563 struct sf_buf *sf; 564 vm_object_t obj; 565 vm_page_t pp; 566 int64_t start; 567 caddr_t va; 568 int len = nbytes; 569 int off; 570 int error = 0; 571 572 ASSERT(uio->uio_segflg == UIO_NOCOPY); 573 ASSERT(vp->v_mount != NULL); 574 obj = vp->v_object; 575 ASSERT(obj != NULL); 576 ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); 577 578 zfs_vmobject_wlock(obj); 579 for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { 580 int bytes = MIN(PAGESIZE, len); 581 582 pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | 583 VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); 584 if (pp->valid == 0) { 585 zfs_vmobject_wunlock(obj); 586 va = zfs_map_page(pp, &sf); 587 error = dmu_read(os, zp->z_id, start, bytes, va, 588 DMU_READ_PREFETCH); 589 if (bytes != PAGESIZE && error == 0) 590 bzero(va + bytes, PAGESIZE - bytes); 591 zfs_unmap_page(sf); 592 zfs_vmobject_wlock(obj); 593 vm_page_sunbusy(pp); 594 vm_page_lock(pp); 595 if (error) { 596 if (pp->wire_count == 0 && pp->valid == 0 && 597 !vm_page_busied(pp)) 598 vm_page_free(pp); 599 } else { 600 pp->valid = VM_PAGE_BITS_ALL; 601 vm_page_activate(pp); 602 } 603 vm_page_unlock(pp); 604 } else { 605 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); 606 vm_page_sunbusy(pp); 607 } 608 if (error) 609 break; 610 uio->uio_resid -= bytes; 611 uio->uio_offset += bytes; 612 len -= bytes; 613 } 614 zfs_vmobject_wunlock(obj); 615 return (error); 616} 617 618/* 619 * When a file is memory mapped, we must keep the IO data synchronized 620 * between the DMU cache and the memory mapped pages. What this means: 621 * 622 * On Read: We "read" preferentially from memory mapped pages, 623 * else we default from the dmu buffer. 624 * 625 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 626 * the file is memory mapped. 
627 */ 628static int 629mappedread(vnode_t *vp, int nbytes, uio_t *uio) 630{ 631 znode_t *zp = VTOZ(vp); 632 vm_object_t obj; 633 int64_t start; 634 caddr_t va; 635 int len = nbytes; 636 int off; 637 int error = 0; 638 639 ASSERT(vp->v_mount != NULL); 640 obj = vp->v_object; 641 ASSERT(obj != NULL); 642 643 start = uio->uio_loffset; 644 off = start & PAGEOFFSET; 645 zfs_vmobject_wlock(obj); 646 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 647 vm_page_t pp; 648 uint64_t bytes = MIN(PAGESIZE - off, len); 649 650 if (pp = page_hold(vp, start)) { 651 struct sf_buf *sf; 652 caddr_t va; 653 654 zfs_vmobject_wunlock(obj); 655 va = zfs_map_page(pp, &sf); 656#ifdef illumos 657 error = uiomove(va + off, bytes, UIO_READ, uio); 658#else 659 error = vn_io_fault_uiomove(va + off, bytes, uio); 660#endif 661 zfs_unmap_page(sf); 662 zfs_vmobject_wlock(obj); 663 page_unhold(pp); 664 } else { 665 zfs_vmobject_wunlock(obj); 666 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 667 uio, bytes); 668 zfs_vmobject_wlock(obj); 669 } 670 len -= bytes; 671 off = 0; 672 if (error) 673 break; 674 } 675 zfs_vmobject_wunlock(obj); 676 return (error); 677} 678 679offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 680 681/* 682 * Read bytes from specified file into supplied buffer. 683 * 684 * IN: vp - vnode of file to be read from. 685 * uio - structure supplying read location, range info, 686 * and return buffer. 687 * ioflag - SYNC flags; used to provide FRSYNC semantics. 688 * cr - credentials of caller. 689 * ct - caller context 690 * 691 * OUT: uio - updated offset and range, buffer filled. 692 * 693 * RETURN: 0 on success, error code on failure. 
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Quarantined files may not be read. */
	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (zfsvfs->z_log &&
	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	/* Never read past EOF. */
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

#ifdef illumos
	/* Zero-copy path: pre-populate the xuio with loaned ARC buffers. */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			/* Non-power-of-2 blocksize implies a single block. */
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}
#endif	/* illumos */

	/* Copy out in zfs_read_chunk_size-aligned chunks. */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

#ifdef __FreeBSD__
		if (uio->uio_segflg == UIO_NOCOPY)
			error = mappedread_sf(vp, nbytes, uio);
		else
#endif /* __FreeBSD__ */
		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 * IN:	vp	- vnode of file to be written to.
 *	uio	- structure supplying write location, range info,
 *		  and data buffer.
 *	ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *		  set if in append mode.
 *	cr	- credentials of caller.
840 * ct - caller context (NFS/CIFS fem monitor only) 841 * 842 * OUT: uio - updated offset and range. 843 * 844 * RETURN: 0 on success, error code on failure. 845 * 846 * Timestamps: 847 * vp - ctime|mtime updated if byte count > 0 848 */ 849 850/* ARGSUSED */ 851static int 852zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 853{ 854 znode_t *zp = VTOZ(vp); 855 rlim64_t limit = MAXOFFSET_T; 856 ssize_t start_resid = uio->uio_resid; 857 ssize_t tx_bytes; 858 uint64_t end_size; 859 dmu_tx_t *tx; 860 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 861 zilog_t *zilog; 862 offset_t woff; 863 ssize_t n, nbytes; 864 rl_t *rl; 865 int max_blksz = zfsvfs->z_max_blksz; 866 int error = 0; 867 arc_buf_t *abuf; 868 iovec_t *aiov = NULL; 869 xuio_t *xuio = NULL; 870 int i_iov = 0; 871 int iovcnt = uio->uio_iovcnt; 872 iovec_t *iovp = uio->uio_iov; 873 int write_eof; 874 int count = 0; 875 sa_bulk_attr_t bulk[4]; 876 uint64_t mtime[2], ctime[2]; 877 878 /* 879 * Fasttrack empty write 880 */ 881 n = start_resid; 882 if (n == 0) 883 return (0); 884 885 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 886 limit = MAXOFFSET_T; 887 888 ZFS_ENTER(zfsvfs); 889 ZFS_VERIFY_ZP(zp); 890 891 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 892 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 893 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 894 &zp->z_size, 8); 895 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 896 &zp->z_pflags, 8); 897 898 /* 899 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our 900 * callers might not be able to detect properly that we are read-only, 901 * so check it explicitly here. 902 */ 903 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 904 ZFS_EXIT(zfsvfs); 905 return (SET_ERROR(EROFS)); 906 } 907 908 /* 909 * If immutable or not appending then return EPERM. 910 * Intentionally allow ZFS_READONLY through here. 
911 * See zfs_zaccess_common() 912 */ 913 if ((zp->z_pflags & ZFS_IMMUTABLE) || 914 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 915 (uio->uio_loffset < zp->z_size))) { 916 ZFS_EXIT(zfsvfs); 917 return (SET_ERROR(EPERM)); 918 } 919 920 zilog = zfsvfs->z_log; 921 922 /* 923 * Validate file offset 924 */ 925 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; 926 if (woff < 0) { 927 ZFS_EXIT(zfsvfs); 928 return (SET_ERROR(EINVAL)); 929 } 930 931 /* 932 * Check for mandatory locks before calling zfs_range_lock() 933 * in order to prevent a deadlock with locks set via fcntl(). 934 */ 935 if (MANDMODE((mode_t)zp->z_mode) && 936 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 937 ZFS_EXIT(zfsvfs); 938 return (error); 939 } 940 941#ifdef illumos 942 /* 943 * Pre-fault the pages to ensure slow (eg NFS) pages 944 * don't hold up txg. 945 * Skip this if uio contains loaned arc_buf. 946 */ 947 if ((uio->uio_extflg == UIO_XUIO) && 948 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) 949 xuio = (xuio_t *)uio; 950 else 951 uio_prefaultpages(MIN(n, max_blksz), uio); 952#endif 953 954 /* 955 * If in append mode, set the io offset pointer to eof. 956 */ 957 if (ioflag & FAPPEND) { 958 /* 959 * Obtain an appending range lock to guarantee file append 960 * semantics. We reset the write offset once we have the lock. 961 */ 962 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 963 woff = rl->r_off; 964 if (rl->r_len == UINT64_MAX) { 965 /* 966 * We overlocked the file because this write will cause 967 * the file block size to increase. 968 * Note that zp_size cannot change with this lock held. 969 */ 970 woff = zp->z_size; 971 } 972 uio->uio_loffset = woff; 973 } else { 974 /* 975 * Note that if the file block size will change as a result of 976 * this write, then this range lock will lock the entire file 977 * so that we can re-write the block safely. 
978 */ 979 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 980 } 981 982 if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { 983 zfs_range_unlock(rl); 984 ZFS_EXIT(zfsvfs); 985 return (EFBIG); 986 } 987 988 if (woff >= limit) { 989 zfs_range_unlock(rl); 990 ZFS_EXIT(zfsvfs); 991 return (SET_ERROR(EFBIG)); 992 } 993 994 if ((woff + n) > limit || woff > (limit - n)) 995 n = limit - woff; 996 997 /* Will this write extend the file length? */ 998 write_eof = (woff + n > zp->z_size); 999 1000 end_size = MAX(zp->z_size, woff + n); 1001 1002 /* 1003 * Write the file in reasonable size chunks. Each chunk is written 1004 * in a separate transaction; this keeps the intent log records small 1005 * and allows us to do more fine-grained space accounting. 1006 */ 1007 while (n > 0) { 1008 abuf = NULL; 1009 woff = uio->uio_loffset; 1010 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 1011 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 1012 if (abuf != NULL) 1013 dmu_return_arcbuf(abuf); 1014 error = SET_ERROR(EDQUOT); 1015 break; 1016 } 1017 1018 if (xuio && abuf == NULL) { 1019 ASSERT(i_iov < iovcnt); 1020 aiov = &iovp[i_iov]; 1021 abuf = dmu_xuio_arcbuf(xuio, i_iov); 1022 dmu_xuio_clear(xuio, i_iov); 1023 DTRACE_PROBE3(zfs_cp_write, int, i_iov, 1024 iovec_t *, aiov, arc_buf_t *, abuf); 1025 ASSERT((aiov->iov_base == abuf->b_data) || 1026 ((char *)aiov->iov_base - (char *)abuf->b_data + 1027 aiov->iov_len == arc_buf_size(abuf))); 1028 i_iov++; 1029 } else if (abuf == NULL && n >= max_blksz && 1030 woff >= zp->z_size && 1031 P2PHASE(woff, max_blksz) == 0 && 1032 zp->z_blksz == max_blksz) { 1033 /* 1034 * This write covers a full block. "Borrow" a buffer 1035 * from the dmu so that we can fill it before we enter 1036 * a transaction. This avoids the possibility of 1037 * holding up the transaction if the data copy hangs 1038 * up on a pagefault (e.g., from an NFS server mapping). 
1039 */ 1040 size_t cbytes; 1041 1042 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 1043 max_blksz); 1044 ASSERT(abuf != NULL); 1045 ASSERT(arc_buf_size(abuf) == max_blksz); 1046 if (error = uiocopy(abuf->b_data, max_blksz, 1047 UIO_WRITE, uio, &cbytes)) { 1048 dmu_return_arcbuf(abuf); 1049 break; 1050 } 1051 ASSERT(cbytes == max_blksz); 1052 } 1053 1054 /* 1055 * Start a transaction. 1056 */ 1057 tx = dmu_tx_create(zfsvfs->z_os); 1058 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1059 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 1060 zfs_sa_upgrade_txholds(tx, zp); 1061 error = dmu_tx_assign(tx, TXG_WAIT); 1062 if (error) { 1063 dmu_tx_abort(tx); 1064 if (abuf != NULL) 1065 dmu_return_arcbuf(abuf); 1066 break; 1067 } 1068 1069 /* 1070 * If zfs_range_lock() over-locked we grow the blocksize 1071 * and then reduce the lock range. This will only happen 1072 * on the first iteration since zfs_range_reduce() will 1073 * shrink down r_len to the appropriate size. 1074 */ 1075 if (rl->r_len == UINT64_MAX) { 1076 uint64_t new_blksz; 1077 1078 if (zp->z_blksz > max_blksz) { 1079 /* 1080 * File's blocksize is already larger than the 1081 * "recordsize" property. Only let it grow to 1082 * the next power of 2. 1083 */ 1084 ASSERT(!ISP2(zp->z_blksz)); 1085 new_blksz = MIN(end_size, 1086 1 << highbit64(zp->z_blksz)); 1087 } else { 1088 new_blksz = MIN(end_size, max_blksz); 1089 } 1090 zfs_grow_blocksize(zp, new_blksz, tx); 1091 zfs_range_reduce(rl, woff, n); 1092 } 1093 1094 /* 1095 * XXX - should we really limit each write to z_max_blksz? 1096 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 
1097 */ 1098 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 1099 1100 if (woff + nbytes > zp->z_size) 1101 vnode_pager_setsize(vp, woff + nbytes); 1102 1103 if (abuf == NULL) { 1104 tx_bytes = uio->uio_resid; 1105 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), 1106 uio, nbytes, tx); 1107 tx_bytes -= uio->uio_resid; 1108 } else { 1109 tx_bytes = nbytes; 1110 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); 1111 /* 1112 * If this is not a full block write, but we are 1113 * extending the file past EOF and this data starts 1114 * block-aligned, use assign_arcbuf(). Otherwise, 1115 * write via dmu_write(). 1116 */ 1117 if (tx_bytes < max_blksz && (!write_eof || 1118 aiov->iov_base != abuf->b_data)) { 1119 ASSERT(xuio); 1120 dmu_write(zfsvfs->z_os, zp->z_id, woff, 1121 aiov->iov_len, aiov->iov_base, tx); 1122 dmu_return_arcbuf(abuf); 1123 xuio_stat_wbuf_copied(); 1124 } else { 1125 ASSERT(xuio || tx_bytes == max_blksz); 1126 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), 1127 woff, abuf, tx); 1128 } 1129 ASSERT(tx_bytes <= uio->uio_resid); 1130 uioskip(uio, tx_bytes); 1131 } 1132 if (tx_bytes && vn_has_cached_data(vp)) { 1133 update_pages(vp, woff, tx_bytes, zfsvfs->z_os, 1134 zp->z_id, uio->uio_segflg, tx); 1135 } 1136 1137 /* 1138 * If we made no progress, we're done. If we made even 1139 * partial progress, update the znode and ZIL accordingly. 1140 */ 1141 if (tx_bytes == 0) { 1142 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 1143 (void *)&zp->z_size, sizeof (uint64_t), tx); 1144 dmu_tx_commit(tx); 1145 ASSERT(error != 0); 1146 break; 1147 } 1148 1149 /* 1150 * Clear Set-UID/Set-GID bits on successful write if not 1151 * privileged and at least one of the excute bits is set. 1152 * 1153 * It would be nice to to this after all writes have 1154 * been done, but that would still expose the ISUID/ISGID 1155 * to another app after the partial write is committed. 
1156 * 1157 * Note: we don't call zfs_fuid_map_id() here because 1158 * user 0 is not an ephemeral uid. 1159 */ 1160 mutex_enter(&zp->z_acl_lock); 1161 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | 1162 (S_IXUSR >> 6))) != 0 && 1163 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && 1164 secpolicy_vnode_setid_retain(vp, cr, 1165 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { 1166 uint64_t newmode; 1167 zp->z_mode &= ~(S_ISUID | S_ISGID); 1168 newmode = zp->z_mode; 1169 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), 1170 (void *)&newmode, sizeof (uint64_t), tx); 1171 } 1172 mutex_exit(&zp->z_acl_lock); 1173 1174 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 1175 B_TRUE); 1176 1177 /* 1178 * Update the file size (zp_size) if it has changed; 1179 * account for possible concurrent updates. 1180 */ 1181 while ((end_size = zp->z_size) < uio->uio_loffset) { 1182 (void) atomic_cas_64(&zp->z_size, end_size, 1183 uio->uio_loffset); 1184#ifdef illumos 1185 ASSERT(error == 0); 1186#else 1187 ASSERT(error == 0 || error == EFAULT); 1188#endif 1189 } 1190 /* 1191 * If we are replaying and eof is non zero then force 1192 * the file size to the specified eof. Note, there's no 1193 * concurrency during replay. 1194 */ 1195 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) 1196 zp->z_size = zfsvfs->z_replay_eof; 1197 1198 if (error == 0) 1199 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1200 else 1201 (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1202 1203 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 1204 dmu_tx_commit(tx); 1205 1206 if (error != 0) 1207 break; 1208 ASSERT(tx_bytes == nbytes); 1209 n -= nbytes; 1210 1211#ifdef illumos 1212 if (!xuio && n > 0) 1213 uio_prefaultpages(MIN(n, max_blksz), uio); 1214#endif 1215 } 1216 1217 zfs_range_unlock(rl); 1218 1219 /* 1220 * If we're in replay mode, or we made no progress, return error. 1221 * Otherwise, it's at least a partial write, so it's successful. 
 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef __FreeBSD__
	/*
	 * EFAULT means that at least one page of the source buffer was not
	 * available.  VFS will re-try remaining I/O upon this error.
	 */
	if (error == EFAULT) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
#endif

	/* Honor synchronous write semantics by committing the intent log. */
	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Completion callback used by zfs_get_data()/dmu_sync(): releases the
 * dbuf (if any) and the range lock held for the sync write, drops the
 * vnode hold, records the written block in the ZIL lwb on success, and
 * frees the zgd.
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
/* Debug knob: when set, zfs_get_data() injects a single EIO failure. */
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 *
 * The write is logged one of two ways (see the comment in the body):
 * "immediate", in which case the data is copied into 'buf', or
 * "indirect", in which case the data is synced out via dmu_sync() and
 * only a block pointer ends up in the log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data.  We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			/* Block size changed while waiting; retry. */
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				/*
				 * TX_WRITE2 relies on the data previously
				 * written by the TX_WRITE that caused
				 * EALREADY.  We zero out the BP because
				 * it is the old, currently-on-disk BP,
				 * so there's no need to zio_flush() its
				 * vdevs (flushing would needlessly hurt
				 * performance, and doesn't work on
				 * indirect vdevs).
				 */
				zgd->zgd_bp = NULL;
				BP_ZERO(bp);
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

/*
 * Check access rights on a vnode.  If V_ACE_MASK is set in 'flag',
 * 'mode' is interpreted as an ACE mask and checked via zfs_zaccess();
 * otherwise the legacy rwx-style check zfs_zaccess_rwx() is used.
 */
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Callback for vn_vget_ino_gen() (used by zfs_lookup_lock() for the
 * ".." case): locks the vnode passed via 'arg', returning it in '*vpp',
 * and drops the reference on lock failure.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int error;

	*vpp = arg;
	error = vn_lock(*vpp, lkflags);
	if (error != 0)
		vrele(*vpp);
	return (error);
}

/*
 * Lock the vnode 'vp' produced by a lookup in directory 'dvp' with
 * 'lkflags', handling the three cases: "." (same vnode — may need a
 * lock upgrade/downgrade), ".." (reverse lock order — delegated to
 * vn_vget_ino_gen()), and ordinary entries (plain vn_lock()).
 * On failure the reference on the looked-up vnode is dropped.
 */
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relock for the "." case could leave us with
			 * reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 * IN:	dvp	- vnode of directory to search.
 *	nm	- name of entry to lookup.
 *	pnp	- full pathname to lookup [UNUSED].
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	rdir	- root directory vnode [UNUSED].
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 * RETURN:	0 on success, error code on failure.
1521 * 1522 * Timestamps: 1523 * NA 1524 */ 1525/* ARGSUSED */ 1526static int 1527zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, 1528 int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached) 1529{ 1530 znode_t *zdp = VTOZ(dvp); 1531 znode_t *zp; 1532 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1533 int error = 0; 1534 1535 /* 1536 * Fast path lookup, however we must skip DNLC lookup 1537 * for case folding or normalizing lookups because the 1538 * DNLC code only stores the passed in name. This means 1539 * creating 'a' and removing 'A' on a case insensitive 1540 * file system would work, but DNLC still thinks 'a' 1541 * exists and won't let you create it again on the next 1542 * pass through fast path. 1543 */ 1544 if (!(flags & LOOKUP_XATTR)) { 1545 if (dvp->v_type != VDIR) { 1546 return (SET_ERROR(ENOTDIR)); 1547 } else if (zdp->z_sa_hdl == NULL) { 1548 return (SET_ERROR(EIO)); 1549 } 1550 } 1551 1552 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); 1553 1554 ZFS_ENTER(zfsvfs); 1555 ZFS_VERIFY_ZP(zdp); 1556 1557 *vpp = NULL; 1558 1559 if (flags & LOOKUP_XATTR) { 1560#ifdef TODO 1561 /* 1562 * If the xattr property is off, refuse the lookup request. 1563 */ 1564 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1565 ZFS_EXIT(zfsvfs); 1566 return (SET_ERROR(EINVAL)); 1567 } 1568#endif 1569 1570 /* 1571 * We don't allow recursive attributes.. 1572 * Maybe someday we will. 1573 */ 1574 if (zdp->z_pflags & ZFS_XATTR) { 1575 ZFS_EXIT(zfsvfs); 1576 return (SET_ERROR(EINVAL)); 1577 } 1578 1579 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1580 ZFS_EXIT(zfsvfs); 1581 return (error); 1582 } 1583 1584 /* 1585 * Do we have permission to get into attribute directory? 1586 */ 1587 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1588 B_FALSE, cr)) { 1589 vrele(*vpp); 1590 *vpp = NULL; 1591 } 1592 1593 ZFS_EXIT(zfsvfs); 1594 return (error); 1595 } 1596 1597 /* 1598 * Check accessibility of directory. 
1599 */ 1600 if (!cached) { 1601 if ((cnp->cn_flags & NOEXECCHECK) != 0) { 1602 cnp->cn_flags &= ~NOEXECCHECK; 1603 } else { 1604 error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); 1605 if (error != 0) { 1606 ZFS_EXIT(zfsvfs); 1607 return (error); 1608 } 1609 } 1610 } 1611 1612 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1613 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1614 ZFS_EXIT(zfsvfs); 1615 return (SET_ERROR(EILSEQ)); 1616 } 1617 1618 1619 /* 1620 * First handle the special cases. 1621 */ 1622 if ((cnp->cn_flags & ISDOTDOT) != 0) { 1623 /* 1624 * If we are a snapshot mounted under .zfs, return 1625 * the vp for the snapshot directory. 1626 */ 1627 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { 1628 struct componentname cn; 1629 vnode_t *zfsctl_vp; 1630 int ltype; 1631 1632 ZFS_EXIT(zfsvfs); 1633 ltype = VOP_ISLOCKED(dvp); 1634 VOP_UNLOCK(dvp, 0); 1635 error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, 1636 &zfsctl_vp); 1637 if (error == 0) { 1638 cn.cn_nameptr = "snapshot"; 1639 cn.cn_namelen = strlen(cn.cn_nameptr); 1640 cn.cn_nameiop = cnp->cn_nameiop; 1641 cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; 1642 cn.cn_lkflags = cnp->cn_lkflags; 1643 error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); 1644 vput(zfsctl_vp); 1645 } 1646 vn_lock(dvp, ltype | LK_RETRY); 1647 return (error); 1648 } 1649 } 1650 if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { 1651 ZFS_EXIT(zfsvfs); 1652 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) 1653 return (SET_ERROR(ENOTSUP)); 1654 error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); 1655 return (error); 1656 } 1657 1658 /* 1659 * The loop is retry the lookup if the parent-child relationship 1660 * changes during the dot-dot locking complexities. 
1661 */ 1662 for (;;) { 1663 uint64_t parent; 1664 1665 error = zfs_dirlook(zdp, nm, &zp); 1666 if (error == 0) 1667 *vpp = ZTOV(zp); 1668 1669 ZFS_EXIT(zfsvfs); 1670 if (error != 0) 1671 break; 1672 1673 error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); 1674 if (error != 0) { 1675 /* 1676 * If we've got a locking error, then the vnode 1677 * got reclaimed because of a force unmount. 1678 * We never enter doomed vnodes into the name cache. 1679 */ 1680 *vpp = NULL; 1681 return (error); 1682 } 1683 1684 if ((cnp->cn_flags & ISDOTDOT) == 0) 1685 break; 1686 1687 ZFS_ENTER(zfsvfs); 1688 if (zdp->z_sa_hdl == NULL) { 1689 error = SET_ERROR(EIO); 1690 } else { 1691 error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1692 &parent, sizeof (parent)); 1693 } 1694 if (error != 0) { 1695 ZFS_EXIT(zfsvfs); 1696 vput(ZTOV(zp)); 1697 break; 1698 } 1699 if (zp->z_id == parent) { 1700 ZFS_EXIT(zfsvfs); 1701 break; 1702 } 1703 vput(ZTOV(zp)); 1704 } 1705 1706out: 1707 if (error != 0) 1708 *vpp = NULL; 1709 1710 /* Translate errors and add SAVENAME when needed. */ 1711 if (cnp->cn_flags & ISLASTCN) { 1712 switch (nameiop) { 1713 case CREATE: 1714 case RENAME: 1715 if (error == ENOENT) { 1716 error = EJUSTRETURN; 1717 cnp->cn_flags |= SAVENAME; 1718 break; 1719 } 1720 /* FALLTHROUGH */ 1721 case DELETE: 1722 if (error == 0) 1723 cnp->cn_flags |= SAVENAME; 1724 break; 1725 } 1726 } 1727 1728 /* Insert name into cache (as non-existent) if appropriate. */ 1729 if (zfsvfs->z_use_namecache && 1730 error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) 1731 cache_enter(dvp, NULL, cnp); 1732 1733 /* Insert name into cache if appropriate. */ 1734 if (zfsvfs->z_use_namecache && 1735 error == 0 && (cnp->cn_flags & MAKEENTRY)) { 1736 if (!(cnp->cn_flags & ISLASTCN) || 1737 (nameiop != DELETE && nameiop != RENAME)) { 1738 cache_enter(dvp, *vpp, cnp); 1739 } 1740 } 1741 1742 return (error); 1743} 1744 1745/* 1746 * Attempt to create a new entry in a directory. 
If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 * IN:	dvp	- vnode of directory to put new file entry in.
 *	name	- name of new file entry.
 *	vap	- attributes of new file.
 *	excl	- flag indicating exclusive or non-exclusive mode.
 *	mode	- mode to open file with.
 *	cr	- credentials of caller.
 *	flag	- large file flag [UNUSED].
 *	ct	- caller context
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created or trunc'd entry.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	void		*vsecp = NULL;
	int		flag = 0;
	uint64_t	txtype;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	*vpp = NULL;

	/* Strip the sticky bit if the caller lacks the privilege to set it. */
	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		goto out;
	}

	/*
	 * We only support the creation of regular files in
	 * extended attribute directories.
	 */

	if ((dzp->z_pflags & ZFS_XATTR) &&
	    (vap->va_type != VREG)) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids)) != 0)
		goto out;

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	/* Reserve a vnode for zfs_mknode() before entering the tx. */
	getnewvnode_reserve(1);

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
	    vsecp, acl_ids.z_fuidp, vap);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

out:
	/* On success zp was created by zfs_mknode() above. */
	if (error == 0) {
		*vpp = ZTOV(zp);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 * IN:	dvp	- vnode of directory to remove entry from.
 *	name	- name of entry to remove.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */

/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	znode_t		*xzp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	uint64_t	obj = 0;
	dmu_tx_t	*tx;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;
	zp = VTOZ(vp);

	xattr_obj = 0;
	xzp = NULL;

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not declared in this function; this only
	 * compiles because vnevent_remove() is expected to be a no-op
	 * macro on FreeBSD — confirm against the compat headers.
	 */
	vnevent_remove(vp, dvp, name, ct);

	obj = zp->z_id;

	/* are there any extended attributes?
 */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
	}

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);

	if (xzp) {
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * This was the last link; defer final destruction via the
		 * unlinked set and skip syncing of the doomed vnode.
		 */
		zfs_unlinked_add(zp, tx);
		vp->v_vflag |= VV_NOSYNC;
	}

	txtype = TX_REMOVE;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
out:

	if (xzp)
		vrele(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 * IN:	dvp	- vnode of directory to add subdir to.
 *	dirname	- name of new directory.
 *	vap	- attributes of new directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created directory.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    ((vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Directories may not be created inside xattr directories. */
	if (dzp->z_pflags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
	*vpp = NULL;

	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 * IN:	dvp	- vnode of directory to remove from.
 *	name	- name of directory to be removed.
 *	cwd	- vnode of current working directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;


	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not declared in this function; this only
	 * compiles because vnevent_rmdir() is expected to be a no-op
	 * macro on FreeBSD — confirm against the compat headers.
	 */
	vnevent_rmdir(vp, dvp, name, ct);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Drop name-cache entries under the parent before unlinking. */
	cache_purge(dvp);

	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
	}

	dmu_tx_commit(tx);

	cache_purge(vp);
out:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 * IN:	vp	- vnode of directory to read.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *	eofp	- set to true if end-of-file detected.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
2316 */ 2317/* ARGSUSED */ 2318static int 2319zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) 2320{ 2321 znode_t *zp = VTOZ(vp); 2322 iovec_t *iovp; 2323 edirent_t *eodp; 2324 dirent64_t *odp; 2325 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2326 objset_t *os; 2327 caddr_t outbuf; 2328 size_t bufsize; 2329 zap_cursor_t zc; 2330 zap_attribute_t zap; 2331 uint_t bytes_wanted; 2332 uint64_t offset; /* must be unsigned; checks for < 1 */ 2333 uint64_t parent; 2334 int local_eof; 2335 int outcount; 2336 int error; 2337 uint8_t prefetch; 2338 boolean_t check_sysattrs; 2339 uint8_t type; 2340 int ncooks; 2341 u_long *cooks = NULL; 2342 int flags = 0; 2343 2344 ZFS_ENTER(zfsvfs); 2345 ZFS_VERIFY_ZP(zp); 2346 2347 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 2348 &parent, sizeof (parent))) != 0) { 2349 ZFS_EXIT(zfsvfs); 2350 return (error); 2351 } 2352 2353 /* 2354 * If we are not given an eof variable, 2355 * use a local one. 2356 */ 2357 if (eofp == NULL) 2358 eofp = &local_eof; 2359 2360 /* 2361 * Check for valid iov_len. 2362 */ 2363 if (uio->uio_iov->iov_len <= 0) { 2364 ZFS_EXIT(zfsvfs); 2365 return (SET_ERROR(EINVAL)); 2366 } 2367 2368 /* 2369 * Quit if directory has been removed (posix) 2370 */ 2371 if ((*eofp = zp->z_unlinked) != 0) { 2372 ZFS_EXIT(zfsvfs); 2373 return (0); 2374 } 2375 2376 error = 0; 2377 os = zfsvfs->z_os; 2378 offset = uio->uio_loffset; 2379 prefetch = zp->z_zn_prefetch; 2380 2381 /* 2382 * Initialize the iterator cursor. 2383 */ 2384 if (offset <= 3) { 2385 /* 2386 * Start iteration from the beginning of the directory. 2387 */ 2388 zap_cursor_init(&zc, os, zp->z_id); 2389 } else { 2390 /* 2391 * The offset is a serialized cursor. 2392 */ 2393 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 2394 } 2395 2396 /* 2397 * Get space to change directory entries into fs independent format. 
2398 */ 2399 iovp = uio->uio_iov; 2400 bytes_wanted = iovp->iov_len; 2401 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 2402 bufsize = bytes_wanted; 2403 outbuf = kmem_alloc(bufsize, KM_SLEEP); 2404 odp = (struct dirent64 *)outbuf; 2405 } else { 2406 bufsize = bytes_wanted; 2407 outbuf = NULL; 2408 odp = (struct dirent64 *)iovp->iov_base; 2409 } 2410 eodp = (struct edirent *)odp; 2411 2412 if (ncookies != NULL) { 2413 /* 2414 * Minimum entry size is dirent size and 1 byte for a file name. 2415 */ 2416 ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); 2417 cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); 2418 *cookies = cooks; 2419 *ncookies = ncooks; 2420 } 2421 /* 2422 * If this VFS supports the system attribute view interface; and 2423 * we're looking at an extended attribute directory; and we care 2424 * about normalization conflicts on this vfs; then we must check 2425 * for normalization conflicts with the sysattr name space. 2426 */ 2427#ifdef TODO 2428 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 2429 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 2430 (flags & V_RDDIR_ENTFLAGS); 2431#else 2432 check_sysattrs = 0; 2433#endif 2434 2435 /* 2436 * Transform to file-system independent format 2437 */ 2438 outcount = 0; 2439 while (outcount < bytes_wanted) { 2440 ino64_t objnum; 2441 ushort_t reclen; 2442 off64_t *next = NULL; 2443 2444 /* 2445 * Special case `.', `..', and `.zfs'. 
2446 */ 2447 if (offset == 0) { 2448 (void) strcpy(zap.za_name, "."); 2449 zap.za_normalization_conflict = 0; 2450 objnum = zp->z_id; 2451 type = DT_DIR; 2452 } else if (offset == 1) { 2453 (void) strcpy(zap.za_name, ".."); 2454 zap.za_normalization_conflict = 0; 2455 objnum = parent; 2456 type = DT_DIR; 2457 } else if (offset == 2 && zfs_show_ctldir(zp)) { 2458 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 2459 zap.za_normalization_conflict = 0; 2460 objnum = ZFSCTL_INO_ROOT; 2461 type = DT_DIR; 2462 } else { 2463 /* 2464 * Grab next entry. 2465 */ 2466 if (error = zap_cursor_retrieve(&zc, &zap)) { 2467 if ((*eofp = (error == ENOENT)) != 0) 2468 break; 2469 else 2470 goto update; 2471 } 2472 2473 if (zap.za_integer_length != 8 || 2474 zap.za_num_integers != 1) { 2475 cmn_err(CE_WARN, "zap_readdir: bad directory " 2476 "entry, obj = %lld, offset = %lld\n", 2477 (u_longlong_t)zp->z_id, 2478 (u_longlong_t)offset); 2479 error = SET_ERROR(ENXIO); 2480 goto update; 2481 } 2482 2483 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 2484 /* 2485 * MacOS X can extract the object type here such as: 2486 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2487 */ 2488 type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2489 2490 if (check_sysattrs && !zap.za_normalization_conflict) { 2491#ifdef TODO 2492 zap.za_normalization_conflict = 2493 xattr_sysattr_casechk(zap.za_name); 2494#else 2495 panic("%s:%u: TODO", __func__, __LINE__); 2496#endif 2497 } 2498 } 2499 2500 if (flags & V_RDDIR_ACCFILTER) { 2501 /* 2502 * If we have no access at all, don't include 2503 * this entry in the returned information 2504 */ 2505 znode_t *ezp; 2506 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) 2507 goto skip_entry; 2508 if (!zfs_has_access(ezp, cr)) { 2509 vrele(ZTOV(ezp)); 2510 goto skip_entry; 2511 } 2512 vrele(ZTOV(ezp)); 2513 } 2514 2515 if (flags & V_RDDIR_ENTFLAGS) 2516 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 2517 else 2518 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 2519 2520 /* 
2521 * Will this entry fit in the buffer? 2522 */ 2523 if (outcount + reclen > bufsize) { 2524 /* 2525 * Did we manage to fit anything in the buffer? 2526 */ 2527 if (!outcount) { 2528 error = SET_ERROR(EINVAL); 2529 goto update; 2530 } 2531 break; 2532 } 2533 if (flags & V_RDDIR_ENTFLAGS) { 2534 /* 2535 * Add extended flag entry: 2536 */ 2537 eodp->ed_ino = objnum; 2538 eodp->ed_reclen = reclen; 2539 /* NOTE: ed_off is the offset for the *next* entry */ 2540 next = &(eodp->ed_off); 2541 eodp->ed_eflags = zap.za_normalization_conflict ? 2542 ED_CASE_CONFLICT : 0; 2543 (void) strncpy(eodp->ed_name, zap.za_name, 2544 EDIRENT_NAMELEN(reclen)); 2545 eodp = (edirent_t *)((intptr_t)eodp + reclen); 2546 } else { 2547 /* 2548 * Add normal entry: 2549 */ 2550 odp->d_ino = objnum; 2551 odp->d_reclen = reclen; 2552 odp->d_namlen = strlen(zap.za_name); 2553 (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); 2554 odp->d_type = type; 2555 dirent_terminate(odp); 2556 odp = (dirent64_t *)((intptr_t)odp + reclen); 2557 } 2558 outcount += reclen; 2559 2560 ASSERT(outcount <= bufsize); 2561 2562 /* Prefetch znode */ 2563 if (prefetch) 2564 dmu_prefetch(os, objnum, 0, 0, 0, 2565 ZIO_PRIORITY_SYNC_READ); 2566 2567 skip_entry: 2568 /* 2569 * Move to the next entry, fill in the previous offset. 
2570 */ 2571 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2572 zap_cursor_advance(&zc); 2573 offset = zap_cursor_serialize(&zc); 2574 } else { 2575 offset += 1; 2576 } 2577 2578 if (cooks != NULL) { 2579 *cooks++ = offset; 2580 ncooks--; 2581 KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); 2582 } 2583 } 2584 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2585 2586 /* Subtract unused cookies */ 2587 if (ncookies != NULL) 2588 *ncookies -= ncooks; 2589 2590 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 2591 iovp->iov_base += outcount; 2592 iovp->iov_len -= outcount; 2593 uio->uio_resid -= outcount; 2594 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 2595 /* 2596 * Reset the pointer. 2597 */ 2598 offset = uio->uio_loffset; 2599 } 2600 2601update: 2602 zap_cursor_fini(&zc); 2603 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) 2604 kmem_free(outbuf, bufsize); 2605 2606 if (error == ENOENT) 2607 error = 0; 2608 2609 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2610 2611 uio->uio_loffset = offset; 2612 ZFS_EXIT(zfsvfs); 2613 if (error != 0 && cookies != NULL) { 2614 free(*cookies, M_TEMP); 2615 *cookies = NULL; 2616 *ncookies = 0; 2617 } 2618 return (error); 2619} 2620 2621ulong_t zfs_fsync_sync_cnt = 4; 2622 2623static int 2624zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 2625{ 2626 znode_t *zp = VTOZ(vp); 2627 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2628 2629 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); 2630 2631 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 2632 ZFS_ENTER(zfsvfs); 2633 ZFS_VERIFY_ZP(zp); 2634 zil_commit(zfsvfs->z_log, zp->z_id); 2635 ZFS_EXIT(zfsvfs); 2636 } 2637 return (0); 2638} 2639 2640 2641/* 2642 * Get the requested file attributes and place them in the provided 2643 * vattr structure. 2644 * 2645 * IN: vp - vnode of file. 2646 * vap - va_mask identifies requested attributes. 
2647 * If AT_XVATTR set, then optional attrs are requested 2648 * flags - ATTR_NOACLCHECK (CIFS server context) 2649 * cr - credentials of caller. 2650 * ct - caller context 2651 * 2652 * OUT: vap - attribute values. 2653 * 2654 * RETURN: 0 (always succeeds). 2655 */ 2656/* ARGSUSED */ 2657static int 2658zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2659 caller_context_t *ct) 2660{ 2661 znode_t *zp = VTOZ(vp); 2662 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2663 int error = 0; 2664 uint32_t blksize; 2665 u_longlong_t nblocks; 2666 uint64_t links; 2667 uint64_t mtime[2], ctime[2], crtime[2], rdev; 2668 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2669 xoptattr_t *xoap = NULL; 2670 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2671 sa_bulk_attr_t bulk[4]; 2672 int count = 0; 2673 2674 ZFS_ENTER(zfsvfs); 2675 ZFS_VERIFY_ZP(zp); 2676 2677 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); 2678 2679 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 2680 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 2681 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); 2682 if (vp->v_type == VBLK || vp->v_type == VCHR) 2683 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, 2684 &rdev, 8); 2685 2686 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { 2687 ZFS_EXIT(zfsvfs); 2688 return (error); 2689 } 2690 2691 /* 2692 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 2693 * Also, if we are the owner don't bother, since owner should 2694 * always be allowed to read basic attributes of file. 2695 */ 2696 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && 2697 (vap->va_uid != crgetuid(cr))) { 2698 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, 2699 skipaclchk, cr)) { 2700 ZFS_EXIT(zfsvfs); 2701 return (error); 2702 } 2703 } 2704 2705 /* 2706 * Return all attributes. 
It's cheaper to provide the answer 2707 * than to determine whether we were asked the question. 2708 */ 2709 2710 vap->va_type = IFTOVT(zp->z_mode); 2711 vap->va_mode = zp->z_mode & ~S_IFMT; 2712#ifdef illumos 2713 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 2714#else 2715 vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; 2716#endif 2717 vap->va_nodeid = zp->z_id; 2718 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) 2719 links = zp->z_links + 1; 2720 else 2721 links = zp->z_links; 2722 vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */ 2723 vap->va_size = zp->z_size; 2724#ifdef illumos 2725 vap->va_rdev = vp->v_rdev; 2726#else 2727 if (vp->v_type == VBLK || vp->v_type == VCHR) 2728 vap->va_rdev = zfs_cmpldev(rdev); 2729#endif 2730 vap->va_seq = zp->z_seq; 2731 vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ 2732 vap->va_filerev = zp->z_seq; 2733 2734 /* 2735 * Add in any requested optional attributes and the create time. 2736 * Also set the corresponding bits in the returned attribute bitmap. 
2737 */ 2738 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { 2739 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 2740 xoap->xoa_archive = 2741 ((zp->z_pflags & ZFS_ARCHIVE) != 0); 2742 XVA_SET_RTN(xvap, XAT_ARCHIVE); 2743 } 2744 2745 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 2746 xoap->xoa_readonly = 2747 ((zp->z_pflags & ZFS_READONLY) != 0); 2748 XVA_SET_RTN(xvap, XAT_READONLY); 2749 } 2750 2751 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 2752 xoap->xoa_system = 2753 ((zp->z_pflags & ZFS_SYSTEM) != 0); 2754 XVA_SET_RTN(xvap, XAT_SYSTEM); 2755 } 2756 2757 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 2758 xoap->xoa_hidden = 2759 ((zp->z_pflags & ZFS_HIDDEN) != 0); 2760 XVA_SET_RTN(xvap, XAT_HIDDEN); 2761 } 2762 2763 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2764 xoap->xoa_nounlink = 2765 ((zp->z_pflags & ZFS_NOUNLINK) != 0); 2766 XVA_SET_RTN(xvap, XAT_NOUNLINK); 2767 } 2768 2769 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2770 xoap->xoa_immutable = 2771 ((zp->z_pflags & ZFS_IMMUTABLE) != 0); 2772 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 2773 } 2774 2775 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2776 xoap->xoa_appendonly = 2777 ((zp->z_pflags & ZFS_APPENDONLY) != 0); 2778 XVA_SET_RTN(xvap, XAT_APPENDONLY); 2779 } 2780 2781 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2782 xoap->xoa_nodump = 2783 ((zp->z_pflags & ZFS_NODUMP) != 0); 2784 XVA_SET_RTN(xvap, XAT_NODUMP); 2785 } 2786 2787 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 2788 xoap->xoa_opaque = 2789 ((zp->z_pflags & ZFS_OPAQUE) != 0); 2790 XVA_SET_RTN(xvap, XAT_OPAQUE); 2791 } 2792 2793 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2794 xoap->xoa_av_quarantined = 2795 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); 2796 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 2797 } 2798 2799 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2800 xoap->xoa_av_modified = 2801 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); 2802 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 2803 } 2804 2805 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && 2806 vp->v_type == VREG) { 2807 
zfs_sa_get_scanstamp(zp, xvap); 2808 } 2809 2810 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2811 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); 2812 XVA_SET_RTN(xvap, XAT_REPARSE); 2813 } 2814 if (XVA_ISSET_REQ(xvap, XAT_GEN)) { 2815 xoap->xoa_generation = zp->z_gen; 2816 XVA_SET_RTN(xvap, XAT_GEN); 2817 } 2818 2819 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { 2820 xoap->xoa_offline = 2821 ((zp->z_pflags & ZFS_OFFLINE) != 0); 2822 XVA_SET_RTN(xvap, XAT_OFFLINE); 2823 } 2824 2825 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { 2826 xoap->xoa_sparse = 2827 ((zp->z_pflags & ZFS_SPARSE) != 0); 2828 XVA_SET_RTN(xvap, XAT_SPARSE); 2829 } 2830 } 2831 2832 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); 2833 ZFS_TIME_DECODE(&vap->va_mtime, mtime); 2834 ZFS_TIME_DECODE(&vap->va_ctime, ctime); 2835 ZFS_TIME_DECODE(&vap->va_birthtime, crtime); 2836 2837 2838 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 2839 vap->va_blksize = blksize; 2840 vap->va_bytes = nblocks << 9; /* nblocks * 512 */ 2841 2842 if (zp->z_blksz == 0) { 2843 /* 2844 * Block size hasn't been set; suggest maximal I/O transfers. 2845 */ 2846 vap->va_blksize = zfsvfs->z_max_blksz; 2847 } 2848 2849 ZFS_EXIT(zfsvfs); 2850 return (0); 2851} 2852 2853/* 2854 * Set the file attributes to the values contained in the 2855 * vattr structure. 2856 * 2857 * IN: vp - vnode of file to be modified. 2858 * vap - new attribute values. 2859 * If AT_XVATTR set, then optional attrs are being set 2860 * flags - ATTR_UTIME set if non-default time values provided. 2861 * - ATTR_NOACLCHECK (CIFS context only). 2862 * cr - credentials of caller. 2863 * ct - caller context 2864 * 2865 * RETURN: 0 on success, error code on failure. 2866 * 2867 * Timestamps: 2868 * vp - ctime updated, mtime updated if size changed. 
2869 */ 2870/* ARGSUSED */ 2871static int 2872zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2873 caller_context_t *ct) 2874{ 2875 znode_t *zp = VTOZ(vp); 2876 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2877 zilog_t *zilog; 2878 dmu_tx_t *tx; 2879 vattr_t oldva; 2880 xvattr_t tmpxvattr; 2881 uint_t mask = vap->va_mask; 2882 uint_t saved_mask = 0; 2883 uint64_t saved_mode; 2884 int trim_mask = 0; 2885 uint64_t new_mode; 2886 uint64_t new_uid, new_gid; 2887 uint64_t xattr_obj; 2888 uint64_t mtime[2], ctime[2]; 2889 znode_t *attrzp; 2890 int need_policy = FALSE; 2891 int err, err2; 2892 zfs_fuid_info_t *fuidp = NULL; 2893 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2894 xoptattr_t *xoap; 2895 zfs_acl_t *aclp; 2896 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2897 boolean_t fuid_dirtied = B_FALSE; 2898 sa_bulk_attr_t bulk[7], xattr_bulk[7]; 2899 int count = 0, xattr_count = 0; 2900 2901 if (mask == 0) 2902 return (0); 2903 2904 if (mask & AT_NOSET) 2905 return (SET_ERROR(EINVAL)); 2906 2907 ZFS_ENTER(zfsvfs); 2908 ZFS_VERIFY_ZP(zp); 2909 2910 zilog = zfsvfs->z_log; 2911 2912 /* 2913 * Make sure that if we have ephemeral uid/gid or xvattr specified 2914 * that file system is at proper version level 2915 */ 2916 2917 if (zfsvfs->z_use_fuids == B_FALSE && 2918 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 2919 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || 2920 (mask & AT_XVATTR))) { 2921 ZFS_EXIT(zfsvfs); 2922 return (SET_ERROR(EINVAL)); 2923 } 2924 2925 if (mask & AT_SIZE && vp->v_type == VDIR) { 2926 ZFS_EXIT(zfsvfs); 2927 return (SET_ERROR(EISDIR)); 2928 } 2929 2930 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { 2931 ZFS_EXIT(zfsvfs); 2932 return (SET_ERROR(EINVAL)); 2933 } 2934 2935 /* 2936 * If this is an xvattr_t, then get a pointer to the structure of 2937 * optional attributes. If this is NULL, then we have a vattr_t. 
2938 */ 2939 xoap = xva_getxoptattr(xvap); 2940 2941 xva_init(&tmpxvattr); 2942 2943 /* 2944 * Immutable files can only alter immutable bit and atime 2945 */ 2946 if ((zp->z_pflags & ZFS_IMMUTABLE) && 2947 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || 2948 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 2949 ZFS_EXIT(zfsvfs); 2950 return (SET_ERROR(EPERM)); 2951 } 2952 2953 /* 2954 * Note: ZFS_READONLY is handled in zfs_zaccess_common. 2955 */ 2956 2957 /* 2958 * Verify timestamps doesn't overflow 32 bits. 2959 * ZFS can handle large timestamps, but 32bit syscalls can't 2960 * handle times greater than 2039. This check should be removed 2961 * once large timestamps are fully supported. 2962 */ 2963 if (mask & (AT_ATIME | AT_MTIME)) { 2964 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2965 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2966 ZFS_EXIT(zfsvfs); 2967 return (SET_ERROR(EOVERFLOW)); 2968 } 2969 } 2970 if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && 2971 TIMESPEC_OVERFLOW(&vap->va_birthtime)) { 2972 ZFS_EXIT(zfsvfs); 2973 return (SET_ERROR(EOVERFLOW)); 2974 } 2975 2976 attrzp = NULL; 2977 aclp = NULL; 2978 2979 /* Can this be moved to before the top label? */ 2980 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 2981 ZFS_EXIT(zfsvfs); 2982 return (SET_ERROR(EROFS)); 2983 } 2984 2985 /* 2986 * First validate permissions 2987 */ 2988 2989 if (mask & AT_SIZE) { 2990 /* 2991 * XXX - Note, we are not providing any open 2992 * mode flags here (like FNDELAY), so we may 2993 * block if there are locks present... this 2994 * should be addressed in openat(). 2995 */ 2996 /* XXX - would it be OK to generate a log record here? 
*/ 2997 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2998 if (err) { 2999 ZFS_EXIT(zfsvfs); 3000 return (err); 3001 } 3002 } 3003 3004 if (mask & (AT_ATIME|AT_MTIME) || 3005 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 3006 XVA_ISSET_REQ(xvap, XAT_READONLY) || 3007 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 3008 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 3009 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 3010 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 3011 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 3012 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 3013 skipaclchk, cr); 3014 } 3015 3016 if (mask & (AT_UID|AT_GID)) { 3017 int idmask = (mask & (AT_UID|AT_GID)); 3018 int take_owner; 3019 int take_group; 3020 3021 /* 3022 * NOTE: even if a new mode is being set, 3023 * we may clear S_ISUID/S_ISGID bits. 3024 */ 3025 3026 if (!(mask & AT_MODE)) 3027 vap->va_mode = zp->z_mode; 3028 3029 /* 3030 * Take ownership or chgrp to group we are a member of 3031 */ 3032 3033 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 3034 take_group = (mask & AT_GID) && 3035 zfs_groupmember(zfsvfs, vap->va_gid, cr); 3036 3037 /* 3038 * If both AT_UID and AT_GID are set then take_owner and 3039 * take_group must both be set in order to allow taking 3040 * ownership. 
3041 * 3042 * Otherwise, send the check through secpolicy_vnode_setattr() 3043 * 3044 */ 3045 3046 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 3047 ((idmask == AT_UID) && take_owner) || 3048 ((idmask == AT_GID) && take_group)) { 3049 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 3050 skipaclchk, cr) == 0) { 3051 /* 3052 * Remove setuid/setgid for non-privileged users 3053 */ 3054 secpolicy_setid_clear(vap, vp, cr); 3055 trim_mask = (mask & (AT_UID|AT_GID)); 3056 } else { 3057 need_policy = TRUE; 3058 } 3059 } else { 3060 need_policy = TRUE; 3061 } 3062 } 3063 3064 oldva.va_mode = zp->z_mode; 3065 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 3066 if (mask & AT_XVATTR) { 3067 /* 3068 * Update xvattr mask to include only those attributes 3069 * that are actually changing. 3070 * 3071 * the bits will be restored prior to actually setting 3072 * the attributes so the caller thinks they were set. 3073 */ 3074 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 3075 if (xoap->xoa_appendonly != 3076 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 3077 need_policy = TRUE; 3078 } else { 3079 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 3080 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); 3081 } 3082 } 3083 3084 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 3085 if (xoap->xoa_nounlink != 3086 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 3087 need_policy = TRUE; 3088 } else { 3089 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 3090 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); 3091 } 3092 } 3093 3094 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 3095 if (xoap->xoa_immutable != 3096 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 3097 need_policy = TRUE; 3098 } else { 3099 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 3100 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); 3101 } 3102 } 3103 3104 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 3105 if (xoap->xoa_nodump != 3106 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 3107 need_policy = TRUE; 3108 } else { 3109 XVA_CLR_REQ(xvap, XAT_NODUMP); 3110 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); 3111 } 3112 } 3113 3114 if 
(XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 3115 if (xoap->xoa_av_modified != 3116 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 3117 need_policy = TRUE; 3118 } else { 3119 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 3120 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); 3121 } 3122 } 3123 3124 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 3125 if ((vp->v_type != VREG && 3126 xoap->xoa_av_quarantined) || 3127 xoap->xoa_av_quarantined != 3128 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 3129 need_policy = TRUE; 3130 } else { 3131 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 3132 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); 3133 } 3134 } 3135 3136 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 3137 ZFS_EXIT(zfsvfs); 3138 return (SET_ERROR(EPERM)); 3139 } 3140 3141 if (need_policy == FALSE && 3142 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 3143 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 3144 need_policy = TRUE; 3145 } 3146 } 3147 3148 if (mask & AT_MODE) { 3149 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 3150 err = secpolicy_setid_setsticky_clear(vp, vap, 3151 &oldva, cr); 3152 if (err) { 3153 ZFS_EXIT(zfsvfs); 3154 return (err); 3155 } 3156 trim_mask |= AT_MODE; 3157 } else { 3158 need_policy = TRUE; 3159 } 3160 } 3161 3162 if (need_policy) { 3163 /* 3164 * If trim_mask is set then take ownership 3165 * has been granted or write_acl is present and user 3166 * has the ability to modify mode. In that case remove 3167 * UID|GID and or MODE from mask so that 3168 * secpolicy_vnode_setattr() doesn't revoke it. 3169 */ 3170 3171 if (trim_mask) { 3172 saved_mask = vap->va_mask; 3173 vap->va_mask &= ~trim_mask; 3174 if (trim_mask & AT_MODE) { 3175 /* 3176 * Save the mode, as secpolicy_vnode_setattr() 3177 * will overwrite it with ova.va_mode. 
3178 */ 3179 saved_mode = vap->va_mode; 3180 } 3181 } 3182 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 3183 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 3184 if (err) { 3185 ZFS_EXIT(zfsvfs); 3186 return (err); 3187 } 3188 3189 if (trim_mask) { 3190 vap->va_mask |= saved_mask; 3191 if (trim_mask & AT_MODE) { 3192 /* 3193 * Recover the mode after 3194 * secpolicy_vnode_setattr(). 3195 */ 3196 vap->va_mode = saved_mode; 3197 } 3198 } 3199 } 3200 3201 /* 3202 * secpolicy_vnode_setattr, or take ownership may have 3203 * changed va_mask 3204 */ 3205 mask = vap->va_mask; 3206 3207 if ((mask & (AT_UID | AT_GID))) { 3208 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 3209 &xattr_obj, sizeof (xattr_obj)); 3210 3211 if (err == 0 && xattr_obj) { 3212 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); 3213 if (err == 0) { 3214 err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); 3215 if (err != 0) 3216 vrele(ZTOV(attrzp)); 3217 } 3218 if (err) 3219 goto out2; 3220 } 3221 if (mask & AT_UID) { 3222 new_uid = zfs_fuid_create(zfsvfs, 3223 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 3224 if (new_uid != zp->z_uid && 3225 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { 3226 if (attrzp) 3227 vput(ZTOV(attrzp)); 3228 err = SET_ERROR(EDQUOT); 3229 goto out2; 3230 } 3231 } 3232 3233 if (mask & AT_GID) { 3234 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, 3235 cr, ZFS_GROUP, &fuidp); 3236 if (new_gid != zp->z_gid && 3237 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { 3238 if (attrzp) 3239 vput(ZTOV(attrzp)); 3240 err = SET_ERROR(EDQUOT); 3241 goto out2; 3242 } 3243 } 3244 } 3245 tx = dmu_tx_create(zfsvfs->z_os); 3246 3247 if (mask & AT_MODE) { 3248 uint64_t pmode = zp->z_mode; 3249 uint64_t acl_obj; 3250 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 3251 3252 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && 3253 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 3254 err = SET_ERROR(EPERM); 3255 goto out; 3256 } 3257 3258 if (err = zfs_acl_chmod_setattr(zp, 
&aclp, new_mode)) 3259 goto out; 3260 3261 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 3262 /* 3263 * Are we upgrading ACL from old V0 format 3264 * to V1 format? 3265 */ 3266 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 3267 zfs_znode_acl_version(zp) == 3268 ZFS_ACL_VERSION_INITIAL) { 3269 dmu_tx_hold_free(tx, acl_obj, 0, 3270 DMU_OBJECT_END); 3271 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3272 0, aclp->z_acl_bytes); 3273 } else { 3274 dmu_tx_hold_write(tx, acl_obj, 0, 3275 aclp->z_acl_bytes); 3276 } 3277 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3278 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3279 0, aclp->z_acl_bytes); 3280 } 3281 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3282 } else { 3283 if ((mask & AT_XVATTR) && 3284 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3285 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3286 else 3287 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3288 } 3289 3290 if (attrzp) { 3291 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 3292 } 3293 3294 fuid_dirtied = zfsvfs->z_fuid_dirty; 3295 if (fuid_dirtied) 3296 zfs_fuid_txhold(zfsvfs, tx); 3297 3298 zfs_sa_upgrade_txholds(tx, zp); 3299 3300 err = dmu_tx_assign(tx, TXG_WAIT); 3301 if (err) 3302 goto out; 3303 3304 count = 0; 3305 /* 3306 * Set each attribute requested. 3307 * We group settings according to the locks they need to acquire. 3308 * 3309 * Note: you cannot set ctime directly, although it will be 3310 * updated as a side-effect of calling this function. 
3311 */ 3312 3313 if (mask & (AT_UID|AT_GID|AT_MODE)) 3314 mutex_enter(&zp->z_acl_lock); 3315 3316 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 3317 &zp->z_pflags, sizeof (zp->z_pflags)); 3318 3319 if (attrzp) { 3320 if (mask & (AT_UID|AT_GID|AT_MODE)) 3321 mutex_enter(&attrzp->z_acl_lock); 3322 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3323 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 3324 sizeof (attrzp->z_pflags)); 3325 } 3326 3327 if (mask & (AT_UID|AT_GID)) { 3328 3329 if (mask & AT_UID) { 3330 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 3331 &new_uid, sizeof (new_uid)); 3332 zp->z_uid = new_uid; 3333 if (attrzp) { 3334 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3335 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 3336 sizeof (new_uid)); 3337 attrzp->z_uid = new_uid; 3338 } 3339 } 3340 3341 if (mask & AT_GID) { 3342 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 3343 NULL, &new_gid, sizeof (new_gid)); 3344 zp->z_gid = new_gid; 3345 if (attrzp) { 3346 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3347 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 3348 sizeof (new_gid)); 3349 attrzp->z_gid = new_gid; 3350 } 3351 } 3352 if (!(mask & AT_MODE)) { 3353 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 3354 NULL, &new_mode, sizeof (new_mode)); 3355 new_mode = zp->z_mode; 3356 } 3357 err = zfs_acl_chown_setattr(zp); 3358 ASSERT(err == 0); 3359 if (attrzp) { 3360 err = zfs_acl_chown_setattr(attrzp); 3361 ASSERT(err == 0); 3362 } 3363 } 3364 3365 if (mask & AT_MODE) { 3366 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 3367 &new_mode, sizeof (new_mode)); 3368 zp->z_mode = new_mode; 3369 ASSERT3U((uintptr_t)aclp, !=, 0); 3370 err = zfs_aclset_common(zp, aclp, cr, tx); 3371 ASSERT0(err); 3372 if (zp->z_acl_cached) 3373 zfs_acl_free(zp->z_acl_cached); 3374 zp->z_acl_cached = aclp; 3375 aclp = NULL; 3376 } 3377 3378 3379 if (mask & AT_ATIME) { 3380 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); 3381 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 3382 
&zp->z_atime, sizeof (zp->z_atime)); 3383 } 3384 3385 if (mask & AT_MTIME) { 3386 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 3387 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 3388 mtime, sizeof (mtime)); 3389 } 3390 3391 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ 3392 if (mask & AT_SIZE && !(mask & AT_MTIME)) { 3393 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 3394 NULL, mtime, sizeof (mtime)); 3395 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3396 &ctime, sizeof (ctime)); 3397 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 3398 B_TRUE); 3399 } else if (mask != 0) { 3400 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3401 &ctime, sizeof (ctime)); 3402 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, 3403 B_TRUE); 3404 if (attrzp) { 3405 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3406 SA_ZPL_CTIME(zfsvfs), NULL, 3407 &ctime, sizeof (ctime)); 3408 zfs_tstamp_update_setup(attrzp, STATE_CHANGED, 3409 mtime, ctime, B_TRUE); 3410 } 3411 } 3412 /* 3413 * Do this after setting timestamps to prevent timestamp 3414 * update from toggling bit 3415 */ 3416 3417 if (xoap && (mask & AT_XVATTR)) { 3418 3419 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) 3420 xoap->xoa_createtime = vap->va_birthtime; 3421 /* 3422 * restore trimmed off masks 3423 * so that return masks can be set for caller. 
3424 */ 3425 3426 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { 3427 XVA_SET_REQ(xvap, XAT_APPENDONLY); 3428 } 3429 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { 3430 XVA_SET_REQ(xvap, XAT_NOUNLINK); 3431 } 3432 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { 3433 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 3434 } 3435 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { 3436 XVA_SET_REQ(xvap, XAT_NODUMP); 3437 } 3438 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { 3439 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 3440 } 3441 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { 3442 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 3443 } 3444 3445 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3446 ASSERT(vp->v_type == VREG); 3447 3448 zfs_xvattr_set(zp, xvap, tx); 3449 } 3450 3451 if (fuid_dirtied) 3452 zfs_fuid_sync(zfsvfs, tx); 3453 3454 if (mask != 0) 3455 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 3456 3457 if (mask & (AT_UID|AT_GID|AT_MODE)) 3458 mutex_exit(&zp->z_acl_lock); 3459 3460 if (attrzp) { 3461 if (mask & (AT_UID|AT_GID|AT_MODE)) 3462 mutex_exit(&attrzp->z_acl_lock); 3463 } 3464out: 3465 if (err == 0 && attrzp) { 3466 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 3467 xattr_count, tx); 3468 ASSERT(err2 == 0); 3469 } 3470 3471 if (attrzp) 3472 vput(ZTOV(attrzp)); 3473 3474 if (aclp) 3475 zfs_acl_free(aclp); 3476 3477 if (fuidp) { 3478 zfs_fuid_info_free(fuidp); 3479 fuidp = NULL; 3480 } 3481 3482 if (err) { 3483 dmu_tx_abort(tx); 3484 } else { 3485 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 3486 dmu_tx_commit(tx); 3487 } 3488 3489out2: 3490 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3491 zil_commit(zilog, 0); 3492 3493 ZFS_EXIT(zfsvfs); 3494 return (err); 3495} 3496 3497/* 3498 * We acquire all but fdvp locks using non-blocking acquisitions. If we 3499 * fail to acquire any lock in the path we will drop all held locks, 3500 * acquire the new lock in a blocking fashion, and then release it and 3501 * restart the rename. 
This acquire/release step ensures that we do not 3502 * spin on a lock waiting for release. On error release all vnode locks 3503 * and decrement references the way tmpfs_rename() would do. 3504 */ 3505static int 3506zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, 3507 struct vnode *tdvp, struct vnode **tvpp, 3508 const struct componentname *scnp, const struct componentname *tcnp) 3509{ 3510 zfsvfs_t *zfsvfs; 3511 struct vnode *nvp, *svp, *tvp; 3512 znode_t *sdzp, *tdzp, *szp, *tzp; 3513 const char *snm = scnp->cn_nameptr; 3514 const char *tnm = tcnp->cn_nameptr; 3515 int error; 3516 3517 VOP_UNLOCK(tdvp, 0); 3518 if (*tvpp != NULL && *tvpp != tdvp) 3519 VOP_UNLOCK(*tvpp, 0); 3520 3521relock: 3522 error = vn_lock(sdvp, LK_EXCLUSIVE); 3523 if (error) 3524 goto out; 3525 sdzp = VTOZ(sdvp); 3526 3527 error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); 3528 if (error != 0) { 3529 VOP_UNLOCK(sdvp, 0); 3530 if (error != EBUSY) 3531 goto out; 3532 error = vn_lock(tdvp, LK_EXCLUSIVE); 3533 if (error) 3534 goto out; 3535 VOP_UNLOCK(tdvp, 0); 3536 goto relock; 3537 } 3538 tdzp = VTOZ(tdvp); 3539 3540 /* 3541 * Before using sdzp and tdzp we must ensure that they are live. 3542 * As a porting legacy from illumos we have two things to worry 3543 * about. One is typical for FreeBSD and it is that the vnode is 3544 * not reclaimed (doomed). The other is that the znode is live. 3545 * The current code can invalidate the znode without acquiring the 3546 * corresponding vnode lock if the object represented by the znode 3547 * and vnode is no longer valid after a rollback or receive operation. 3548 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock 3549 * that protects the znodes from the invalidation. 3550 */ 3551 zfsvfs = sdzp->z_zfsvfs; 3552 ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); 3553 ZFS_ENTER(zfsvfs); 3554 3555 /* 3556 * We can not use ZFS_VERIFY_ZP() here because it could directly return 3557 * bypassing the cleanup code in the case of an error. 
3558 */ 3559 if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { 3560 ZFS_EXIT(zfsvfs); 3561 VOP_UNLOCK(sdvp, 0); 3562 VOP_UNLOCK(tdvp, 0); 3563 error = SET_ERROR(EIO); 3564 goto out; 3565 } 3566 3567 /* 3568 * Re-resolve svp to be certain it still exists and fetch the 3569 * correct vnode. 3570 */ 3571 error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); 3572 if (error != 0) { 3573 /* Source entry invalid or not there. */ 3574 ZFS_EXIT(zfsvfs); 3575 VOP_UNLOCK(sdvp, 0); 3576 VOP_UNLOCK(tdvp, 0); 3577 if ((scnp->cn_flags & ISDOTDOT) != 0 || 3578 (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) 3579 error = SET_ERROR(EINVAL); 3580 goto out; 3581 } 3582 svp = ZTOV(szp); 3583 3584 /* 3585 * Re-resolve tvp, if it disappeared we just carry on. 3586 */ 3587 error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); 3588 if (error != 0) { 3589 ZFS_EXIT(zfsvfs); 3590 VOP_UNLOCK(sdvp, 0); 3591 VOP_UNLOCK(tdvp, 0); 3592 vrele(svp); 3593 if ((tcnp->cn_flags & ISDOTDOT) != 0) 3594 error = SET_ERROR(EINVAL); 3595 goto out; 3596 } 3597 if (tzp != NULL) 3598 tvp = ZTOV(tzp); 3599 else 3600 tvp = NULL; 3601 3602 /* 3603 * At present the vnode locks must be acquired before z_teardown_lock, 3604 * although it would be more logical to use the opposite order. 3605 */ 3606 ZFS_EXIT(zfsvfs); 3607 3608 /* 3609 * Now try acquire locks on svp and tvp. 3610 */ 3611 nvp = svp; 3612 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); 3613 if (error != 0) { 3614 VOP_UNLOCK(sdvp, 0); 3615 VOP_UNLOCK(tdvp, 0); 3616 if (tvp != NULL) 3617 vrele(tvp); 3618 if (error != EBUSY) { 3619 vrele(nvp); 3620 goto out; 3621 } 3622 error = vn_lock(nvp, LK_EXCLUSIVE); 3623 if (error != 0) { 3624 vrele(nvp); 3625 goto out; 3626 } 3627 VOP_UNLOCK(nvp, 0); 3628 /* 3629 * Concurrent rename race. 3630 * XXX ? 
3631 */ 3632 if (nvp == tdvp) { 3633 vrele(nvp); 3634 error = SET_ERROR(EINVAL); 3635 goto out; 3636 } 3637 vrele(*svpp); 3638 *svpp = nvp; 3639 goto relock; 3640 } 3641 vrele(*svpp); 3642 *svpp = nvp; 3643 3644 if (*tvpp != NULL) 3645 vrele(*tvpp); 3646 *tvpp = NULL; 3647 if (tvp != NULL) { 3648 nvp = tvp; 3649 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); 3650 if (error != 0) { 3651 VOP_UNLOCK(sdvp, 0); 3652 VOP_UNLOCK(tdvp, 0); 3653 VOP_UNLOCK(*svpp, 0); 3654 if (error != EBUSY) { 3655 vrele(nvp); 3656 goto out; 3657 } 3658 error = vn_lock(nvp, LK_EXCLUSIVE); 3659 if (error != 0) { 3660 vrele(nvp); 3661 goto out; 3662 } 3663 vput(nvp); 3664 goto relock; 3665 } 3666 *tvpp = nvp; 3667 } 3668 3669 return (0); 3670 3671out: 3672 return (error); 3673} 3674 3675/* 3676 * Note that we must use VRELE_ASYNC in this function as it walks 3677 * up the directory tree and vrele may need to acquire an exclusive 3678 * lock if a last reference to a vnode is dropped. 3679 */ 3680static int 3681zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) 3682{ 3683 zfsvfs_t *zfsvfs; 3684 znode_t *zp, *zp1; 3685 uint64_t parent; 3686 int error; 3687 3688 zfsvfs = tdzp->z_zfsvfs; 3689 if (tdzp == szp) 3690 return (SET_ERROR(EINVAL)); 3691 if (tdzp == sdzp) 3692 return (0); 3693 if (tdzp->z_id == zfsvfs->z_root) 3694 return (0); 3695 zp = tdzp; 3696 for (;;) { 3697 ASSERT(!zp->z_unlinked); 3698 if ((error = sa_lookup(zp->z_sa_hdl, 3699 SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) 3700 break; 3701 3702 if (parent == szp->z_id) { 3703 error = SET_ERROR(EINVAL); 3704 break; 3705 } 3706 if (parent == zfsvfs->z_root) 3707 break; 3708 if (parent == sdzp->z_id) 3709 break; 3710 3711 error = zfs_zget(zfsvfs, parent, &zp1); 3712 if (error != 0) 3713 break; 3714 3715 if (zp != tdzp) 3716 VN_RELE_ASYNC(ZTOV(zp), 3717 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); 3718 zp = zp1; 3719 } 3720 3721 if (error == ENOTDIR) 3722 panic("checkpath: .. 
not a directory\n"); 3723 if (zp != tdzp) 3724 VN_RELE_ASYNC(ZTOV(zp), 3725 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); 3726 return (error); 3727} 3728 3729/* 3730 * Move an entry from the provided source directory to the target 3731 * directory. Change the entry name as indicated. 3732 * 3733 * IN: sdvp - Source directory containing the "old entry". 3734 * snm - Old entry name. 3735 * tdvp - Target directory to contain the "new entry". 3736 * tnm - New entry name. 3737 * cr - credentials of caller. 3738 * ct - caller context 3739 * flags - case flags 3740 * 3741 * RETURN: 0 on success, error code on failure. 3742 * 3743 * Timestamps: 3744 * sdvp,tdvp - ctime|mtime updated 3745 */ 3746/*ARGSUSED*/ 3747static int 3748zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, 3749 vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, 3750 cred_t *cr) 3751{ 3752 zfsvfs_t *zfsvfs; 3753 znode_t *sdzp, *tdzp, *szp, *tzp; 3754 zilog_t *zilog = NULL; 3755 dmu_tx_t *tx; 3756 char *snm = scnp->cn_nameptr; 3757 char *tnm = tcnp->cn_nameptr; 3758 int error = 0; 3759 3760 /* Reject renames across filesystems. */ 3761 if ((*svpp)->v_mount != tdvp->v_mount || 3762 ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { 3763 error = SET_ERROR(EXDEV); 3764 goto out; 3765 } 3766 3767 if (zfsctl_is_node(tdvp)) { 3768 error = SET_ERROR(EXDEV); 3769 goto out; 3770 } 3771 3772 /* 3773 * Lock all four vnodes to ensure safety and semantics of renaming. 3774 */ 3775 error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); 3776 if (error != 0) { 3777 /* no vnodes are locked in the case of error here */ 3778 return (error); 3779 } 3780 3781 tdzp = VTOZ(tdvp); 3782 sdzp = VTOZ(sdvp); 3783 zfsvfs = tdzp->z_zfsvfs; 3784 zilog = zfsvfs->z_log; 3785 3786 /* 3787 * After we re-enter ZFS_ENTER() we will have to revalidate all 3788 * znodes involved. 
3789 */ 3790 ZFS_ENTER(zfsvfs); 3791 3792 if (zfsvfs->z_utf8 && u8_validate(tnm, 3793 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3794 error = SET_ERROR(EILSEQ); 3795 goto unlockout; 3796 } 3797 3798 /* If source and target are the same file, there is nothing to do. */ 3799 if ((*svpp) == (*tvpp)) { 3800 error = 0; 3801 goto unlockout; 3802 } 3803 3804 if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || 3805 ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && 3806 (*tvpp)->v_mountedhere != NULL)) { 3807 error = SET_ERROR(EXDEV); 3808 goto unlockout; 3809 } 3810 3811 /* 3812 * We can not use ZFS_VERIFY_ZP() here because it could directly return 3813 * bypassing the cleanup code in the case of an error. 3814 */ 3815 if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { 3816 error = SET_ERROR(EIO); 3817 goto unlockout; 3818 } 3819 3820 szp = VTOZ(*svpp); 3821 tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); 3822 if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { 3823 error = SET_ERROR(EIO); 3824 goto unlockout; 3825 } 3826 3827 /* 3828 * This is to prevent the creation of links into attribute space 3829 * by renaming a linked file into/outof an attribute directory. 3830 * See the comment in zfs_link() for why this is considered bad. 3831 */ 3832 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 3833 error = SET_ERROR(EINVAL); 3834 goto unlockout; 3835 } 3836 3837 /* 3838 * Must have write access at the source to remove the old entry 3839 * and write access at the target to create the new entry. 3840 * Note that if target and source are the same, this can be 3841 * done in a single check. 3842 */ 3843 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 3844 goto unlockout; 3845 3846 if ((*svpp)->v_type == VDIR) { 3847 /* 3848 * Avoid ".", "..", and aliases of "." for obvious reasons. 
3849 */ 3850 if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || 3851 sdzp == szp || 3852 (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { 3853 error = EINVAL; 3854 goto unlockout; 3855 } 3856 3857 /* 3858 * Check to make sure rename is valid. 3859 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 3860 */ 3861 if (error = zfs_rename_check(szp, sdzp, tdzp)) 3862 goto unlockout; 3863 } 3864 3865 /* 3866 * Does target exist? 3867 */ 3868 if (tzp) { 3869 /* 3870 * Source and target must be the same type. 3871 */ 3872 if ((*svpp)->v_type == VDIR) { 3873 if ((*tvpp)->v_type != VDIR) { 3874 error = SET_ERROR(ENOTDIR); 3875 goto unlockout; 3876 } else { 3877 cache_purge(tdvp); 3878 if (sdvp != tdvp) 3879 cache_purge(sdvp); 3880 } 3881 } else { 3882 if ((*tvpp)->v_type == VDIR) { 3883 error = SET_ERROR(EISDIR); 3884 goto unlockout; 3885 } 3886 } 3887 } 3888 3889 vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); 3890 if (tzp) 3891 vnevent_rename_dest(*tvpp, tdvp, tnm, ct); 3892 3893 /* 3894 * notify the target directory if it is not the same 3895 * as source directory. 
3896 */ 3897 if (tdvp != sdvp) { 3898 vnevent_rename_dest_dir(tdvp, ct); 3899 } 3900 3901 tx = dmu_tx_create(zfsvfs->z_os); 3902 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 3903 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3904 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 3905 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 3906 if (sdzp != tdzp) { 3907 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 3908 zfs_sa_upgrade_txholds(tx, tdzp); 3909 } 3910 if (tzp) { 3911 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 3912 zfs_sa_upgrade_txholds(tx, tzp); 3913 } 3914 3915 zfs_sa_upgrade_txholds(tx, szp); 3916 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3917 error = dmu_tx_assign(tx, TXG_WAIT); 3918 if (error) { 3919 dmu_tx_abort(tx); 3920 goto unlockout; 3921 } 3922 3923 3924 if (tzp) /* Attempt to remove the existing target */ 3925 error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); 3926 3927 if (error == 0) { 3928 error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); 3929 if (error == 0) { 3930 szp->z_pflags |= ZFS_AV_MODIFIED; 3931 3932 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3933 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3934 ASSERT0(error); 3935 3936 error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, 3937 NULL); 3938 if (error == 0) { 3939 zfs_log_rename(zilog, tx, TX_RENAME, sdzp, 3940 snm, tdzp, tnm, szp); 3941 3942 /* 3943 * Update path information for the target vnode 3944 */ 3945 vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); 3946 } else { 3947 /* 3948 * At this point, we have successfully created 3949 * the target name, but have failed to remove 3950 * the source name. Since the create was done 3951 * with the ZRENAMING flag, there are 3952 * complications; for one, the link count is 3953 * wrong. The easiest way to deal with this 3954 * is to remove the newly created target, and 3955 * return the original error. This must 3956 * succeed; fortunately, it is very unlikely to 3957 * fail, since we just created it. 
3958 */ 3959 VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, 3960 ZRENAMING, NULL), ==, 0); 3961 } 3962 } 3963 if (error == 0) { 3964 cache_purge(*svpp); 3965 if (*tvpp != NULL) 3966 cache_purge(*tvpp); 3967 cache_purge_negative(tdvp); 3968 } 3969 } 3970 3971 dmu_tx_commit(tx); 3972 3973unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ 3974 ZFS_EXIT(zfsvfs); 3975 VOP_UNLOCK(*svpp, 0); 3976 VOP_UNLOCK(sdvp, 0); 3977 3978out: /* original two vnodes are locked */ 3979 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3980 zil_commit(zilog, 0); 3981 3982 if (*tvpp != NULL) 3983 VOP_UNLOCK(*tvpp, 0); 3984 if (tdvp != *tvpp) 3985 VOP_UNLOCK(tdvp, 0); 3986 return (error); 3987} 3988 3989/* 3990 * Insert the indicated symbolic reference entry into the directory. 3991 * 3992 * IN: dvp - Directory to contain new symbolic link. 3993 * link - Name for new symlink entry. 3994 * vap - Attributes of new entry. 3995 * cr - credentials of caller. 3996 * ct - caller context 3997 * flags - case flags 3998 * 3999 * RETURN: 0 on success, error code on failure. 
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
    cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	len = strlen(link);
	int		error;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	uint64_t	txtype = TX_SYMLINK;
	int		flags = 0;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Reject names that are not valid UTF-8 when the fs requires it. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENAMETOOLONG));
	}

	/* Pre-compute ACL/FUID ids for the new object before the tx. */
	if ((error = zfs_acl_ids_create(dzp, 0,
	    vap, cr, NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/* Reserve a vnode before entering the transaction. */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	/* MAX(1, len) because a zero-length target still needs a write hold. */
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * for version 4 ZPL datsets the symlink will be an SA attribute
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);

	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);

	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
	*vpp = ZTOV(zp);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 *	IN:	vp	- vnode of symbolic link.
 *		uio	- structure to contain the link path.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- structure containing the link path.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* SA-based symlinks store the target as an attribute. */
	if (zp->z_is_sa)
		error = sa_lookup_uio(zp->z_sa_hdl,
		    SA_ZPL_SYMLINK(zfsvfs), uio);
	else
		error = zfs_sa_readlink(zp, uio);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 *	IN:	tdvp	- Directory to contain new entry.
 *		svp	- vnode of new entry.
 *		name	- name of new entry.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	RETURN:	0 on success, error code on failure.
4176 * 4177 * Timestamps: 4178 * tdvp - ctime|mtime updated 4179 * svp - ctime updated 4180 */ 4181/* ARGSUSED */ 4182static int 4183zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 4184 caller_context_t *ct, int flags) 4185{ 4186 znode_t *dzp = VTOZ(tdvp); 4187 znode_t *tzp, *szp; 4188 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4189 zilog_t *zilog; 4190 dmu_tx_t *tx; 4191 int error; 4192 uint64_t parent; 4193 uid_t owner; 4194 4195 ASSERT(tdvp->v_type == VDIR); 4196 4197 ZFS_ENTER(zfsvfs); 4198 ZFS_VERIFY_ZP(dzp); 4199 zilog = zfsvfs->z_log; 4200 4201 /* 4202 * POSIX dictates that we return EPERM here. 4203 * Better choices include ENOTSUP or EISDIR. 4204 */ 4205 if (svp->v_type == VDIR) { 4206 ZFS_EXIT(zfsvfs); 4207 return (SET_ERROR(EPERM)); 4208 } 4209 4210 szp = VTOZ(svp); 4211 ZFS_VERIFY_ZP(szp); 4212 4213 if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { 4214 ZFS_EXIT(zfsvfs); 4215 return (SET_ERROR(EPERM)); 4216 } 4217 4218 /* Prevent links to .zfs/shares files */ 4219 4220 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 4221 &parent, sizeof (uint64_t))) != 0) { 4222 ZFS_EXIT(zfsvfs); 4223 return (error); 4224 } 4225 if (parent == zfsvfs->z_shares_dir) { 4226 ZFS_EXIT(zfsvfs); 4227 return (SET_ERROR(EPERM)); 4228 } 4229 4230 if (zfsvfs->z_utf8 && u8_validate(name, 4231 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4232 ZFS_EXIT(zfsvfs); 4233 return (SET_ERROR(EILSEQ)); 4234 } 4235 4236 /* 4237 * We do not support links between attributes and non-attributes 4238 * because of the potential security risk of creating links 4239 * into "normal" file space in order to circumvent restrictions 4240 * imposed in attribute space. 
4241 */ 4242 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { 4243 ZFS_EXIT(zfsvfs); 4244 return (SET_ERROR(EINVAL)); 4245 } 4246 4247 4248 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); 4249 if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { 4250 ZFS_EXIT(zfsvfs); 4251 return (SET_ERROR(EPERM)); 4252 } 4253 4254 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4255 ZFS_EXIT(zfsvfs); 4256 return (error); 4257 } 4258 4259 /* 4260 * Attempt to lock directory; fail if entry already exists. 4261 */ 4262 error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); 4263 if (error) { 4264 ZFS_EXIT(zfsvfs); 4265 return (error); 4266 } 4267 4268 tx = dmu_tx_create(zfsvfs->z_os); 4269 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4270 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4271 zfs_sa_upgrade_txholds(tx, szp); 4272 zfs_sa_upgrade_txholds(tx, dzp); 4273 error = dmu_tx_assign(tx, TXG_WAIT); 4274 if (error) { 4275 dmu_tx_abort(tx); 4276 ZFS_EXIT(zfsvfs); 4277 return (error); 4278 } 4279 4280 error = zfs_link_create(dzp, name, szp, tx, 0); 4281 4282 if (error == 0) { 4283 uint64_t txtype = TX_LINK; 4284 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 4285 } 4286 4287 dmu_tx_commit(tx); 4288 4289 if (error == 0) { 4290 vnevent_link(svp, ct); 4291 } 4292 4293 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4294 zil_commit(zilog, 0); 4295 4296 ZFS_EXIT(zfsvfs); 4297 return (error); 4298} 4299 4300 4301/*ARGSUSED*/ 4302void 4303zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4304{ 4305 znode_t *zp = VTOZ(vp); 4306 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4307 int error; 4308 4309 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4310 if (zp->z_sa_hdl == NULL) { 4311 /* 4312 * The fs has been unmounted, or we did a 4313 * suspend/resume and this file no longer exists. 
4314 */ 4315 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4316 vrecycle(vp); 4317 return; 4318 } 4319 4320 if (zp->z_unlinked) { 4321 /* 4322 * Fast path to recycle a vnode of a removed file. 4323 */ 4324 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4325 vrecycle(vp); 4326 return; 4327 } 4328 4329 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 4330 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 4331 4332 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4333 zfs_sa_upgrade_txholds(tx, zp); 4334 error = dmu_tx_assign(tx, TXG_WAIT); 4335 if (error) { 4336 dmu_tx_abort(tx); 4337 } else { 4338 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 4339 (void *)&zp->z_atime, sizeof (zp->z_atime), tx); 4340 zp->z_atime_dirty = 0; 4341 dmu_tx_commit(tx); 4342 } 4343 } 4344 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4345} 4346 4347 4348CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); 4349CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); 4350 4351/*ARGSUSED*/ 4352static int 4353zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 4354{ 4355 znode_t *zp = VTOZ(vp); 4356 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4357 uint32_t gen; 4358 uint64_t gen64; 4359 uint64_t object = zp->z_id; 4360 zfid_short_t *zfid; 4361 int size, i, error; 4362 4363 ZFS_ENTER(zfsvfs); 4364 ZFS_VERIFY_ZP(zp); 4365 4366 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4367 &gen64, sizeof (uint64_t))) != 0) { 4368 ZFS_EXIT(zfsvfs); 4369 return (error); 4370 } 4371 4372 gen = (uint32_t)gen64; 4373 4374 size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; 4375 4376#ifdef illumos 4377 if (fidp->fid_len < size) { 4378 fidp->fid_len = size; 4379 ZFS_EXIT(zfsvfs); 4380 return (SET_ERROR(ENOSPC)); 4381 } 4382#else 4383 fidp->fid_len = size; 4384#endif 4385 4386 zfid = (zfid_short_t *)fidp; 4387 4388 zfid->zf_len = size; 4389 4390 for (i = 0; i < sizeof (zfid->zf_object); i++) 4391 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4392 4393 /* Must have a non-zero generation number to distinguish from .zfs */ 4394 if (gen == 0) 4395 gen = 1; 4396 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4397 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4398 4399 if (size == LONG_FID_LEN) { 4400 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 4401 zfid_long_t *zlfid; 4402 4403 zlfid = (zfid_long_t *)fidp; 4404 4405 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 4406 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 4407 4408 /* XXX - this should be the generation number for the objset */ 4409 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 4410 zlfid->zf_setgen[i] = 0; 4411 } 4412 4413 ZFS_EXIT(zfsvfs); 4414 return (0); 4415} 4416 4417static int 4418zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 4419 caller_context_t *ct) 4420{ 4421 znode_t *zp, *xzp; 4422 zfsvfs_t *zfsvfs; 4423 int error; 4424 4425 switch (cmd) { 4426 case _PC_LINK_MAX: 4427 *valp = INT_MAX; 4428 return (0); 4429 4430 case _PC_FILESIZEBITS: 4431 *valp = 64; 4432 return (0); 4433#ifdef illumos 4434 case _PC_XATTR_EXISTS: 4435 zp = VTOZ(vp); 4436 zfsvfs = zp->z_zfsvfs; 4437 ZFS_ENTER(zfsvfs); 4438 ZFS_VERIFY_ZP(zp); 4439 *valp = 0; 4440 error = zfs_dirent_lookup(zp, "", &xzp, 4441 ZXATTR | ZEXISTS | ZSHARED); 4442 if (error == 0) { 4443 if (!zfs_dirempty(xzp)) 4444 *valp = 1; 4445 vrele(ZTOV(xzp)); 4446 } else if (error == ENOENT) { 4447 /* 4448 * If there aren't extended attributes, it's the 4449 * same as having zero of them. 
4450 */ 4451 error = 0; 4452 } 4453 ZFS_EXIT(zfsvfs); 4454 return (error); 4455 4456 case _PC_SATTR_ENABLED: 4457 case _PC_SATTR_EXISTS: 4458 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 4459 (vp->v_type == VREG || vp->v_type == VDIR); 4460 return (0); 4461 4462 case _PC_ACCESS_FILTERING: 4463 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && 4464 vp->v_type == VDIR; 4465 return (0); 4466 4467 case _PC_ACL_ENABLED: 4468 *valp = _ACL_ACE_ENABLED; 4469 return (0); 4470#endif /* illumos */ 4471 case _PC_MIN_HOLE_SIZE: 4472 *valp = (int)SPA_MINBLOCKSIZE; 4473 return (0); 4474#ifdef illumos 4475 case _PC_TIMESTAMP_RESOLUTION: 4476 /* nanosecond timestamp resolution */ 4477 *valp = 1L; 4478 return (0); 4479#endif 4480 case _PC_ACL_EXTENDED: 4481 *valp = 0; 4482 return (0); 4483 4484 case _PC_ACL_NFS4: 4485 *valp = 1; 4486 return (0); 4487 4488 case _PC_ACL_PATH_MAX: 4489 *valp = ACL_MAX_ENTRIES; 4490 return (0); 4491 4492 default: 4493 return (EOPNOTSUPP); 4494 } 4495} 4496 4497/*ARGSUSED*/ 4498static int 4499zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 4500 caller_context_t *ct) 4501{ 4502 znode_t *zp = VTOZ(vp); 4503 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4504 int error; 4505 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 4506 4507 ZFS_ENTER(zfsvfs); 4508 ZFS_VERIFY_ZP(zp); 4509 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 4510 ZFS_EXIT(zfsvfs); 4511 4512 return (error); 4513} 4514 4515/*ARGSUSED*/ 4516int 4517zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 4518 caller_context_t *ct) 4519{ 4520 znode_t *zp = VTOZ(vp); 4521 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4522 int error; 4523 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; 4524 zilog_t *zilog = zfsvfs->z_log; 4525 4526 ZFS_ENTER(zfsvfs); 4527 ZFS_VERIFY_ZP(zp); 4528 4529 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 4530 4531 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4532 zil_commit(zilog, 0); 4533 4534 ZFS_EXIT(zfsvfs); 4535 return (error); 4536} 4537 4538static int 4539zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, 4540 int *rahead) 4541{ 4542 znode_t *zp = VTOZ(vp); 4543 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4544 objset_t *os = zp->z_zfsvfs->z_os; 4545 rl_t *rl; 4546 vm_object_t object; 4547 off_t start, end, obj_size; 4548 uint_t blksz; 4549 int pgsin_b, pgsin_a; 4550 int error; 4551 4552 ZFS_ENTER(zfsvfs); 4553 ZFS_VERIFY_ZP(zp); 4554 4555 start = IDX_TO_OFF(ma[0]->pindex); 4556 end = IDX_TO_OFF(ma[count - 1]->pindex + 1); 4557 4558 /* 4559 * Lock a range covering all required and optional pages. 4560 * Note that we need to handle the case of the block size growing. 4561 */ 4562 for (;;) { 4563 blksz = zp->z_blksz; 4564 rl = zfs_range_lock(zp, rounddown(start, blksz), 4565 roundup(end, blksz) - rounddown(start, blksz), RL_READER); 4566 if (blksz == zp->z_blksz) 4567 break; 4568 zfs_range_unlock(rl); 4569 } 4570 4571 object = ma[0]->object; 4572 zfs_vmobject_wlock(object); 4573 obj_size = object->un_pager.vnp.vnp_size; 4574 zfs_vmobject_wunlock(object); 4575 if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { 4576 zfs_range_unlock(rl); 4577 ZFS_EXIT(zfsvfs); 4578 return (zfs_vm_pagerret_bad); 4579 } 4580 4581 pgsin_b = 0; 4582 if (rbehind != NULL) { 4583 pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); 4584 pgsin_b = MIN(*rbehind, pgsin_b); 4585 } 4586 4587 pgsin_a = 0; 4588 if (rahead != NULL) { 4589 pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); 4590 if (end + IDX_TO_OFF(pgsin_a) >= obj_size) 4591 pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); 4592 pgsin_a = MIN(*rahead, pgsin_a); 4593 } 4594 4595 /* 4596 * NB: we need to pass the exact byte size of the data that we expect 
4597 * to read after accounting for the file size. This is required because 4598 * ZFS will panic if we request DMU to read beyond the end of the last 4599 * allocated block. 4600 */ 4601 error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, 4602 MIN(end, obj_size) - (end - PAGE_SIZE)); 4603 4604 zfs_range_unlock(rl); 4605 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4606 ZFS_EXIT(zfsvfs); 4607 4608 if (error != 0) 4609 return (zfs_vm_pagerret_error); 4610 4611 PCPU_INC(cnt.v_vnodein); 4612 PCPU_ADD(cnt.v_vnodepgsin, count + pgsin_b + pgsin_a); 4613 if (rbehind != NULL) 4614 *rbehind = pgsin_b; 4615 if (rahead != NULL) 4616 *rahead = pgsin_a; 4617 return (zfs_vm_pagerret_ok); 4618} 4619 4620static int 4621zfs_freebsd_getpages(ap) 4622 struct vop_getpages_args /* { 4623 struct vnode *a_vp; 4624 vm_page_t *a_m; 4625 int a_count; 4626 int *a_rbehind; 4627 int *a_rahead; 4628 } */ *ap; 4629{ 4630 4631 return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, 4632 ap->a_rahead)); 4633} 4634 4635static int 4636zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, 4637 int *rtvals) 4638{ 4639 znode_t *zp = VTOZ(vp); 4640 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4641 rl_t *rl; 4642 dmu_tx_t *tx; 4643 struct sf_buf *sf; 4644 vm_object_t object; 4645 vm_page_t m; 4646 caddr_t va; 4647 size_t tocopy; 4648 size_t lo_len; 4649 vm_ooffset_t lo_off; 4650 vm_ooffset_t off; 4651 uint_t blksz; 4652 int ncount; 4653 int pcount; 4654 int err; 4655 int i; 4656 4657 ZFS_ENTER(zfsvfs); 4658 ZFS_VERIFY_ZP(zp); 4659 4660 object = vp->v_object; 4661 pcount = btoc(len); 4662 ncount = pcount; 4663 4664 KASSERT(ma[0]->object == object, ("mismatching object")); 4665 KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); 4666 4667 for (i = 0; i < pcount; i++) 4668 rtvals[i] = zfs_vm_pagerret_error; 4669 4670 off = IDX_TO_OFF(ma[0]->pindex); 4671 blksz = zp->z_blksz; 4672 lo_off = rounddown(off, blksz); 4673 lo_len = roundup(len + (off - lo_off), blksz); 4674 rl = 
zfs_range_lock(zp, lo_off, lo_len, RL_WRITER); 4675 4676 zfs_vmobject_wlock(object); 4677 if (len + off > object->un_pager.vnp.vnp_size) { 4678 if (object->un_pager.vnp.vnp_size > off) { 4679 int pgoff; 4680 4681 len = object->un_pager.vnp.vnp_size - off; 4682 ncount = btoc(len); 4683 if ((pgoff = (int)len & PAGE_MASK) != 0) { 4684 /* 4685 * If the object is locked and the following 4686 * conditions hold, then the page's dirty 4687 * field cannot be concurrently changed by a 4688 * pmap operation. 4689 */ 4690 m = ma[ncount - 1]; 4691 vm_page_assert_sbusied(m); 4692 KASSERT(!pmap_page_is_write_mapped(m), 4693 ("zfs_putpages: page %p is not read-only", m)); 4694 vm_page_clear_dirty(m, pgoff, PAGE_SIZE - 4695 pgoff); 4696 } 4697 } else { 4698 len = 0; 4699 ncount = 0; 4700 } 4701 if (ncount < pcount) { 4702 for (i = ncount; i < pcount; i++) { 4703 rtvals[i] = zfs_vm_pagerret_bad; 4704 } 4705 } 4706 } 4707 zfs_vmobject_wunlock(object); 4708 4709 if (ncount == 0) 4710 goto out; 4711 4712 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 4713 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 4714 goto out; 4715 } 4716 4717 tx = dmu_tx_create(zfsvfs->z_os); 4718 dmu_tx_hold_write(tx, zp->z_id, off, len); 4719 4720 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4721 zfs_sa_upgrade_txholds(tx, zp); 4722 err = dmu_tx_assign(tx, TXG_WAIT); 4723 if (err != 0) { 4724 dmu_tx_abort(tx); 4725 goto out; 4726 } 4727 4728 if (zp->z_blksz < PAGE_SIZE) { 4729 for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { 4730 tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; 4731 va = zfs_map_page(ma[i], &sf); 4732 dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); 4733 zfs_unmap_page(sf); 4734 } 4735 } else { 4736 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); 4737 } 4738 4739 if (err == 0) { 4740 uint64_t mtime[2], ctime[2]; 4741 sa_bulk_attr_t bulk[3]; 4742 int count = 0; 4743 4744 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 4745 &mtime, 16); 4746 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 4747 &ctime, 16); 4748 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 4749 &zp->z_pflags, 8); 4750 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 4751 B_TRUE); 4752 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 4753 ASSERT0(err); 4754 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); 4755 4756 zfs_vmobject_wlock(object); 4757 for (i = 0; i < ncount; i++) { 4758 rtvals[i] = zfs_vm_pagerret_ok; 4759 vm_page_undirty(ma[i]); 4760 } 4761 zfs_vmobject_wunlock(object); 4762 PCPU_INC(cnt.v_vnodeout); 4763 PCPU_ADD(cnt.v_vnodepgsout, ncount); 4764 } 4765 dmu_tx_commit(tx); 4766 4767out: 4768 zfs_range_unlock(rl); 4769 if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || 4770 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4771 zil_commit(zfsvfs->z_log, zp->z_id); 4772 ZFS_EXIT(zfsvfs); 4773 return (rtvals[0]); 4774} 4775 4776int 4777zfs_freebsd_putpages(ap) 4778 struct vop_putpages_args /* { 4779 struct vnode *a_vp; 4780 vm_page_t *a_m; 4781 int a_count; 4782 int a_sync; 4783 int *a_rtvals; 4784 } */ *ap; 4785{ 4786 4787 return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, 4788 ap->a_rtvals)); 4789} 4790 4791static int 4792zfs_freebsd_bmap(ap) 4793 struct vop_bmap_args /* { 4794 struct vnode *a_vp; 4795 daddr_t a_bn; 4796 struct bufobj **a_bop; 4797 daddr_t *a_bnp; 4798 int *a_runp; 4799 int *a_runb; 4800 } */ *ap; 4801{ 4802 4803 if (ap->a_bop != NULL) 4804 *ap->a_bop = &ap->a_vp->v_bufobj; 4805 if (ap->a_bnp != NULL) 4806 
*ap->a_bnp = ap->a_bn; 4807 if (ap->a_runp != NULL) 4808 *ap->a_runp = 0; 4809 if (ap->a_runb != NULL) 4810 *ap->a_runb = 0; 4811 4812 return (0); 4813} 4814 4815static int 4816zfs_freebsd_open(ap) 4817 struct vop_open_args /* { 4818 struct vnode *a_vp; 4819 int a_mode; 4820 struct ucred *a_cred; 4821 struct thread *a_td; 4822 } */ *ap; 4823{ 4824 vnode_t *vp = ap->a_vp; 4825 znode_t *zp = VTOZ(vp); 4826 int error; 4827 4828 error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); 4829 if (error == 0) 4830 vnode_create_vobject(vp, zp->z_size, ap->a_td); 4831 return (error); 4832} 4833 4834static int 4835zfs_freebsd_close(ap) 4836 struct vop_close_args /* { 4837 struct vnode *a_vp; 4838 int a_fflag; 4839 struct ucred *a_cred; 4840 struct thread *a_td; 4841 } */ *ap; 4842{ 4843 4844 return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); 4845} 4846 4847static int 4848zfs_freebsd_ioctl(ap) 4849 struct vop_ioctl_args /* { 4850 struct vnode *a_vp; 4851 u_long a_command; 4852 caddr_t a_data; 4853 int a_fflag; 4854 struct ucred *cred; 4855 struct thread *td; 4856 } */ *ap; 4857{ 4858 4859 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, 4860 ap->a_fflag, ap->a_cred, NULL, NULL)); 4861} 4862 4863static int 4864ioflags(int ioflags) 4865{ 4866 int flags = 0; 4867 4868 if (ioflags & IO_APPEND) 4869 flags |= FAPPEND; 4870 if (ioflags & IO_NDELAY) 4871 flags |= FNONBLOCK; 4872 if (ioflags & IO_SYNC) 4873 flags |= (FSYNC | FDSYNC | FRSYNC); 4874 4875 return (flags); 4876} 4877 4878static int 4879zfs_freebsd_read(ap) 4880 struct vop_read_args /* { 4881 struct vnode *a_vp; 4882 struct uio *a_uio; 4883 int a_ioflag; 4884 struct ucred *a_cred; 4885 } */ *ap; 4886{ 4887 4888 return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), 4889 ap->a_cred, NULL)); 4890} 4891 4892static int 4893zfs_freebsd_write(ap) 4894 struct vop_write_args /* { 4895 struct vnode *a_vp; 4896 struct uio *a_uio; 4897 int a_ioflag; 4898 struct ucred *a_cred; 4899 } */ *ap; 4900{ 4901 
4902 return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), 4903 ap->a_cred, NULL)); 4904} 4905 4906static int 4907zfs_freebsd_access(ap) 4908 struct vop_access_args /* { 4909 struct vnode *a_vp; 4910 accmode_t a_accmode; 4911 struct ucred *a_cred; 4912 struct thread *a_td; 4913 } */ *ap; 4914{ 4915 vnode_t *vp = ap->a_vp; 4916 znode_t *zp = VTOZ(vp); 4917 accmode_t accmode; 4918 int error = 0; 4919 4920 /* 4921 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, 4922 */ 4923 accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); 4924 if (accmode != 0) 4925 error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); 4926 4927 /* 4928 * VADMIN has to be handled by vaccess(). 4929 */ 4930 if (error == 0) { 4931 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); 4932 if (accmode != 0) { 4933 error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, 4934 zp->z_gid, accmode, ap->a_cred, NULL); 4935 } 4936 } 4937 4938 /* 4939 * For VEXEC, ensure that at least one execute bit is set for 4940 * non-directories. 
4941 */ 4942 if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && 4943 (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { 4944 error = EACCES; 4945 } 4946 4947 return (error); 4948} 4949 4950static int 4951zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) 4952{ 4953 struct componentname *cnp = ap->a_cnp; 4954 char nm[NAME_MAX + 1]; 4955 4956 ASSERT(cnp->cn_namelen < sizeof(nm)); 4957 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); 4958 4959 return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, 4960 cnp->cn_cred, cnp->cn_thread, 0, cached)); 4961} 4962 4963static int 4964zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap) 4965{ 4966 4967 return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE)); 4968} 4969 4970static int 4971zfs_cache_lookup(ap) 4972 struct vop_lookup_args /* { 4973 struct vnode *a_dvp; 4974 struct vnode **a_vpp; 4975 struct componentname *a_cnp; 4976 } */ *ap; 4977{ 4978 zfsvfs_t *zfsvfs; 4979 4980 zfsvfs = ap->a_dvp->v_mount->mnt_data; 4981 if (zfsvfs->z_use_namecache) 4982 return (vfs_cache_lookup(ap)); 4983 else 4984 return (zfs_freebsd_lookup(ap, B_FALSE)); 4985} 4986 4987static int 4988zfs_freebsd_create(ap) 4989 struct vop_create_args /* { 4990 struct vnode *a_dvp; 4991 struct vnode **a_vpp; 4992 struct componentname *a_cnp; 4993 struct vattr *a_vap; 4994 } */ *ap; 4995{ 4996 zfsvfs_t *zfsvfs; 4997 struct componentname *cnp = ap->a_cnp; 4998 vattr_t *vap = ap->a_vap; 4999 int error, mode; 5000 5001 ASSERT(cnp->cn_flags & SAVENAME); 5002 5003 vattr_init_mask(vap); 5004 mode = vap->va_mode & ALLPERMS; 5005 zfsvfs = ap->a_dvp->v_mount->mnt_data; 5006 5007 error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, 5008 ap->a_vpp, cnp->cn_cred, cnp->cn_thread); 5009 if (zfsvfs->z_use_namecache && 5010 error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) 5011 cache_enter(ap->a_dvp, *ap->a_vpp, cnp); 5012 return (error); 5013} 5014 5015static int 
5016zfs_freebsd_remove(ap) 5017 struct vop_remove_args /* { 5018 struct vnode *a_dvp; 5019 struct vnode *a_vp; 5020 struct componentname *a_cnp; 5021 } */ *ap; 5022{ 5023 5024 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 5025 5026 return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, 5027 ap->a_cnp->cn_cred)); 5028} 5029 5030static int 5031zfs_freebsd_mkdir(ap) 5032 struct vop_mkdir_args /* { 5033 struct vnode *a_dvp; 5034 struct vnode **a_vpp; 5035 struct componentname *a_cnp; 5036 struct vattr *a_vap; 5037 } */ *ap; 5038{ 5039 vattr_t *vap = ap->a_vap; 5040 5041 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 5042 5043 vattr_init_mask(vap); 5044 5045 return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, 5046 ap->a_cnp->cn_cred)); 5047} 5048 5049static int 5050zfs_freebsd_rmdir(ap) 5051 struct vop_rmdir_args /* { 5052 struct vnode *a_dvp; 5053 struct vnode *a_vp; 5054 struct componentname *a_cnp; 5055 } */ *ap; 5056{ 5057 struct componentname *cnp = ap->a_cnp; 5058 5059 ASSERT(cnp->cn_flags & SAVENAME); 5060 5061 return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); 5062} 5063 5064static int 5065zfs_freebsd_readdir(ap) 5066 struct vop_readdir_args /* { 5067 struct vnode *a_vp; 5068 struct uio *a_uio; 5069 struct ucred *a_cred; 5070 int *a_eofflag; 5071 int *a_ncookies; 5072 u_long **a_cookies; 5073 } */ *ap; 5074{ 5075 5076 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, 5077 ap->a_ncookies, ap->a_cookies)); 5078} 5079 5080static int 5081zfs_freebsd_fsync(ap) 5082 struct vop_fsync_args /* { 5083 struct vnode *a_vp; 5084 int a_waitfor; 5085 struct thread *a_td; 5086 } */ *ap; 5087{ 5088 5089 vop_stdfsync(ap); 5090 return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); 5091} 5092 5093static int 5094zfs_freebsd_getattr(ap) 5095 struct vop_getattr_args /* { 5096 struct vnode *a_vp; 5097 struct vattr *a_vap; 5098 struct ucred *a_cred; 5099 } */ *ap; 5100{ 5101 vattr_t *vap = ap->a_vap; 5102 xvattr_t xvap; 5103 u_long 
fflags = 0; 5104 int error; 5105 5106 xva_init(&xvap); 5107 xvap.xva_vattr = *vap; 5108 xvap.xva_vattr.va_mask |= AT_XVATTR; 5109 5110 /* Convert chflags into ZFS-type flags. */ 5111 /* XXX: what about SF_SETTABLE?. */ 5112 XVA_SET_REQ(&xvap, XAT_IMMUTABLE); 5113 XVA_SET_REQ(&xvap, XAT_APPENDONLY); 5114 XVA_SET_REQ(&xvap, XAT_NOUNLINK); 5115 XVA_SET_REQ(&xvap, XAT_NODUMP); 5116 XVA_SET_REQ(&xvap, XAT_READONLY); 5117 XVA_SET_REQ(&xvap, XAT_ARCHIVE); 5118 XVA_SET_REQ(&xvap, XAT_SYSTEM); 5119 XVA_SET_REQ(&xvap, XAT_HIDDEN); 5120 XVA_SET_REQ(&xvap, XAT_REPARSE); 5121 XVA_SET_REQ(&xvap, XAT_OFFLINE); 5122 XVA_SET_REQ(&xvap, XAT_SPARSE); 5123 5124 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); 5125 if (error != 0) 5126 return (error); 5127 5128 /* Convert ZFS xattr into chflags. */ 5129#define FLAG_CHECK(fflag, xflag, xfield) do { \ 5130 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ 5131 fflags |= (fflag); \ 5132} while (0) 5133 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, 5134 xvap.xva_xoptattrs.xoa_immutable); 5135 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, 5136 xvap.xva_xoptattrs.xoa_appendonly); 5137 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, 5138 xvap.xva_xoptattrs.xoa_nounlink); 5139 FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, 5140 xvap.xva_xoptattrs.xoa_archive); 5141 FLAG_CHECK(UF_NODUMP, XAT_NODUMP, 5142 xvap.xva_xoptattrs.xoa_nodump); 5143 FLAG_CHECK(UF_READONLY, XAT_READONLY, 5144 xvap.xva_xoptattrs.xoa_readonly); 5145 FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, 5146 xvap.xva_xoptattrs.xoa_system); 5147 FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, 5148 xvap.xva_xoptattrs.xoa_hidden); 5149 FLAG_CHECK(UF_REPARSE, XAT_REPARSE, 5150 xvap.xva_xoptattrs.xoa_reparse); 5151 FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, 5152 xvap.xva_xoptattrs.xoa_offline); 5153 FLAG_CHECK(UF_SPARSE, XAT_SPARSE, 5154 xvap.xva_xoptattrs.xoa_sparse); 5155 5156#undef FLAG_CHECK 5157 *vap = xvap.xva_vattr; 5158 vap->va_flags = fflags; 5159 return (0); 5160} 5161 5162static int 5163zfs_freebsd_setattr(ap) 5164 
struct vop_setattr_args /* { 5165 struct vnode *a_vp; 5166 struct vattr *a_vap; 5167 struct ucred *a_cred; 5168 } */ *ap; 5169{ 5170 vnode_t *vp = ap->a_vp; 5171 vattr_t *vap = ap->a_vap; 5172 cred_t *cred = ap->a_cred; 5173 xvattr_t xvap; 5174 u_long fflags; 5175 uint64_t zflags; 5176 5177 vattr_init_mask(vap); 5178 vap->va_mask &= ~AT_NOSET; 5179 5180 xva_init(&xvap); 5181 xvap.xva_vattr = *vap; 5182 5183 zflags = VTOZ(vp)->z_pflags; 5184 5185 if (vap->va_flags != VNOVAL) { 5186 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; 5187 int error; 5188 5189 if (zfsvfs->z_use_fuids == B_FALSE) 5190 return (EOPNOTSUPP); 5191 5192 fflags = vap->va_flags; 5193 /* 5194 * XXX KDM 5195 * We need to figure out whether it makes sense to allow 5196 * UF_REPARSE through, since we don't really have other 5197 * facilities to handle reparse points and zfs_setattr() 5198 * doesn't currently allow setting that attribute anyway. 5199 */ 5200 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| 5201 UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| 5202 UF_OFFLINE|UF_SPARSE)) != 0) 5203 return (EOPNOTSUPP); 5204 /* 5205 * Unprivileged processes are not permitted to unset system 5206 * flags, or modify flags if any system flags are set. 5207 * Privileged non-jail processes may not modify system flags 5208 * if securelevel > 0 and any existing system flags are set. 5209 * Privileged jail processes behave like privileged non-jail 5210 * processes if the security.jail.chflags_allowed sysctl is 5211 * is non-zero; otherwise, they behave like unprivileged 5212 * processes. 5213 */ 5214 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || 5215 priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { 5216 if (zflags & 5217 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 5218 error = securelevel_gt(cred, 0); 5219 if (error != 0) 5220 return (error); 5221 } 5222 } else { 5223 /* 5224 * Callers may only modify the file flags on objects they 5225 * have VADMIN rights for. 
5226 */ 5227 if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) 5228 return (error); 5229 if (zflags & 5230 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 5231 return (EPERM); 5232 } 5233 if (fflags & 5234 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { 5235 return (EPERM); 5236 } 5237 } 5238 5239#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ 5240 if (((fflags & (fflag)) && !(zflags & (zflag))) || \ 5241 ((zflags & (zflag)) && !(fflags & (fflag)))) { \ 5242 XVA_SET_REQ(&xvap, (xflag)); \ 5243 (xfield) = ((fflags & (fflag)) != 0); \ 5244 } \ 5245} while (0) 5246 /* Convert chflags into ZFS-type flags. */ 5247 /* XXX: what about SF_SETTABLE?. */ 5248 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, 5249 xvap.xva_xoptattrs.xoa_immutable); 5250 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, 5251 xvap.xva_xoptattrs.xoa_appendonly); 5252 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, 5253 xvap.xva_xoptattrs.xoa_nounlink); 5254 FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, 5255 xvap.xva_xoptattrs.xoa_archive); 5256 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, 5257 xvap.xva_xoptattrs.xoa_nodump); 5258 FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, 5259 xvap.xva_xoptattrs.xoa_readonly); 5260 FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, 5261 xvap.xva_xoptattrs.xoa_system); 5262 FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, 5263 xvap.xva_xoptattrs.xoa_hidden); 5264 FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, 5265 xvap.xva_xoptattrs.xoa_reparse); 5266 FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, 5267 xvap.xva_xoptattrs.xoa_offline); 5268 FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, 5269 xvap.xva_xoptattrs.xoa_sparse); 5270#undef FLAG_CHANGE 5271 } 5272 if (vap->va_birthtime.tv_sec != VNOVAL) { 5273 xvap.xva_vattr.va_mask |= AT_XVATTR; 5274 XVA_SET_REQ(&xvap, XAT_CREATETIME); 5275 } 5276 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); 5277} 5278 5279static int 5280zfs_freebsd_rename(ap) 5281 struct 
vop_rename_args /* { 5282 struct vnode *a_fdvp; 5283 struct vnode *a_fvp; 5284 struct componentname *a_fcnp; 5285 struct vnode *a_tdvp; 5286 struct vnode *a_tvp; 5287 struct componentname *a_tcnp; 5288 } */ *ap; 5289{ 5290 vnode_t *fdvp = ap->a_fdvp; 5291 vnode_t *fvp = ap->a_fvp; 5292 vnode_t *tdvp = ap->a_tdvp; 5293 vnode_t *tvp = ap->a_tvp; 5294 int error; 5295 5296 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); 5297 ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); 5298 5299 error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, 5300 ap->a_tcnp, ap->a_fcnp->cn_cred); 5301 5302 vrele(fdvp); 5303 vrele(fvp); 5304 vrele(tdvp); 5305 if (tvp != NULL) 5306 vrele(tvp); 5307 5308 return (error); 5309} 5310 5311static int 5312zfs_freebsd_symlink(ap) 5313 struct vop_symlink_args /* { 5314 struct vnode *a_dvp; 5315 struct vnode **a_vpp; 5316 struct componentname *a_cnp; 5317 struct vattr *a_vap; 5318 char *a_target; 5319 } */ *ap; 5320{ 5321 struct componentname *cnp = ap->a_cnp; 5322 vattr_t *vap = ap->a_vap; 5323 5324 ASSERT(cnp->cn_flags & SAVENAME); 5325 5326 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. 
*/ 5327 vattr_init_mask(vap); 5328 5329 return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, 5330 ap->a_target, cnp->cn_cred, cnp->cn_thread)); 5331} 5332 5333static int 5334zfs_freebsd_readlink(ap) 5335 struct vop_readlink_args /* { 5336 struct vnode *a_vp; 5337 struct uio *a_uio; 5338 struct ucred *a_cred; 5339 } */ *ap; 5340{ 5341 5342 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); 5343} 5344 5345static int 5346zfs_freebsd_link(ap) 5347 struct vop_link_args /* { 5348 struct vnode *a_tdvp; 5349 struct vnode *a_vp; 5350 struct componentname *a_cnp; 5351 } */ *ap; 5352{ 5353 struct componentname *cnp = ap->a_cnp; 5354 vnode_t *vp = ap->a_vp; 5355 vnode_t *tdvp = ap->a_tdvp; 5356 5357 if (tdvp->v_mount != vp->v_mount) 5358 return (EXDEV); 5359 5360 ASSERT(cnp->cn_flags & SAVENAME); 5361 5362 return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); 5363} 5364 5365static int 5366zfs_freebsd_inactive(ap) 5367 struct vop_inactive_args /* { 5368 struct vnode *a_vp; 5369 struct thread *a_td; 5370 } */ *ap; 5371{ 5372 vnode_t *vp = ap->a_vp; 5373 5374 zfs_inactive(vp, ap->a_td->td_ucred, NULL); 5375 return (0); 5376} 5377 5378static int 5379zfs_freebsd_reclaim(ap) 5380 struct vop_reclaim_args /* { 5381 struct vnode *a_vp; 5382 struct thread *a_td; 5383 } */ *ap; 5384{ 5385 vnode_t *vp = ap->a_vp; 5386 znode_t *zp = VTOZ(vp); 5387 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5388 5389 ASSERT(zp != NULL); 5390 5391 /* Destroy the vm object and flush associated pages. */ 5392 vnode_destroy_vobject(vp); 5393 5394 /* 5395 * z_teardown_inactive_lock protects from a race with 5396 * zfs_znode_dmu_fini in zfsvfs_teardown during 5397 * force unmount. 
5398 */ 5399 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 5400 if (zp->z_sa_hdl == NULL) 5401 zfs_znode_free(zp); 5402 else 5403 zfs_zinactive(zp); 5404 rw_exit(&zfsvfs->z_teardown_inactive_lock); 5405 5406 vp->v_data = NULL; 5407 return (0); 5408} 5409 5410static int 5411zfs_freebsd_fid(ap) 5412 struct vop_fid_args /* { 5413 struct vnode *a_vp; 5414 struct fid *a_fid; 5415 } */ *ap; 5416{ 5417 5418 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); 5419} 5420 5421static int 5422zfs_freebsd_pathconf(ap) 5423 struct vop_pathconf_args /* { 5424 struct vnode *a_vp; 5425 int a_name; 5426 register_t *a_retval; 5427 } */ *ap; 5428{ 5429 ulong_t val; 5430 int error; 5431 5432 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); 5433 if (error == 0) { 5434 *ap->a_retval = val; 5435 return (error); 5436 } 5437 if (error != EOPNOTSUPP) 5438 return (error); 5439 5440 switch (ap->a_name) { 5441 case _PC_NAME_MAX: 5442 *ap->a_retval = NAME_MAX; 5443 return (0); 5444 case _PC_PIPE_BUF: 5445 if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { 5446 *ap->a_retval = PIPE_BUF; 5447 return (0); 5448 } 5449 return (EINVAL); 5450 default: 5451 return (vop_stdpathconf(ap)); 5452 } 5453} 5454 5455/* 5456 * FreeBSD's extended attributes namespace defines file name prefix for ZFS' 5457 * extended attribute name: 5458 * 5459 * NAMESPACE PREFIX 5460 * system freebsd:system: 5461 * user (none, can be used to access ZFS fsattr(5) attributes 5462 * created on Solaris) 5463 */ 5464static int 5465zfs_create_attrname(int attrnamespace, const char *name, char *attrname, 5466 size_t size) 5467{ 5468 const char *namespace, *prefix, *suffix; 5469 5470 /* We don't allow '/' character in attribute name. */ 5471 if (strchr(name, '/') != NULL) 5472 return (EINVAL); 5473 /* We don't allow attribute names that start with "freebsd:" string. 
*/ 5474 if (strncmp(name, "freebsd:", 8) == 0) 5475 return (EINVAL); 5476 5477 bzero(attrname, size); 5478 5479 switch (attrnamespace) { 5480 case EXTATTR_NAMESPACE_USER: 5481#if 0 5482 prefix = "freebsd:"; 5483 namespace = EXTATTR_NAMESPACE_USER_STRING; 5484 suffix = ":"; 5485#else 5486 /* 5487 * This is the default namespace by which we can access all 5488 * attributes created on Solaris. 5489 */ 5490 prefix = namespace = suffix = ""; 5491#endif 5492 break; 5493 case EXTATTR_NAMESPACE_SYSTEM: 5494 prefix = "freebsd:"; 5495 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; 5496 suffix = ":"; 5497 break; 5498 case EXTATTR_NAMESPACE_EMPTY: 5499 default: 5500 return (EINVAL); 5501 } 5502 if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, 5503 name) >= size) { 5504 return (ENAMETOOLONG); 5505 } 5506 return (0); 5507} 5508 5509/* 5510 * Vnode operating to retrieve a named extended attribute. 5511 */ 5512static int 5513zfs_getextattr(struct vop_getextattr_args *ap) 5514/* 5515vop_getextattr { 5516 IN struct vnode *a_vp; 5517 IN int a_attrnamespace; 5518 IN const char *a_name; 5519 INOUT struct uio *a_uio; 5520 OUT size_t *a_size; 5521 IN struct ucred *a_cred; 5522 IN struct thread *a_td; 5523}; 5524*/ 5525{ 5526 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5527 struct thread *td = ap->a_td; 5528 struct nameidata nd; 5529 char attrname[255]; 5530 struct vattr va; 5531 vnode_t *xvp = NULL, *vp; 5532 int error, flags; 5533 5534 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5535 ap->a_cred, ap->a_td, VREAD); 5536 if (error != 0) 5537 return (error); 5538 5539 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5540 sizeof(attrname)); 5541 if (error != 0) 5542 return (error); 5543 5544 ZFS_ENTER(zfsvfs); 5545 5546 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5547 LOOKUP_XATTR, B_FALSE); 5548 if (error != 0) { 5549 ZFS_EXIT(zfsvfs); 5550 return (error); 5551 } 5552 5553 flags = FREAD; 5554 NDINIT_ATVP(&nd, 
LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, 5555 xvp, td); 5556 error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL); 5557 vp = nd.ni_vp; 5558 NDFREE(&nd, NDF_ONLY_PNBUF); 5559 if (error != 0) { 5560 ZFS_EXIT(zfsvfs); 5561 if (error == ENOENT) 5562 error = ENOATTR; 5563 return (error); 5564 } 5565 5566 if (ap->a_size != NULL) { 5567 error = VOP_GETATTR(vp, &va, ap->a_cred); 5568 if (error == 0) 5569 *ap->a_size = (size_t)va.va_size; 5570 } else if (ap->a_uio != NULL) 5571 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); 5572 5573 VOP_UNLOCK(vp, 0); 5574 vn_close(vp, flags, ap->a_cred, td); 5575 ZFS_EXIT(zfsvfs); 5576 5577 return (error); 5578} 5579 5580/* 5581 * Vnode operation to remove a named attribute. 5582 */ 5583int 5584zfs_deleteextattr(struct vop_deleteextattr_args *ap) 5585/* 5586vop_deleteextattr { 5587 IN struct vnode *a_vp; 5588 IN int a_attrnamespace; 5589 IN const char *a_name; 5590 IN struct ucred *a_cred; 5591 IN struct thread *a_td; 5592}; 5593*/ 5594{ 5595 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5596 struct thread *td = ap->a_td; 5597 struct nameidata nd; 5598 char attrname[255]; 5599 struct vattr va; 5600 vnode_t *xvp = NULL, *vp; 5601 int error, flags; 5602 5603 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5604 ap->a_cred, ap->a_td, VWRITE); 5605 if (error != 0) 5606 return (error); 5607 5608 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5609 sizeof(attrname)); 5610 if (error != 0) 5611 return (error); 5612 5613 ZFS_ENTER(zfsvfs); 5614 5615 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5616 LOOKUP_XATTR, B_FALSE); 5617 if (error != 0) { 5618 ZFS_EXIT(zfsvfs); 5619 return (error); 5620 } 5621 5622 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, 5623 UIO_SYSSPACE, attrname, xvp, td); 5624 error = namei(&nd); 5625 vp = nd.ni_vp; 5626 if (error != 0) { 5627 ZFS_EXIT(zfsvfs); 5628 NDFREE(&nd, NDF_ONLY_PNBUF); 5629 if (error == ENOENT) 5630 error = ENOATTR; 
5631 return (error); 5632 } 5633 5634 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); 5635 NDFREE(&nd, NDF_ONLY_PNBUF); 5636 5637 vput(nd.ni_dvp); 5638 if (vp == nd.ni_dvp) 5639 vrele(vp); 5640 else 5641 vput(vp); 5642 ZFS_EXIT(zfsvfs); 5643 5644 return (error); 5645} 5646 5647/* 5648 * Vnode operation to set a named attribute. 5649 */ 5650static int 5651zfs_setextattr(struct vop_setextattr_args *ap) 5652/* 5653vop_setextattr { 5654 IN struct vnode *a_vp; 5655 IN int a_attrnamespace; 5656 IN const char *a_name; 5657 INOUT struct uio *a_uio; 5658 IN struct ucred *a_cred; 5659 IN struct thread *a_td; 5660}; 5661*/ 5662{ 5663 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5664 struct thread *td = ap->a_td; 5665 struct nameidata nd; 5666 char attrname[255]; 5667 struct vattr va; 5668 vnode_t *xvp = NULL, *vp; 5669 int error, flags; 5670 5671 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5672 ap->a_cred, ap->a_td, VWRITE); 5673 if (error != 0) 5674 return (error); 5675 5676 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5677 sizeof(attrname)); 5678 if (error != 0) 5679 return (error); 5680 5681 ZFS_ENTER(zfsvfs); 5682 5683 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5684 LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE); 5685 if (error != 0) { 5686 ZFS_EXIT(zfsvfs); 5687 return (error); 5688 } 5689 5690 flags = FFLAGS(O_WRONLY | O_CREAT); 5691 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, 5692 xvp, td); 5693 error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, 5694 NULL); 5695 vp = nd.ni_vp; 5696 NDFREE(&nd, NDF_ONLY_PNBUF); 5697 if (error != 0) { 5698 ZFS_EXIT(zfsvfs); 5699 return (error); 5700 } 5701 5702 VATTR_NULL(&va); 5703 va.va_size = 0; 5704 error = VOP_SETATTR(vp, &va, ap->a_cred); 5705 if (error == 0) 5706 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); 5707 5708 VOP_UNLOCK(vp, 0); 5709 vn_close(vp, flags, ap->a_cred, td); 5710 ZFS_EXIT(zfsvfs); 5711 5712 return (error); 5713} 5714 5715/* 
5716 * Vnode operation to retrieve extended attributes on a vnode. 5717 */ 5718static int 5719zfs_listextattr(struct vop_listextattr_args *ap) 5720/* 5721vop_listextattr { 5722 IN struct vnode *a_vp; 5723 IN int a_attrnamespace; 5724 INOUT struct uio *a_uio; 5725 OUT size_t *a_size; 5726 IN struct ucred *a_cred; 5727 IN struct thread *a_td; 5728}; 5729*/ 5730{ 5731 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5732 struct thread *td = ap->a_td; 5733 struct nameidata nd; 5734 char attrprefix[16]; 5735 u_char dirbuf[sizeof(struct dirent)]; 5736 struct dirent *dp; 5737 struct iovec aiov; 5738 struct uio auio, *uio = ap->a_uio; 5739 size_t *sizep = ap->a_size; 5740 size_t plen; 5741 vnode_t *xvp = NULL, *vp; 5742 int done, error, eof, pos; 5743 5744 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5745 ap->a_cred, ap->a_td, VREAD); 5746 if (error != 0) 5747 return (error); 5748 5749 error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, 5750 sizeof(attrprefix)); 5751 if (error != 0) 5752 return (error); 5753 plen = strlen(attrprefix); 5754 5755 ZFS_ENTER(zfsvfs); 5756 5757 if (sizep != NULL) 5758 *sizep = 0; 5759 5760 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5761 LOOKUP_XATTR, B_FALSE); 5762 if (error != 0) { 5763 ZFS_EXIT(zfsvfs); 5764 /* 5765 * ENOATTR means that the EA directory does not yet exist, 5766 * i.e. there are no extended attributes there. 
5767 */ 5768 if (error == ENOATTR) 5769 error = 0; 5770 return (error); 5771 } 5772 5773 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, 5774 UIO_SYSSPACE, ".", xvp, td); 5775 error = namei(&nd); 5776 vp = nd.ni_vp; 5777 NDFREE(&nd, NDF_ONLY_PNBUF); 5778 if (error != 0) { 5779 ZFS_EXIT(zfsvfs); 5780 return (error); 5781 } 5782 5783 auio.uio_iov = &aiov; 5784 auio.uio_iovcnt = 1; 5785 auio.uio_segflg = UIO_SYSSPACE; 5786 auio.uio_td = td; 5787 auio.uio_rw = UIO_READ; 5788 auio.uio_offset = 0; 5789 5790 do { 5791 u_char nlen; 5792 5793 aiov.iov_base = (void *)dirbuf; 5794 aiov.iov_len = sizeof(dirbuf); 5795 auio.uio_resid = sizeof(dirbuf); 5796 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); 5797 done = sizeof(dirbuf) - auio.uio_resid; 5798 if (error != 0) 5799 break; 5800 for (pos = 0; pos < done;) { 5801 dp = (struct dirent *)(dirbuf + pos); 5802 pos += dp->d_reclen; 5803 /* 5804 * XXX: Temporarily we also accept DT_UNKNOWN, as this 5805 * is what we get when attribute was created on Solaris. 5806 */ 5807 if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) 5808 continue; 5809 if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) 5810 continue; 5811 else if (strncmp(dp->d_name, attrprefix, plen) != 0) 5812 continue; 5813 nlen = dp->d_namlen - plen; 5814 if (sizep != NULL) 5815 *sizep += 1 + nlen; 5816 else if (uio != NULL) { 5817 /* 5818 * Format of extattr name entry is one byte for 5819 * length and the rest for name. 
5820 */ 5821 error = uiomove(&nlen, 1, uio->uio_rw, uio); 5822 if (error == 0) { 5823 error = uiomove(dp->d_name + plen, nlen, 5824 uio->uio_rw, uio); 5825 } 5826 if (error != 0) 5827 break; 5828 } 5829 } 5830 } while (!eof && error == 0); 5831 5832 vput(vp); 5833 ZFS_EXIT(zfsvfs); 5834 5835 return (error); 5836} 5837 5838int 5839zfs_freebsd_getacl(ap) 5840 struct vop_getacl_args /* { 5841 struct vnode *vp; 5842 acl_type_t type; 5843 struct acl *aclp; 5844 struct ucred *cred; 5845 struct thread *td; 5846 } */ *ap; 5847{ 5848 int error; 5849 vsecattr_t vsecattr; 5850 5851 if (ap->a_type != ACL_TYPE_NFS4) 5852 return (EINVAL); 5853 5854 vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; 5855 if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) 5856 return (error); 5857 5858 error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); 5859 if (vsecattr.vsa_aclentp != NULL) 5860 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); 5861 5862 return (error); 5863} 5864 5865int 5866zfs_freebsd_setacl(ap) 5867 struct vop_setacl_args /* { 5868 struct vnode *vp; 5869 acl_type_t type; 5870 struct acl *aclp; 5871 struct ucred *cred; 5872 struct thread *td; 5873 } */ *ap; 5874{ 5875 int error; 5876 vsecattr_t vsecattr; 5877 int aclbsize; /* size of acl list in bytes */ 5878 aclent_t *aaclp; 5879 5880 if (ap->a_type != ACL_TYPE_NFS4) 5881 return (EINVAL); 5882 5883 if (ap->a_aclp == NULL) 5884 return (EINVAL); 5885 5886 if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) 5887 return (EINVAL); 5888 5889 /* 5890 * With NFSv4 ACLs, chmod(2) may need to add additional entries, 5891 * splitting every entry into two and appending "canonical six" 5892 * entries at the end. Don't allow for setting an ACL that would 5893 * cause chmod(2) to run out of ACL entries. 
5894 */ 5895 if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) 5896 return (ENOSPC); 5897 5898 error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); 5899 if (error != 0) 5900 return (error); 5901 5902 vsecattr.vsa_mask = VSA_ACE; 5903 aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); 5904 vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); 5905 aaclp = vsecattr.vsa_aclentp; 5906 vsecattr.vsa_aclentsz = aclbsize; 5907 5908 aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); 5909 error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); 5910 kmem_free(aaclp, aclbsize); 5911 5912 return (error); 5913} 5914 5915int 5916zfs_freebsd_aclcheck(ap) 5917 struct vop_aclcheck_args /* { 5918 struct vnode *vp; 5919 acl_type_t type; 5920 struct acl *aclp; 5921 struct ucred *cred; 5922 struct thread *td; 5923 } */ *ap; 5924{ 5925 5926 return (EOPNOTSUPP); 5927} 5928 5929static int 5930zfs_vptocnp(struct vop_vptocnp_args *ap) 5931{ 5932 vnode_t *covered_vp; 5933 vnode_t *vp = ap->a_vp;; 5934 zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; 5935 znode_t *zp = VTOZ(vp); 5936 int ltype; 5937 int error; 5938 5939 ZFS_ENTER(zfsvfs); 5940 ZFS_VERIFY_ZP(zp); 5941 5942 /* 5943 * If we are a snapshot mounted under .zfs, run the operation 5944 * on the covered vnode. 
5945 */ 5946 if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { 5947 char name[MAXNAMLEN + 1]; 5948 znode_t *dzp; 5949 size_t len; 5950 5951 error = zfs_znode_parent_and_name(zp, &dzp, name); 5952 if (error == 0) { 5953 len = strlen(name); 5954 if (*ap->a_buflen < len) 5955 error = SET_ERROR(ENOMEM); 5956 } 5957 if (error == 0) { 5958 *ap->a_buflen -= len; 5959 bcopy(name, ap->a_buf + *ap->a_buflen, len); 5960 *ap->a_vpp = ZTOV(dzp); 5961 } 5962 ZFS_EXIT(zfsvfs); 5963 return (error); 5964 } 5965 ZFS_EXIT(zfsvfs); 5966 5967 covered_vp = vp->v_mount->mnt_vnodecovered; 5968 vhold(covered_vp); 5969 ltype = VOP_ISLOCKED(vp); 5970 VOP_UNLOCK(vp, 0); 5971 error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); 5972 if (error == 0) { 5973 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, 5974 ap->a_buf, ap->a_buflen); 5975 vput(covered_vp); 5976 } 5977 vn_lock(vp, ltype | LK_RETRY); 5978 if ((vp->v_iflag & VI_DOOMED) != 0) 5979 error = SET_ERROR(ENOENT); 5980 return (error); 5981} 5982 5983#ifdef DIAGNOSTIC 5984static int 5985zfs_lock(ap) 5986 struct vop_lock1_args /* { 5987 struct vnode *a_vp; 5988 int a_flags; 5989 char *file; 5990 int line; 5991 } */ *ap; 5992{ 5993 vnode_t *vp; 5994 znode_t *zp; 5995 int err; 5996 5997 err = vop_stdlock(ap); 5998 if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { 5999 vp = ap->a_vp; 6000 zp = vp->v_data; 6001 if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && 6002 zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) 6003 VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); 6004 } 6005 return (err); 6006} 6007#endif 6008 6009struct vop_vector zfs_vnodeops; 6010struct vop_vector zfs_fifoops; 6011struct vop_vector zfs_shareops; 6012 6013struct vop_vector zfs_vnodeops = { 6014 .vop_default = &default_vnodeops, 6015 .vop_inactive = zfs_freebsd_inactive, 6016 .vop_reclaim = zfs_freebsd_reclaim, 6017 .vop_access = zfs_freebsd_access, 6018 .vop_lookup = zfs_cache_lookup, 6019 .vop_cachedlookup = 
zfs_freebsd_cachedlookup, 6020 .vop_getattr = zfs_freebsd_getattr, 6021 .vop_setattr = zfs_freebsd_setattr, 6022 .vop_create = zfs_freebsd_create, 6023 .vop_mknod = zfs_freebsd_create, 6024 .vop_mkdir = zfs_freebsd_mkdir, 6025 .vop_readdir = zfs_freebsd_readdir, 6026 .vop_fsync = zfs_freebsd_fsync, 6027 .vop_open = zfs_freebsd_open, 6028 .vop_close = zfs_freebsd_close, 6029 .vop_rmdir = zfs_freebsd_rmdir, 6030 .vop_ioctl = zfs_freebsd_ioctl, 6031 .vop_link = zfs_freebsd_link, 6032 .vop_symlink = zfs_freebsd_symlink, 6033 .vop_readlink = zfs_freebsd_readlink, 6034 .vop_read = zfs_freebsd_read, 6035 .vop_write = zfs_freebsd_write, 6036 .vop_remove = zfs_freebsd_remove, 6037 .vop_rename = zfs_freebsd_rename, 6038 .vop_pathconf = zfs_freebsd_pathconf, 6039 .vop_bmap = zfs_freebsd_bmap, 6040 .vop_fid = zfs_freebsd_fid, 6041 .vop_getextattr = zfs_getextattr, 6042 .vop_deleteextattr = zfs_deleteextattr, 6043 .vop_setextattr = zfs_setextattr, 6044 .vop_listextattr = zfs_listextattr, 6045 .vop_getacl = zfs_freebsd_getacl, 6046 .vop_setacl = zfs_freebsd_setacl, 6047 .vop_aclcheck = zfs_freebsd_aclcheck, 6048 .vop_getpages = zfs_freebsd_getpages, 6049 .vop_putpages = zfs_freebsd_putpages, 6050 .vop_vptocnp = zfs_vptocnp, 6051#ifdef DIAGNOSTIC 6052 .vop_lock1 = zfs_lock, 6053#endif 6054}; 6055 6056struct vop_vector zfs_fifoops = { 6057 .vop_default = &fifo_specops, 6058 .vop_fsync = zfs_freebsd_fsync, 6059 .vop_access = zfs_freebsd_access, 6060 .vop_getattr = zfs_freebsd_getattr, 6061 .vop_inactive = zfs_freebsd_inactive, 6062 .vop_read = VOP_PANIC, 6063 .vop_reclaim = zfs_freebsd_reclaim, 6064 .vop_setattr = zfs_freebsd_setattr, 6065 .vop_write = VOP_PANIC, 6066 .vop_pathconf = zfs_freebsd_pathconf, 6067 .vop_fid = zfs_freebsd_fid, 6068 .vop_getacl = zfs_freebsd_getacl, 6069 .vop_setacl = zfs_freebsd_setacl, 6070 .vop_aclcheck = zfs_freebsd_aclcheck, 6071}; 6072 6073/* 6074 * special share hidden files vnode operations template 6075 */ 6076struct vop_vector zfs_shareops = { 
6077 .vop_default = &default_vnodeops, 6078 .vop_access = zfs_freebsd_access, 6079 .vop_inactive = zfs_freebsd_inactive, 6080 .vop_reclaim = zfs_freebsd_reclaim, 6081 .vop_fid = zfs_freebsd_fid, 6082 .vop_pathconf = zfs_freebsd_pathconf, 6083}; 6084