zfs_vnops.c revision 330986
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 * Copyright 2017 Nexenta Systems, Inc. 
27 */ 28 29/* Portions Copyright 2007 Jeremy Teo */ 30/* Portions Copyright 2010 Robert Milkowski */ 31 32#include <sys/types.h> 33#include <sys/param.h> 34#include <sys/time.h> 35#include <sys/systm.h> 36#include <sys/sysmacros.h> 37#include <sys/resource.h> 38#include <sys/vfs.h> 39#include <sys/vm.h> 40#include <sys/vnode.h> 41#include <sys/file.h> 42#include <sys/stat.h> 43#include <sys/kmem.h> 44#include <sys/taskq.h> 45#include <sys/uio.h> 46#include <sys/atomic.h> 47#include <sys/namei.h> 48#include <sys/mman.h> 49#include <sys/cmn_err.h> 50#include <sys/errno.h> 51#include <sys/unistd.h> 52#include <sys/zfs_dir.h> 53#include <sys/zfs_ioctl.h> 54#include <sys/fs/zfs.h> 55#include <sys/dmu.h> 56#include <sys/dmu_objset.h> 57#include <sys/spa.h> 58#include <sys/txg.h> 59#include <sys/dbuf.h> 60#include <sys/zap.h> 61#include <sys/sa.h> 62#include <sys/dirent.h> 63#include <sys/policy.h> 64#include <sys/sunddi.h> 65#include <sys/filio.h> 66#include <sys/sid.h> 67#include <sys/zfs_ctldir.h> 68#include <sys/zfs_fuid.h> 69#include <sys/zfs_sa.h> 70#include <sys/zfs_rlock.h> 71#include <sys/extdirent.h> 72#include <sys/kidmap.h> 73#include <sys/bio.h> 74#include <sys/buf.h> 75#include <sys/sched.h> 76#include <sys/acl.h> 77#include <vm/vm_param.h> 78#include <sys/zil.h> 79 80/* 81 * Programming rules. 82 * 83 * Each vnode op performs some logical unit of work. To do this, the ZPL must 84 * properly lock its in-core state, create a DMU transaction, do the work, 85 * record this work in the intent log (ZIL), commit the DMU transaction, 86 * and wait for the intent log to commit if it is a synchronous operation. 87 * Moreover, the vnode ops must work in both normal and log replay context. 88 * The ordering of events is important to avoid deadlocks and references 89 * to freed memory. The example below illustrates the following Big Rules: 90 * 91 * (1) A check must be made in each zfs thread for a mounted file system. 
92 * This is done avoiding races using ZFS_ENTER(zfsvfs). 93 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 94 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 95 * can return EIO from the calling function. 96 * 97 * (2) VN_RELE() should always be the last thing except for zil_commit() 98 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 99 * First, if it's the last reference, the vnode/znode 100 * can be freed, so the zp may point to freed memory. Second, the last 101 * reference will call zfs_zinactive(), which may induce a lot of work -- 102 * pushing cached pages (which acquires range locks) and syncing out 103 * cached atime changes. Third, zfs_zinactive() may require a new tx, 104 * which could deadlock the system if you were already holding one. 105 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 106 * 107 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 108 * as they can span dmu_tx_assign() calls. 109 * 110 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 111 * dmu_tx_assign(). This is critical because we don't want to block 112 * while holding locks. 113 * 114 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This 115 * reduces lock contention and CPU usage when we must wait (note that if 116 * throughput is constrained by the storage, nearly every transaction 117 * must wait). 118 * 119 * Note, in particular, that if a lock is sometimes acquired before 120 * the tx assigns, and sometimes after (e.g. z_lock), then failing 121 * to use a non-blocking assign can deadlock the system. The scenario: 122 * 123 * Thread A has grabbed a lock before calling dmu_tx_assign(). 124 * Thread B is in an already-assigned tx, and blocks for this lock. 125 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 126 * forever, because the previous txg can't quiesce until B's tx commits. 
127 * 128 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 129 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 130 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, 131 * to indicate that this operation has already called dmu_tx_wait(). 132 * This will ensure that we don't retry forever, waiting a short bit 133 * each time. 134 * 135 * (5) If the operation succeeded, generate the intent log entry for it 136 * before dropping locks. This ensures that the ordering of events 137 * in the intent log matches the order in which they actually occurred. 138 * During ZIL replay the zfs_log_* functions will update the sequence 139 * number to indicate the zil transaction has replayed. 140 * 141 * (6) At the end of each vnode op, the DMU tx must always commit, 142 * regardless of whether there were any errors. 143 * 144 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 145 * to ensure that synchronous semantics are provided when necessary. 146 * 147 * In general, this is how things should be ordered in each vnode op: 148 * 149 * ZFS_ENTER(zfsvfs); // exit if unmounted 150 * top: 151 * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) 152 * rw_enter(...); // grab any other locks you need 153 * tx = dmu_tx_create(...); // get DMU tx 154 * dmu_tx_hold_*(); // hold each object you might modify 155 * error = dmu_tx_assign(tx, (waited ? 
 *	    TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/*
 * Open a ZFS file.  Rejects write opens of append-only files that were
 * not opened with FAPPEND, and runs the anti-virus scan hook when the
 * filesystem has scanning enabled (z_vscan).
 */
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Writes to append-only files must be opened with FAPPEND. */
	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Virus scan on open for non-empty, non-quarantined regular files
	 * (skipped inside the .zfs control directory).
	 */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Close a ZFS file.  Drops any POSIX locks / share reservations held by
 * the closing process, undoes the synchronous-open count taken in
 * zfs_open(), and runs the anti-virus scan hook when configured.
 */
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	/* Seeking at or past EOF always fails with ENXIO. */
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* ESRCH from the DMU means "not found"; report it as ENXIO. */
	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	/* Never move the offset backwards. */
	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/*
 * Vnode ioctl handler.  Services the seek-hole/seek-data ioctls via
 * zfs_holey(), fakes out a few legacy ioctls, and (on illumos) exposes
 * the filled-block count of an object.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}

/*
 * Look up the page backing [start, start + PAGESIZE) in vp's VM object
 * and shared-busy it for a coming DMU->page copy, clearing the (DEV_BSIZE
 * aligned) dirty subrange.  Returns NULL if there is no valid resident
 * page.  Caller must hold the VM object write lock.
 */
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	 * aligned boundaries, if the range is not aligned.  As a result a
	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
	 * the whole page would be considered clean despite having some dirty
	 * data.  For this reason we should shrink the range to DEV_BSIZE
	 * aligned boundaries before calling vm_page_clear_dirty.
	 */
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				/* Page was exclusively busy; look it up again. */
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp != NULL) {
			/* Resident but invalid page: treat as absent. */
			ASSERT(!pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			/* Paired with page_unbusy()'s pip_subtract. */
			vm_object_pip_add(obj, 1);
			pmap_remove_write(pp);
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}

/*
 * Release a page shared-busied by page_busy() and drop the paging-in-
 * progress reference taken on its object.
 */
static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}

/*
 * Look up and wire ("hold") the valid resident page backing offset
 * "start" in vp's VM object, sleeping if the page is exclusively busy.
 * Returns NULL when no valid page is resident.  Caller must hold the
 * VM object write lock.
 */
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}

/*
 * Drop the hold taken by page_hold().
 */
static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 *	On Write:	If we find a memory mapped page, we write to *both*
 *			the page and the dmu buffer.
509 */ 510static void 511update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, 512 int segflg, dmu_tx_t *tx) 513{ 514 vm_object_t obj; 515 struct sf_buf *sf; 516 caddr_t va; 517 int off; 518 519 ASSERT(segflg != UIO_NOCOPY); 520 ASSERT(vp->v_mount != NULL); 521 obj = vp->v_object; 522 ASSERT(obj != NULL); 523 524 off = start & PAGEOFFSET; 525 zfs_vmobject_wlock(obj); 526 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 527 vm_page_t pp; 528 int nbytes = imin(PAGESIZE - off, len); 529 530 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { 531 zfs_vmobject_wunlock(obj); 532 533 va = zfs_map_page(pp, &sf); 534 (void) dmu_read(os, oid, start+off, nbytes, 535 va+off, DMU_READ_PREFETCH);; 536 zfs_unmap_page(sf); 537 538 zfs_vmobject_wlock(obj); 539 page_unbusy(pp); 540 } 541 len -= nbytes; 542 off = 0; 543 } 544 vm_object_pip_wakeupn(obj, 0); 545 zfs_vmobject_wunlock(obj); 546} 547 548/* 549 * Read with UIO_NOCOPY flag means that sendfile(2) requests 550 * ZFS to populate a range of page cache pages with data. 551 * 552 * NOTE: this function could be optimized to pre-allocate 553 * all pages in advance, drain exclusive busy on all of them, 554 * map them into contiguous KVA region and populate them 555 * in one single dmu_read() call. 
 */
static int
mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	struct sf_buf *sf;
	vm_object_t obj;
	vm_page_t pp;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	/* Only the sendfile(2) path uses UIO_NOCOPY; offset is page aligned. */
	ASSERT(uio->uio_segflg == UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);
	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);

	zfs_vmobject_wlock(obj);
	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
		int bytes = MIN(PAGESIZE, len);

		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
		if (pp->valid == 0) {
			/* Newly allocated page: fill it from the DMU. */
			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
			error = dmu_read(os, zp->z_id, start, bytes, va,
			    DMU_READ_PREFETCH);
			/* Zero the tail of a short final page. */
			if (bytes != PAGESIZE && error == 0)
				bzero(va + bytes, PAGESIZE - bytes);
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			vm_page_sunbusy(pp);
			vm_page_lock(pp);
			if (error) {
				/* Free the page only if nobody else uses it. */
				if (pp->wire_count == 0 && pp->valid == 0 &&
				    !vm_page_busied(pp))
					vm_page_free(pp);
			} else {
				pp->valid = VM_PAGE_BITS_ALL;
				vm_page_activate(pp);
			}
			vm_page_unlock(pp);
		} else {
			/* Page already valid; nothing to read. */
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_sunbusy(pp);
		}
		if (error)
			break;
		/*
		 * No data is copied to the caller (UIO_NOCOPY); just
		 * advance the uio past the populated range.
		 */
		uio->uio_resid -= bytes;
		uio->uio_offset += bytes;
		len -= bytes;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 *	On Read:	We "read" preferentially from memory mapped pages,
 *			else we default from the dmu buffer.
 *
 * NOTE:	We will always "break up" the IO into PAGESIZE uiomoves when
 *		the file is memory mapped.
626 */ 627static int 628mappedread(vnode_t *vp, int nbytes, uio_t *uio) 629{ 630 znode_t *zp = VTOZ(vp); 631 vm_object_t obj; 632 int64_t start; 633 caddr_t va; 634 int len = nbytes; 635 int off; 636 int error = 0; 637 638 ASSERT(vp->v_mount != NULL); 639 obj = vp->v_object; 640 ASSERT(obj != NULL); 641 642 start = uio->uio_loffset; 643 off = start & PAGEOFFSET; 644 zfs_vmobject_wlock(obj); 645 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 646 vm_page_t pp; 647 uint64_t bytes = MIN(PAGESIZE - off, len); 648 649 if (pp = page_hold(vp, start)) { 650 struct sf_buf *sf; 651 caddr_t va; 652 653 zfs_vmobject_wunlock(obj); 654 va = zfs_map_page(pp, &sf); 655#ifdef illumos 656 error = uiomove(va + off, bytes, UIO_READ, uio); 657#else 658 error = vn_io_fault_uiomove(va + off, bytes, uio); 659#endif 660 zfs_unmap_page(sf); 661 zfs_vmobject_wlock(obj); 662 page_unhold(pp); 663 } else { 664 zfs_vmobject_wunlock(obj); 665 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 666 uio, bytes); 667 zfs_vmobject_wlock(obj); 668 } 669 len -= bytes; 670 off = 0; 671 if (error) 672 break; 673 } 674 zfs_vmobject_wunlock(obj); 675 return (error); 676} 677 678offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 679 680/* 681 * Read bytes from specified file into supplied buffer. 682 * 683 * IN: vp - vnode of file to be read from. 684 * uio - structure supplying read location, range info, 685 * and return buffer. 686 * ioflag - SYNC flags; used to provide FRSYNC semantics. 687 * cr - credentials of caller. 688 * ct - caller context 689 * 690 * OUT: uio - updated offset and range, buffer filled. 691 * 692 * RETURN: 0 on success, error code on failure. 
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Quarantined files may not be read. */
	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (zfsvfs->z_log &&
	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	/* Clamp the request to the data actually in the file. */
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

#ifdef illumos
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}
#endif	/* illumos */

	/* Copy out in zfs_read_chunk_size pieces, aligned to chunk size. */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

#ifdef __FreeBSD__
		if (uio->uio_segflg == UIO_NOCOPY)
			error = mappedread_sf(vp, nbytes, uio);
		else
#endif /* __FreeBSD__ */
		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 * IN:	vp	- vnode of file to be written to.
 *	uio	- structure supplying write location, range info,
 *		  and data buffer.
 *	ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *		  set if in append mode.
 *	cr	- credentials of caller.
 *	ct	- caller context (NFS/CIFS fem monitor only)
 *
 * OUT:	uio	- updated offset and range.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */

/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = MAXOFFSET_T;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error = 0;
	arc_buf_t	*abuf;
	iovec_t		*aiov = NULL;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;
	int		count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Attributes updated in one bulk sa_bulk_update() per chunk below. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
	 * callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef illumos
	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);
#endif

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	/* Clamp the write so it does not extend past the offset limit. */
	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = SET_ERROR(EDQUOT);
			break;
		}

		if (xuio && abuf == NULL) {
			/* Zero-copy path: take the loaned buffer directly. */
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (woff + nbytes > zp->z_size)
			vnode_pager_setsize(vp, woff + nbytes);

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
				    woff, abuf, tx);
			}
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}
		/* Keep any mmap'ed pages coherent with what we just wrote. */
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
			    zp->z_id, uio->uio_segflg, tx);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(vp, cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
#ifdef illumos
			ASSERT(error == 0);
#else
			ASSERT(error == 0 || error == EFAULT);
#endif
		}
		/*
		 * If we are replaying and eof is non zero then force
		 * the file size to the specified eof.  Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		if (error == 0)
			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		else
			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

#ifdef illumos
		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
#endif
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef __FreeBSD__
	/*
	 * EFAULT means that at least one page of the source buffer was not
	 * available.  VFS will re-try remaining I/O upon this error.
	 */
	if (error == EFAULT) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
#endif

	/*
	 * Honor synchronous semantics: an O_SYNC/O_DSYNC write, or a
	 * dataset with sync=always, must not return until the intent log
	 * for this file has been committed to stable storage.
	 */
	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Completion callback for dmu_sync(), also called directly by
 * zfs_get_data() on its error paths.  Releases everything acquired in
 * zfs_get_data(): the dbuf hold (if any), the range lock, and the vnode
 * hold; on success it records the just-written block in the lwb before
 * freeing the zgd itself.
 *
 *	IN:	zgd	- per-request state allocated by zfs_get_data();
 *			  freed here (ownership transfers to this function).
 *		error	- result of the preceding dmu_sync()/read attempt.
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
/*
 * Fault-injection knob: when set, the next indirect-write path through
 * zfs_get_data() fails with EIO, then the knob self-clears.
 */
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	/*
	 * The zgd carries all state acquired below; it is released in
	 * zfs_get_done(), either directly at the bottom of this function
	 * or from the dmu_sync() write-completion callback.
	 */
	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			/* Blocksize changed under us; drop the lock and retry. */
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			/*
			 * EALREADY: the block has already been written out
			 * by the syncing context; switch the record type to
			 * TX_WRITE2 and report success.
			 */
			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	/* Error path: release dbuf/range lock/vnode and free the zgd. */
	zfs_get_done(zgd, error);

	return (error);
}

/*
 * Check access permissions on a vnode.  Dispatches to the ACL-aware
 * check when ACE-style mask bits are requested, otherwise to the
 * traditional rwx check.
 *
 *	IN:	vp	- vnode to check access on.
 *		mode	- requested access mode.
 *		flag	- V_ACE_MASK selects ACE semantics.
 *		cr	- credentials of caller.
 *
 *	RETURN:	0 if access is allowed, error code otherwise.
 */
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Callback for vn_vget_ino_gen() used by the ".." case in
 * zfs_lookup_lock(): locks the vnode passed in 'arg' and drops its
 * reference on lock failure so the caller need not clean up.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int error;

	*vpp = arg;
	error = vn_lock(*vpp, lkflags);
	if (error != 0)
		vrele(*vpp);
	return (error);
}

/*
 * Lock the vnode 'vp' found by looking up 'name' in the locked
 * directory 'dvp', following the FreeBSD VOP_LOOKUP locking protocol.
 * Handles the "." (same vnode; may need upgrade/downgrade) and ".."
 * (parent vnode; needs the vn_vget_ino_gen() unlock/relock dance)
 * special cases.  On failure the reference on vp is dropped.
 */
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		/* "." or "": the result is dvp itself. */
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relock for the "." case could leave us with
			 * reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		/* Ordinary child entry: parent is locked before child. */
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 * IN:	dvp	- vnode of directory to search.
 *	nm	- name of entry to lookup.
 *	pnp	- full pathname to lookup [UNUSED].
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	rdir	- root directory vnode [UNUSED].
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	znode_t *zp;
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error = 0;

	/*
	 * Fast path lookup, however we must skip DNLC lookup
	 * for case folding or normalizing lookups because the
	 * DNLC code only stores the passed in name.  This means
	 * creating 'a' and removing 'A' on a case insensitive
	 * file system would work, but DNLC still thinks 'a'
	 * exists and won't let you create it again on the next
	 * pass through fast path.
	 */
	if (!(flags & LOOKUP_XATTR)) {
		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
#endif

		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */
		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			vrele(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Check accessibility of directory.
	 */
	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}


	/*
	 * First handle the special cases.
	 */
	if ((cnp->cn_flags & ISDOTDOT) != 0) {
		/*
		 * If we are a snapshot mounted under .zfs, return
		 * the vp for the snapshot directory.
		 */
		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
			struct componentname cn;
			vnode_t *zfsctl_vp;
			int ltype;

			ZFS_EXIT(zfsvfs);
			ltype = VOP_ISLOCKED(dvp);
			VOP_UNLOCK(dvp, 0);
			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
			    &zfsctl_vp);
			if (error == 0) {
				cn.cn_nameptr = "snapshot";
				cn.cn_namelen = strlen(cn.cn_nameptr);
				cn.cn_nameiop = cnp->cn_nameiop;
				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
				cn.cn_lkflags = cnp->cn_lkflags;
				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
				vput(zfsctl_vp);
			}
			/* Restore dvp's original lock state before returning. */
			vn_lock(dvp, ltype | LK_RETRY);
			return (error);
		}
	}
	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
		ZFS_EXIT(zfsvfs);
		/* The .zfs control directory cannot be created or removed. */
		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
			return (SET_ERROR(ENOTSUP));
		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
		return (error);
	}

	/*
	 * The loop retries the lookup if the parent-child relationship
	 * changes during the dot-dot locking complexities.
	 */
	for (;;) {
		uint64_t parent;

		error = zfs_dirlook(zdp, nm, &zp);
		if (error == 0)
			*vpp = ZTOV(zp);

		ZFS_EXIT(zfsvfs);
		if (error != 0)
			break;

		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
		if (error != 0) {
			/*
			 * If we've got a locking error, then the vnode
			 * got reclaimed because of a force unmount.
			 * We never enter doomed vnodes into the name cache.
			 */
			*vpp = NULL;
			return (error);
		}

		if ((cnp->cn_flags & ISDOTDOT) == 0)
			break;

		/*
		 * Dot-dot case: re-verify that zp is still zdp's parent
		 * after the unlocked window in zfs_lookup_lock().
		 */
		ZFS_ENTER(zfsvfs);
		if (zdp->z_sa_hdl == NULL) {
			error = SET_ERROR(EIO);
		} else {
			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
			    &parent, sizeof (parent));
		}
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			vput(ZTOV(zp));
			break;
		}
		if (zp->z_id == parent) {
			ZFS_EXIT(zfsvfs);
			break;
		}
		/* Relationship changed; drop the vnode and retry. */
		vput(ZTOV(zp));
	}

/*
 * NOTE(review): no goto targets this label in the function; control
 * reaches it by falling out of the loop above.
 */
out:
	if (error != 0)
		*vpp = NULL;

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}

	/* Insert name into cache (as non-existent) if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(dvp, NULL, cnp);

	/* Insert name into cache if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}

	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 * IN:	dvp	- vnode of directory to put new file entry in.
 *	name	- name of new file entry.
 *	vap	- attributes of new file.
 *	excl	- flag indicating exclusive or non-exclusive mode.
 *	mode	- mode to open file with.
 *	cr	- credentials of caller.
 *	flag	- large file flag [UNUSED].
 *	ct	- caller context
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created or trunc'd entry.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	void		*vsecp = NULL;	/* no ACL passed through this entry point */
	int		flag = 0;
	uint64_t	txtype;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/* Reject names that are not valid UTF-8 when utf8only is set. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	*vpp = NULL;

	/* Strip the sticky bit if the caller may not set it. */
	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	/* ZNEW: fail with EEXIST if the name is already present. */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		goto out;
	}

	/*
	 * We only support the creation of regular files in
	 * extended attribute directories.
	 */

	if ((dzp->z_pflags & ZFS_XATTR) &&
	    (vap->va_type != VREG)) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids)) != 0)
		goto out;

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	/* Reserve a vnode before entering the transaction. */
	getnewvnode_reserve(1);

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
	    vsecp, acl_ids.z_fuidp, vap);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

out:
	if (error == 0) {
		*vpp = ZTOV(zp);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 * IN:	dvp	- vnode of directory to remove entry from.
 *	name	- name of entry to remove.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */

/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	znode_t		*xzp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	/* NOTE(review): acl_obj appears unused in this function. */
	uint64_t	acl_obj, xattr_obj;
	uint64_t	obj = 0;
	dmu_tx_t	*tx;
	/* NOTE(review): toobig appears unused in this function. */
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;
	/* NOTE(review): redundant -- zp was already initialized to VTOZ(vp). */
	zp = VTOZ(vp);

	xattr_obj = 0;
	xzp = NULL;

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not declared in this function; this can
	 * only compile if vnevent_remove() is a macro that ignores its
	 * arguments -- confirm against the vnevent definitions.
	 */
	vnevent_remove(vp, dvp, name, ct);

	obj = zp->z_id;

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
	}

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);

	if (xzp) {
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * The znode went onto the unlinked set; suppress syncing of
		 * its pages since the file is going away.
		 */
		zfs_unlinked_add(zp, tx);
		vp->v_vflag |= VV_NOSYNC;
	}

	txtype = TX_REMOVE;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
out:

	if (xzp)
		vrele(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 * IN:	dvp	- vnode of directory to add subdir to.
 *	dirname	- name of new directory.
 *	vap	- attributes of new directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created directory.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    ((vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Directories may not be created inside extended attribute dirs. */
	if (dzp->z_pflags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/* Reject names that are not valid UTF-8 when utf8only is set. */
	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
	*vpp = NULL;

	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 * IN:	dvp	- vnode of directory to remove from.
 *	name	- name of directory to be removed.
 *	cwd	- vnode of current working directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;


	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not declared in this function; this can
	 * only compile if vnevent_rmdir() is a macro that ignores its
	 * arguments -- confirm against the vnevent definitions.
	 */
	vnevent_rmdir(vp, dvp, name, ct);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	/* Removing a directory typically frees space on net. */
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Drop namecache entries under the parent before unlinking. */
	cache_purge(dvp);

	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
	}

	dmu_tx_commit(tx);

	cache_purge(vp);
out:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 * IN:	vp	- vnode of directory to read.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *	eofp	- set to true if end-of-file detected.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
2296 */ 2297/* ARGSUSED */ 2298static int 2299zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) 2300{ 2301 znode_t *zp = VTOZ(vp); 2302 iovec_t *iovp; 2303 edirent_t *eodp; 2304 dirent64_t *odp; 2305 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2306 objset_t *os; 2307 caddr_t outbuf; 2308 size_t bufsize; 2309 zap_cursor_t zc; 2310 zap_attribute_t zap; 2311 uint_t bytes_wanted; 2312 uint64_t offset; /* must be unsigned; checks for < 1 */ 2313 uint64_t parent; 2314 int local_eof; 2315 int outcount; 2316 int error; 2317 uint8_t prefetch; 2318 boolean_t check_sysattrs; 2319 uint8_t type; 2320 int ncooks; 2321 u_long *cooks = NULL; 2322 int flags = 0; 2323 2324 ZFS_ENTER(zfsvfs); 2325 ZFS_VERIFY_ZP(zp); 2326 2327 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 2328 &parent, sizeof (parent))) != 0) { 2329 ZFS_EXIT(zfsvfs); 2330 return (error); 2331 } 2332 2333 /* 2334 * If we are not given an eof variable, 2335 * use a local one. 2336 */ 2337 if (eofp == NULL) 2338 eofp = &local_eof; 2339 2340 /* 2341 * Check for valid iov_len. 2342 */ 2343 if (uio->uio_iov->iov_len <= 0) { 2344 ZFS_EXIT(zfsvfs); 2345 return (SET_ERROR(EINVAL)); 2346 } 2347 2348 /* 2349 * Quit if directory has been removed (posix) 2350 */ 2351 if ((*eofp = zp->z_unlinked) != 0) { 2352 ZFS_EXIT(zfsvfs); 2353 return (0); 2354 } 2355 2356 error = 0; 2357 os = zfsvfs->z_os; 2358 offset = uio->uio_loffset; 2359 prefetch = zp->z_zn_prefetch; 2360 2361 /* 2362 * Initialize the iterator cursor. 2363 */ 2364 if (offset <= 3) { 2365 /* 2366 * Start iteration from the beginning of the directory. 2367 */ 2368 zap_cursor_init(&zc, os, zp->z_id); 2369 } else { 2370 /* 2371 * The offset is a serialized cursor. 2372 */ 2373 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 2374 } 2375 2376 /* 2377 * Get space to change directory entries into fs independent format. 
2378 */ 2379 iovp = uio->uio_iov; 2380 bytes_wanted = iovp->iov_len; 2381 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 2382 bufsize = bytes_wanted; 2383 outbuf = kmem_alloc(bufsize, KM_SLEEP); 2384 odp = (struct dirent64 *)outbuf; 2385 } else { 2386 bufsize = bytes_wanted; 2387 outbuf = NULL; 2388 odp = (struct dirent64 *)iovp->iov_base; 2389 } 2390 eodp = (struct edirent *)odp; 2391 2392 if (ncookies != NULL) { 2393 /* 2394 * Minimum entry size is dirent size and 1 byte for a file name. 2395 */ 2396 ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); 2397 cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); 2398 *cookies = cooks; 2399 *ncookies = ncooks; 2400 } 2401 /* 2402 * If this VFS supports the system attribute view interface; and 2403 * we're looking at an extended attribute directory; and we care 2404 * about normalization conflicts on this vfs; then we must check 2405 * for normalization conflicts with the sysattr name space. 2406 */ 2407#ifdef TODO 2408 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 2409 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 2410 (flags & V_RDDIR_ENTFLAGS); 2411#else 2412 check_sysattrs = 0; 2413#endif 2414 2415 /* 2416 * Transform to file-system independent format 2417 */ 2418 outcount = 0; 2419 while (outcount < bytes_wanted) { 2420 ino64_t objnum; 2421 ushort_t reclen; 2422 off64_t *next = NULL; 2423 2424 /* 2425 * Special case `.', `..', and `.zfs'. 
2426 */ 2427 if (offset == 0) { 2428 (void) strcpy(zap.za_name, "."); 2429 zap.za_normalization_conflict = 0; 2430 objnum = zp->z_id; 2431 type = DT_DIR; 2432 } else if (offset == 1) { 2433 (void) strcpy(zap.za_name, ".."); 2434 zap.za_normalization_conflict = 0; 2435 objnum = parent; 2436 type = DT_DIR; 2437 } else if (offset == 2 && zfs_show_ctldir(zp)) { 2438 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 2439 zap.za_normalization_conflict = 0; 2440 objnum = ZFSCTL_INO_ROOT; 2441 type = DT_DIR; 2442 } else { 2443 /* 2444 * Grab next entry. 2445 */ 2446 if (error = zap_cursor_retrieve(&zc, &zap)) { 2447 if ((*eofp = (error == ENOENT)) != 0) 2448 break; 2449 else 2450 goto update; 2451 } 2452 2453 if (zap.za_integer_length != 8 || 2454 zap.za_num_integers != 1) { 2455 cmn_err(CE_WARN, "zap_readdir: bad directory " 2456 "entry, obj = %lld, offset = %lld\n", 2457 (u_longlong_t)zp->z_id, 2458 (u_longlong_t)offset); 2459 error = SET_ERROR(ENXIO); 2460 goto update; 2461 } 2462 2463 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 2464 /* 2465 * MacOS X can extract the object type here such as: 2466 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2467 */ 2468 type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2469 2470 if (check_sysattrs && !zap.za_normalization_conflict) { 2471#ifdef TODO 2472 zap.za_normalization_conflict = 2473 xattr_sysattr_casechk(zap.za_name); 2474#else 2475 panic("%s:%u: TODO", __func__, __LINE__); 2476#endif 2477 } 2478 } 2479 2480 if (flags & V_RDDIR_ACCFILTER) { 2481 /* 2482 * If we have no access at all, don't include 2483 * this entry in the returned information 2484 */ 2485 znode_t *ezp; 2486 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) 2487 goto skip_entry; 2488 if (!zfs_has_access(ezp, cr)) { 2489 vrele(ZTOV(ezp)); 2490 goto skip_entry; 2491 } 2492 vrele(ZTOV(ezp)); 2493 } 2494 2495 if (flags & V_RDDIR_ENTFLAGS) 2496 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 2497 else 2498 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 2499 2500 /* 
2501 * Will this entry fit in the buffer? 2502 */ 2503 if (outcount + reclen > bufsize) { 2504 /* 2505 * Did we manage to fit anything in the buffer? 2506 */ 2507 if (!outcount) { 2508 error = SET_ERROR(EINVAL); 2509 goto update; 2510 } 2511 break; 2512 } 2513 if (flags & V_RDDIR_ENTFLAGS) { 2514 /* 2515 * Add extended flag entry: 2516 */ 2517 eodp->ed_ino = objnum; 2518 eodp->ed_reclen = reclen; 2519 /* NOTE: ed_off is the offset for the *next* entry */ 2520 next = &(eodp->ed_off); 2521 eodp->ed_eflags = zap.za_normalization_conflict ? 2522 ED_CASE_CONFLICT : 0; 2523 (void) strncpy(eodp->ed_name, zap.za_name, 2524 EDIRENT_NAMELEN(reclen)); 2525 eodp = (edirent_t *)((intptr_t)eodp + reclen); 2526 } else { 2527 /* 2528 * Add normal entry: 2529 */ 2530 odp->d_ino = objnum; 2531 odp->d_reclen = reclen; 2532 odp->d_namlen = strlen(zap.za_name); 2533 (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); 2534 odp->d_type = type; 2535 odp = (dirent64_t *)((intptr_t)odp + reclen); 2536 } 2537 outcount += reclen; 2538 2539 ASSERT(outcount <= bufsize); 2540 2541 /* Prefetch znode */ 2542 if (prefetch) 2543 dmu_prefetch(os, objnum, 0, 0, 0, 2544 ZIO_PRIORITY_SYNC_READ); 2545 2546 skip_entry: 2547 /* 2548 * Move to the next entry, fill in the previous offset. 
2549 */ 2550 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2551 zap_cursor_advance(&zc); 2552 offset = zap_cursor_serialize(&zc); 2553 } else { 2554 offset += 1; 2555 } 2556 2557 if (cooks != NULL) { 2558 *cooks++ = offset; 2559 ncooks--; 2560 KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); 2561 } 2562 } 2563 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2564 2565 /* Subtract unused cookies */ 2566 if (ncookies != NULL) 2567 *ncookies -= ncooks; 2568 2569 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 2570 iovp->iov_base += outcount; 2571 iovp->iov_len -= outcount; 2572 uio->uio_resid -= outcount; 2573 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 2574 /* 2575 * Reset the pointer. 2576 */ 2577 offset = uio->uio_loffset; 2578 } 2579 2580update: 2581 zap_cursor_fini(&zc); 2582 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) 2583 kmem_free(outbuf, bufsize); 2584 2585 if (error == ENOENT) 2586 error = 0; 2587 2588 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2589 2590 uio->uio_loffset = offset; 2591 ZFS_EXIT(zfsvfs); 2592 if (error != 0 && cookies != NULL) { 2593 free(*cookies, M_TEMP); 2594 *cookies = NULL; 2595 *ncookies = 0; 2596 } 2597 return (error); 2598} 2599 2600ulong_t zfs_fsync_sync_cnt = 4; 2601 2602static int 2603zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 2604{ 2605 znode_t *zp = VTOZ(vp); 2606 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2607 2608 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); 2609 2610 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 2611 ZFS_ENTER(zfsvfs); 2612 ZFS_VERIFY_ZP(zp); 2613 zil_commit(zfsvfs->z_log, zp->z_id); 2614 ZFS_EXIT(zfsvfs); 2615 } 2616 return (0); 2617} 2618 2619 2620/* 2621 * Get the requested file attributes and place them in the provided 2622 * vattr structure. 2623 * 2624 * IN: vp - vnode of file. 2625 * vap - va_mask identifies requested attributes. 
2626 * If AT_XVATTR set, then optional attrs are requested 2627 * flags - ATTR_NOACLCHECK (CIFS server context) 2628 * cr - credentials of caller. 2629 * ct - caller context 2630 * 2631 * OUT: vap - attribute values. 2632 * 2633 * RETURN: 0 (always succeeds). 2634 */ 2635/* ARGSUSED */ 2636static int 2637zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2638 caller_context_t *ct) 2639{ 2640 znode_t *zp = VTOZ(vp); 2641 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2642 int error = 0; 2643 uint32_t blksize; 2644 u_longlong_t nblocks; 2645 uint64_t links; 2646 uint64_t mtime[2], ctime[2], crtime[2], rdev; 2647 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2648 xoptattr_t *xoap = NULL; 2649 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2650 sa_bulk_attr_t bulk[4]; 2651 int count = 0; 2652 2653 ZFS_ENTER(zfsvfs); 2654 ZFS_VERIFY_ZP(zp); 2655 2656 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); 2657 2658 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 2659 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 2660 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); 2661 if (vp->v_type == VBLK || vp->v_type == VCHR) 2662 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, 2663 &rdev, 8); 2664 2665 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { 2666 ZFS_EXIT(zfsvfs); 2667 return (error); 2668 } 2669 2670 /* 2671 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 2672 * Also, if we are the owner don't bother, since owner should 2673 * always be allowed to read basic attributes of file. 2674 */ 2675 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && 2676 (vap->va_uid != crgetuid(cr))) { 2677 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, 2678 skipaclchk, cr)) { 2679 ZFS_EXIT(zfsvfs); 2680 return (error); 2681 } 2682 } 2683 2684 /* 2685 * Return all attributes. 
It's cheaper to provide the answer 2686 * than to determine whether we were asked the question. 2687 */ 2688 2689 vap->va_type = IFTOVT(zp->z_mode); 2690 vap->va_mode = zp->z_mode & ~S_IFMT; 2691#ifdef illumos 2692 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 2693#else 2694 vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; 2695#endif 2696 vap->va_nodeid = zp->z_id; 2697 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) 2698 links = zp->z_links + 1; 2699 else 2700 links = zp->z_links; 2701 vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */ 2702 vap->va_size = zp->z_size; 2703#ifdef illumos 2704 vap->va_rdev = vp->v_rdev; 2705#else 2706 if (vp->v_type == VBLK || vp->v_type == VCHR) 2707 vap->va_rdev = zfs_cmpldev(rdev); 2708#endif 2709 vap->va_seq = zp->z_seq; 2710 vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ 2711 vap->va_filerev = zp->z_seq; 2712 2713 /* 2714 * Add in any requested optional attributes and the create time. 2715 * Also set the corresponding bits in the returned attribute bitmap. 
2716 */ 2717 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { 2718 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 2719 xoap->xoa_archive = 2720 ((zp->z_pflags & ZFS_ARCHIVE) != 0); 2721 XVA_SET_RTN(xvap, XAT_ARCHIVE); 2722 } 2723 2724 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 2725 xoap->xoa_readonly = 2726 ((zp->z_pflags & ZFS_READONLY) != 0); 2727 XVA_SET_RTN(xvap, XAT_READONLY); 2728 } 2729 2730 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 2731 xoap->xoa_system = 2732 ((zp->z_pflags & ZFS_SYSTEM) != 0); 2733 XVA_SET_RTN(xvap, XAT_SYSTEM); 2734 } 2735 2736 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 2737 xoap->xoa_hidden = 2738 ((zp->z_pflags & ZFS_HIDDEN) != 0); 2739 XVA_SET_RTN(xvap, XAT_HIDDEN); 2740 } 2741 2742 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2743 xoap->xoa_nounlink = 2744 ((zp->z_pflags & ZFS_NOUNLINK) != 0); 2745 XVA_SET_RTN(xvap, XAT_NOUNLINK); 2746 } 2747 2748 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2749 xoap->xoa_immutable = 2750 ((zp->z_pflags & ZFS_IMMUTABLE) != 0); 2751 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 2752 } 2753 2754 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2755 xoap->xoa_appendonly = 2756 ((zp->z_pflags & ZFS_APPENDONLY) != 0); 2757 XVA_SET_RTN(xvap, XAT_APPENDONLY); 2758 } 2759 2760 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2761 xoap->xoa_nodump = 2762 ((zp->z_pflags & ZFS_NODUMP) != 0); 2763 XVA_SET_RTN(xvap, XAT_NODUMP); 2764 } 2765 2766 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 2767 xoap->xoa_opaque = 2768 ((zp->z_pflags & ZFS_OPAQUE) != 0); 2769 XVA_SET_RTN(xvap, XAT_OPAQUE); 2770 } 2771 2772 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2773 xoap->xoa_av_quarantined = 2774 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); 2775 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 2776 } 2777 2778 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2779 xoap->xoa_av_modified = 2780 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); 2781 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 2782 } 2783 2784 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && 2785 vp->v_type == VREG) { 2786 
zfs_sa_get_scanstamp(zp, xvap); 2787 } 2788 2789 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2790 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); 2791 XVA_SET_RTN(xvap, XAT_REPARSE); 2792 } 2793 if (XVA_ISSET_REQ(xvap, XAT_GEN)) { 2794 xoap->xoa_generation = zp->z_gen; 2795 XVA_SET_RTN(xvap, XAT_GEN); 2796 } 2797 2798 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { 2799 xoap->xoa_offline = 2800 ((zp->z_pflags & ZFS_OFFLINE) != 0); 2801 XVA_SET_RTN(xvap, XAT_OFFLINE); 2802 } 2803 2804 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { 2805 xoap->xoa_sparse = 2806 ((zp->z_pflags & ZFS_SPARSE) != 0); 2807 XVA_SET_RTN(xvap, XAT_SPARSE); 2808 } 2809 } 2810 2811 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); 2812 ZFS_TIME_DECODE(&vap->va_mtime, mtime); 2813 ZFS_TIME_DECODE(&vap->va_ctime, ctime); 2814 ZFS_TIME_DECODE(&vap->va_birthtime, crtime); 2815 2816 2817 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 2818 vap->va_blksize = blksize; 2819 vap->va_bytes = nblocks << 9; /* nblocks * 512 */ 2820 2821 if (zp->z_blksz == 0) { 2822 /* 2823 * Block size hasn't been set; suggest maximal I/O transfers. 2824 */ 2825 vap->va_blksize = zfsvfs->z_max_blksz; 2826 } 2827 2828 ZFS_EXIT(zfsvfs); 2829 return (0); 2830} 2831 2832/* 2833 * Set the file attributes to the values contained in the 2834 * vattr structure. 2835 * 2836 * IN: vp - vnode of file to be modified. 2837 * vap - new attribute values. 2838 * If AT_XVATTR set, then optional attrs are being set 2839 * flags - ATTR_UTIME set if non-default time values provided. 2840 * - ATTR_NOACLCHECK (CIFS context only). 2841 * cr - credentials of caller. 2842 * ct - caller context 2843 * 2844 * RETURN: 0 on success, error code on failure. 2845 * 2846 * Timestamps: 2847 * vp - ctime updated, mtime updated if size changed. 
2848 */ 2849/* ARGSUSED */ 2850static int 2851zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2852 caller_context_t *ct) 2853{ 2854 znode_t *zp = VTOZ(vp); 2855 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2856 zilog_t *zilog; 2857 dmu_tx_t *tx; 2858 vattr_t oldva; 2859 xvattr_t tmpxvattr; 2860 uint_t mask = vap->va_mask; 2861 uint_t saved_mask = 0; 2862 uint64_t saved_mode; 2863 int trim_mask = 0; 2864 uint64_t new_mode; 2865 uint64_t new_uid, new_gid; 2866 uint64_t xattr_obj; 2867 uint64_t mtime[2], ctime[2]; 2868 znode_t *attrzp; 2869 int need_policy = FALSE; 2870 int err, err2; 2871 zfs_fuid_info_t *fuidp = NULL; 2872 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2873 xoptattr_t *xoap; 2874 zfs_acl_t *aclp; 2875 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2876 boolean_t fuid_dirtied = B_FALSE; 2877 sa_bulk_attr_t bulk[7], xattr_bulk[7]; 2878 int count = 0, xattr_count = 0; 2879 2880 if (mask == 0) 2881 return (0); 2882 2883 if (mask & AT_NOSET) 2884 return (SET_ERROR(EINVAL)); 2885 2886 ZFS_ENTER(zfsvfs); 2887 ZFS_VERIFY_ZP(zp); 2888 2889 zilog = zfsvfs->z_log; 2890 2891 /* 2892 * Make sure that if we have ephemeral uid/gid or xvattr specified 2893 * that file system is at proper version level 2894 */ 2895 2896 if (zfsvfs->z_use_fuids == B_FALSE && 2897 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 2898 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || 2899 (mask & AT_XVATTR))) { 2900 ZFS_EXIT(zfsvfs); 2901 return (SET_ERROR(EINVAL)); 2902 } 2903 2904 if (mask & AT_SIZE && vp->v_type == VDIR) { 2905 ZFS_EXIT(zfsvfs); 2906 return (SET_ERROR(EISDIR)); 2907 } 2908 2909 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { 2910 ZFS_EXIT(zfsvfs); 2911 return (SET_ERROR(EINVAL)); 2912 } 2913 2914 /* 2915 * If this is an xvattr_t, then get a pointer to the structure of 2916 * optional attributes. If this is NULL, then we have a vattr_t. 
2917 */ 2918 xoap = xva_getxoptattr(xvap); 2919 2920 xva_init(&tmpxvattr); 2921 2922 /* 2923 * Immutable files can only alter immutable bit and atime 2924 */ 2925 if ((zp->z_pflags & ZFS_IMMUTABLE) && 2926 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || 2927 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 2928 ZFS_EXIT(zfsvfs); 2929 return (SET_ERROR(EPERM)); 2930 } 2931 2932 /* 2933 * Note: ZFS_READONLY is handled in zfs_zaccess_common. 2934 */ 2935 2936 /* 2937 * Verify timestamps doesn't overflow 32 bits. 2938 * ZFS can handle large timestamps, but 32bit syscalls can't 2939 * handle times greater than 2039. This check should be removed 2940 * once large timestamps are fully supported. 2941 */ 2942 if (mask & (AT_ATIME | AT_MTIME)) { 2943 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2944 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2945 ZFS_EXIT(zfsvfs); 2946 return (SET_ERROR(EOVERFLOW)); 2947 } 2948 } 2949 if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && 2950 TIMESPEC_OVERFLOW(&vap->va_birthtime)) { 2951 ZFS_EXIT(zfsvfs); 2952 return (SET_ERROR(EOVERFLOW)); 2953 } 2954 2955 attrzp = NULL; 2956 aclp = NULL; 2957 2958 /* Can this be moved to before the top label? */ 2959 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 2960 ZFS_EXIT(zfsvfs); 2961 return (SET_ERROR(EROFS)); 2962 } 2963 2964 /* 2965 * First validate permissions 2966 */ 2967 2968 if (mask & AT_SIZE) { 2969 /* 2970 * XXX - Note, we are not providing any open 2971 * mode flags here (like FNDELAY), so we may 2972 * block if there are locks present... this 2973 * should be addressed in openat(). 2974 */ 2975 /* XXX - would it be OK to generate a log record here? 
*/ 2976 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2977 if (err) { 2978 ZFS_EXIT(zfsvfs); 2979 return (err); 2980 } 2981 } 2982 2983 if (mask & (AT_ATIME|AT_MTIME) || 2984 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2985 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2986 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2987 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 2988 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 2989 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2990 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 2991 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2992 skipaclchk, cr); 2993 } 2994 2995 if (mask & (AT_UID|AT_GID)) { 2996 int idmask = (mask & (AT_UID|AT_GID)); 2997 int take_owner; 2998 int take_group; 2999 3000 /* 3001 * NOTE: even if a new mode is being set, 3002 * we may clear S_ISUID/S_ISGID bits. 3003 */ 3004 3005 if (!(mask & AT_MODE)) 3006 vap->va_mode = zp->z_mode; 3007 3008 /* 3009 * Take ownership or chgrp to group we are a member of 3010 */ 3011 3012 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 3013 take_group = (mask & AT_GID) && 3014 zfs_groupmember(zfsvfs, vap->va_gid, cr); 3015 3016 /* 3017 * If both AT_UID and AT_GID are set then take_owner and 3018 * take_group must both be set in order to allow taking 3019 * ownership. 
3020 * 3021 * Otherwise, send the check through secpolicy_vnode_setattr() 3022 * 3023 */ 3024 3025 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 3026 ((idmask == AT_UID) && take_owner) || 3027 ((idmask == AT_GID) && take_group)) { 3028 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 3029 skipaclchk, cr) == 0) { 3030 /* 3031 * Remove setuid/setgid for non-privileged users 3032 */ 3033 secpolicy_setid_clear(vap, vp, cr); 3034 trim_mask = (mask & (AT_UID|AT_GID)); 3035 } else { 3036 need_policy = TRUE; 3037 } 3038 } else { 3039 need_policy = TRUE; 3040 } 3041 } 3042 3043 oldva.va_mode = zp->z_mode; 3044 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 3045 if (mask & AT_XVATTR) { 3046 /* 3047 * Update xvattr mask to include only those attributes 3048 * that are actually changing. 3049 * 3050 * the bits will be restored prior to actually setting 3051 * the attributes so the caller thinks they were set. 3052 */ 3053 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 3054 if (xoap->xoa_appendonly != 3055 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 3056 need_policy = TRUE; 3057 } else { 3058 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 3059 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); 3060 } 3061 } 3062 3063 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 3064 if (xoap->xoa_nounlink != 3065 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 3066 need_policy = TRUE; 3067 } else { 3068 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 3069 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); 3070 } 3071 } 3072 3073 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 3074 if (xoap->xoa_immutable != 3075 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 3076 need_policy = TRUE; 3077 } else { 3078 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 3079 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); 3080 } 3081 } 3082 3083 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 3084 if (xoap->xoa_nodump != 3085 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 3086 need_policy = TRUE; 3087 } else { 3088 XVA_CLR_REQ(xvap, XAT_NODUMP); 3089 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); 3090 } 3091 } 3092 3093 if 
(XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 3094 if (xoap->xoa_av_modified != 3095 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 3096 need_policy = TRUE; 3097 } else { 3098 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 3099 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); 3100 } 3101 } 3102 3103 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 3104 if ((vp->v_type != VREG && 3105 xoap->xoa_av_quarantined) || 3106 xoap->xoa_av_quarantined != 3107 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 3108 need_policy = TRUE; 3109 } else { 3110 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 3111 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); 3112 } 3113 } 3114 3115 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 3116 ZFS_EXIT(zfsvfs); 3117 return (SET_ERROR(EPERM)); 3118 } 3119 3120 if (need_policy == FALSE && 3121 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 3122 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 3123 need_policy = TRUE; 3124 } 3125 } 3126 3127 if (mask & AT_MODE) { 3128 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 3129 err = secpolicy_setid_setsticky_clear(vp, vap, 3130 &oldva, cr); 3131 if (err) { 3132 ZFS_EXIT(zfsvfs); 3133 return (err); 3134 } 3135 trim_mask |= AT_MODE; 3136 } else { 3137 need_policy = TRUE; 3138 } 3139 } 3140 3141 if (need_policy) { 3142 /* 3143 * If trim_mask is set then take ownership 3144 * has been granted or write_acl is present and user 3145 * has the ability to modify mode. In that case remove 3146 * UID|GID and or MODE from mask so that 3147 * secpolicy_vnode_setattr() doesn't revoke it. 3148 */ 3149 3150 if (trim_mask) { 3151 saved_mask = vap->va_mask; 3152 vap->va_mask &= ~trim_mask; 3153 if (trim_mask & AT_MODE) { 3154 /* 3155 * Save the mode, as secpolicy_vnode_setattr() 3156 * will overwrite it with ova.va_mode. 
3157 */ 3158 saved_mode = vap->va_mode; 3159 } 3160 } 3161 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 3162 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 3163 if (err) { 3164 ZFS_EXIT(zfsvfs); 3165 return (err); 3166 } 3167 3168 if (trim_mask) { 3169 vap->va_mask |= saved_mask; 3170 if (trim_mask & AT_MODE) { 3171 /* 3172 * Recover the mode after 3173 * secpolicy_vnode_setattr(). 3174 */ 3175 vap->va_mode = saved_mode; 3176 } 3177 } 3178 } 3179 3180 /* 3181 * secpolicy_vnode_setattr, or take ownership may have 3182 * changed va_mask 3183 */ 3184 mask = vap->va_mask; 3185 3186 if ((mask & (AT_UID | AT_GID))) { 3187 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 3188 &xattr_obj, sizeof (xattr_obj)); 3189 3190 if (err == 0 && xattr_obj) { 3191 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); 3192 if (err == 0) { 3193 err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); 3194 if (err != 0) 3195 vrele(ZTOV(attrzp)); 3196 } 3197 if (err) 3198 goto out2; 3199 } 3200 if (mask & AT_UID) { 3201 new_uid = zfs_fuid_create(zfsvfs, 3202 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 3203 if (new_uid != zp->z_uid && 3204 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { 3205 if (attrzp) 3206 vput(ZTOV(attrzp)); 3207 err = SET_ERROR(EDQUOT); 3208 goto out2; 3209 } 3210 } 3211 3212 if (mask & AT_GID) { 3213 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, 3214 cr, ZFS_GROUP, &fuidp); 3215 if (new_gid != zp->z_gid && 3216 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { 3217 if (attrzp) 3218 vput(ZTOV(attrzp)); 3219 err = SET_ERROR(EDQUOT); 3220 goto out2; 3221 } 3222 } 3223 } 3224 tx = dmu_tx_create(zfsvfs->z_os); 3225 3226 if (mask & AT_MODE) { 3227 uint64_t pmode = zp->z_mode; 3228 uint64_t acl_obj; 3229 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 3230 3231 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && 3232 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 3233 err = SET_ERROR(EPERM); 3234 goto out; 3235 } 3236 3237 if (err = zfs_acl_chmod_setattr(zp, 
&aclp, new_mode)) 3238 goto out; 3239 3240 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 3241 /* 3242 * Are we upgrading ACL from old V0 format 3243 * to V1 format? 3244 */ 3245 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 3246 zfs_znode_acl_version(zp) == 3247 ZFS_ACL_VERSION_INITIAL) { 3248 dmu_tx_hold_free(tx, acl_obj, 0, 3249 DMU_OBJECT_END); 3250 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3251 0, aclp->z_acl_bytes); 3252 } else { 3253 dmu_tx_hold_write(tx, acl_obj, 0, 3254 aclp->z_acl_bytes); 3255 } 3256 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3257 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3258 0, aclp->z_acl_bytes); 3259 } 3260 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3261 } else { 3262 if ((mask & AT_XVATTR) && 3263 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3264 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3265 else 3266 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3267 } 3268 3269 if (attrzp) { 3270 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 3271 } 3272 3273 fuid_dirtied = zfsvfs->z_fuid_dirty; 3274 if (fuid_dirtied) 3275 zfs_fuid_txhold(zfsvfs, tx); 3276 3277 zfs_sa_upgrade_txholds(tx, zp); 3278 3279 err = dmu_tx_assign(tx, TXG_WAIT); 3280 if (err) 3281 goto out; 3282 3283 count = 0; 3284 /* 3285 * Set each attribute requested. 3286 * We group settings according to the locks they need to acquire. 3287 * 3288 * Note: you cannot set ctime directly, although it will be 3289 * updated as a side-effect of calling this function. 
3290 */ 3291 3292 if (mask & (AT_UID|AT_GID|AT_MODE)) 3293 mutex_enter(&zp->z_acl_lock); 3294 3295 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 3296 &zp->z_pflags, sizeof (zp->z_pflags)); 3297 3298 if (attrzp) { 3299 if (mask & (AT_UID|AT_GID|AT_MODE)) 3300 mutex_enter(&attrzp->z_acl_lock); 3301 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3302 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 3303 sizeof (attrzp->z_pflags)); 3304 } 3305 3306 if (mask & (AT_UID|AT_GID)) { 3307 3308 if (mask & AT_UID) { 3309 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 3310 &new_uid, sizeof (new_uid)); 3311 zp->z_uid = new_uid; 3312 if (attrzp) { 3313 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3314 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 3315 sizeof (new_uid)); 3316 attrzp->z_uid = new_uid; 3317 } 3318 } 3319 3320 if (mask & AT_GID) { 3321 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 3322 NULL, &new_gid, sizeof (new_gid)); 3323 zp->z_gid = new_gid; 3324 if (attrzp) { 3325 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3326 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 3327 sizeof (new_gid)); 3328 attrzp->z_gid = new_gid; 3329 } 3330 } 3331 if (!(mask & AT_MODE)) { 3332 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 3333 NULL, &new_mode, sizeof (new_mode)); 3334 new_mode = zp->z_mode; 3335 } 3336 err = zfs_acl_chown_setattr(zp); 3337 ASSERT(err == 0); 3338 if (attrzp) { 3339 err = zfs_acl_chown_setattr(attrzp); 3340 ASSERT(err == 0); 3341 } 3342 } 3343 3344 if (mask & AT_MODE) { 3345 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 3346 &new_mode, sizeof (new_mode)); 3347 zp->z_mode = new_mode; 3348 ASSERT3U((uintptr_t)aclp, !=, 0); 3349 err = zfs_aclset_common(zp, aclp, cr, tx); 3350 ASSERT0(err); 3351 if (zp->z_acl_cached) 3352 zfs_acl_free(zp->z_acl_cached); 3353 zp->z_acl_cached = aclp; 3354 aclp = NULL; 3355 } 3356 3357 3358 if (mask & AT_ATIME) { 3359 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); 3360 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 3361 
&zp->z_atime, sizeof (zp->z_atime)); 3362 } 3363 3364 if (mask & AT_MTIME) { 3365 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 3366 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 3367 mtime, sizeof (mtime)); 3368 } 3369 3370 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ 3371 if (mask & AT_SIZE && !(mask & AT_MTIME)) { 3372 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 3373 NULL, mtime, sizeof (mtime)); 3374 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3375 &ctime, sizeof (ctime)); 3376 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 3377 B_TRUE); 3378 } else if (mask != 0) { 3379 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3380 &ctime, sizeof (ctime)); 3381 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, 3382 B_TRUE); 3383 if (attrzp) { 3384 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3385 SA_ZPL_CTIME(zfsvfs), NULL, 3386 &ctime, sizeof (ctime)); 3387 zfs_tstamp_update_setup(attrzp, STATE_CHANGED, 3388 mtime, ctime, B_TRUE); 3389 } 3390 } 3391 /* 3392 * Do this after setting timestamps to prevent timestamp 3393 * update from toggling bit 3394 */ 3395 3396 if (xoap && (mask & AT_XVATTR)) { 3397 3398 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) 3399 xoap->xoa_createtime = vap->va_birthtime; 3400 /* 3401 * restore trimmed off masks 3402 * so that return masks can be set for caller. 
3403 */ 3404 3405 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { 3406 XVA_SET_REQ(xvap, XAT_APPENDONLY); 3407 } 3408 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { 3409 XVA_SET_REQ(xvap, XAT_NOUNLINK); 3410 } 3411 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { 3412 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 3413 } 3414 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { 3415 XVA_SET_REQ(xvap, XAT_NODUMP); 3416 } 3417 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { 3418 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 3419 } 3420 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { 3421 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 3422 } 3423 3424 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3425 ASSERT(vp->v_type == VREG); 3426 3427 zfs_xvattr_set(zp, xvap, tx); 3428 } 3429 3430 if (fuid_dirtied) 3431 zfs_fuid_sync(zfsvfs, tx); 3432 3433 if (mask != 0) 3434 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 3435 3436 if (mask & (AT_UID|AT_GID|AT_MODE)) 3437 mutex_exit(&zp->z_acl_lock); 3438 3439 if (attrzp) { 3440 if (mask & (AT_UID|AT_GID|AT_MODE)) 3441 mutex_exit(&attrzp->z_acl_lock); 3442 } 3443out: 3444 if (err == 0 && attrzp) { 3445 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 3446 xattr_count, tx); 3447 ASSERT(err2 == 0); 3448 } 3449 3450 if (attrzp) 3451 vput(ZTOV(attrzp)); 3452 3453 if (aclp) 3454 zfs_acl_free(aclp); 3455 3456 if (fuidp) { 3457 zfs_fuid_info_free(fuidp); 3458 fuidp = NULL; 3459 } 3460 3461 if (err) { 3462 dmu_tx_abort(tx); 3463 } else { 3464 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 3465 dmu_tx_commit(tx); 3466 } 3467 3468out2: 3469 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3470 zil_commit(zilog, 0); 3471 3472 ZFS_EXIT(zfsvfs); 3473 return (err); 3474} 3475 3476/* 3477 * We acquire all but fdvp locks using non-blocking acquisitions. If we 3478 * fail to acquire any lock in the path we will drop all held locks, 3479 * acquire the new lock in a blocking fashion, and then release it and 3480 * restart the rename. 
This acquire/release step ensures that we do not
 * spin on a lock waiting for release.  On error release all vnode locks
 * and decrement references the way tmpfs_rename() would do.
 *
 * Contract (from the callers below): on success all four vnodes
 * (sdvp, *svpp, tdvp, *tvpp if non-NULL) are returned locked; on
 * error no vnode locks are held.
 */
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
	zfsvfs_t	*zfsvfs;
	struct vnode	*nvp, *svp, *tvp;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	const char	*snm = scnp->cn_nameptr;
	const char	*tnm = tcnp->cn_nameptr;
	int error;

	/*
	 * Drop the locks the caller still holds so that everything can
	 * be (re)acquired below in a consistent, deadlock-free order,
	 * starting from sdvp.
	 */
	VOP_UNLOCK(tdvp, 0);
	if (*tvpp != NULL && *tvpp != tdvp)
		VOP_UNLOCK(*tvpp, 0);

relock:
	/* sdvp is the only vnode taken with a blocking acquisition. */
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
		goto out;
	sdzp = VTOZ(sdvp);

	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		if (error != EBUSY)
			goto out;
		/*
		 * tdvp is busy: wait for it with a blocking acquisition,
		 * then immediately release it and restart the whole
		 * sequence (see the comment above the function).
		 */
		error = vn_lock(tdvp, LK_EXCLUSIVE);
		if (error)
			goto out;
		VOP_UNLOCK(tdvp, 0);
		goto relock;
	}
	tdzp = VTOZ(tdvp);

	/*
	 * Before using sdzp and tdzp we must ensure that they are live.
	 * As a porting legacy from illumos we have two things to worry
	 * about.  One is typical for FreeBSD and it is that the vnode is
	 * not reclaimed (doomed).  The other is that the znode is live.
	 * The current code can invalidate the znode without acquiring the
	 * corresponding vnode lock if the object represented by the znode
	 * and vnode is no longer valid after a rollback or receive operation.
	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
	 * that protects the znodes from the invalidation.
	 */
	zfsvfs = sdzp->z_zfsvfs;
	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
	ZFS_ENTER(zfsvfs);

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		error = SET_ERROR(EIO);
		goto out;
	}

	/*
	 * Re-resolve svp to be certain it still exists and fetch the
	 * correct vnode.
	 */
	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
	if (error != 0) {
		/* Source entry invalid or not there. */
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
			error = SET_ERROR(EINVAL);
		goto out;
	}
	svp = ZTOV(szp);

	/*
	 * Re-resolve tvp, if it disappeared we just carry on.
	 */
	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		vrele(svp);
		if ((tcnp->cn_flags & ISDOTDOT) != 0)
			error = SET_ERROR(EINVAL);
		goto out;
	}
	if (tzp != NULL)
		tvp = ZTOV(tzp);
	else
		tvp = NULL;

	/*
	 * At present the vnode locks must be acquired before z_teardown_lock,
	 * although it would be more logical to use the opposite order.
	 */
	ZFS_EXIT(zfsvfs);

	/*
	 * Now try acquire locks on svp and tvp.
	 */
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if (tvp != NULL)
			vrele(tvp);
		if (error != EBUSY) {
			vrele(nvp);
			goto out;
		}
		error = vn_lock(nvp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(nvp);
			goto out;
		}
		VOP_UNLOCK(nvp, 0);
		/*
		 * Concurrent rename race.
		 * XXX ?
		 */
		if (nvp == tdvp) {
			vrele(nvp);
			error = SET_ERROR(EINVAL);
			goto out;
		}
		vrele(*svpp);
		*svpp = nvp;
		goto relock;
	}
	/* Hand the freshly looked-up source vnode back to the caller. */
	vrele(*svpp);
	*svpp = nvp;

	if (*tvpp != NULL)
		vrele(*tvpp);
	*tvpp = NULL;
	if (tvp != NULL) {
		nvp = tvp;
		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
		if (error != 0) {
			VOP_UNLOCK(sdvp, 0);
			VOP_UNLOCK(tdvp, 0);
			VOP_UNLOCK(*svpp, 0);
			if (error != EBUSY) {
				vrele(nvp);
				goto out;
			}
			error = vn_lock(nvp, LK_EXCLUSIVE);
			if (error != 0) {
				vrele(nvp);
				goto out;
			}
			vput(nvp);
			goto relock;
		}
		*tvpp = nvp;
	}

	return (0);

out:
	return (error);
}

/*
 * Verify that renaming szp into tdzp is legal by walking from tdzp up
 * the directory tree: fail with EINVAL if tdzp is szp itself or one of
 * its descendants (i.e. a move like /usr/a/b -> /usr/a/b/c/d).
 *
 * Note that we must use VRELE_ASYNC in this function as it walks
 * up the directory tree and vrele may need to acquire an exclusive
 * lock if a last reference to a vnode is dropped.
 */
static int
zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*zp, *zp1;
	uint64_t	parent;
	int		error;

	zfsvfs = tdzp->z_zfsvfs;
	if (tdzp == szp)
		return (SET_ERROR(EINVAL));
	if (tdzp == sdzp)
		return (0);
	if (tdzp->z_id == zfsvfs->z_root)
		return (0);
	zp = tdzp;
	for (;;) {
		ASSERT(!zp->z_unlinked);
		if ((error = sa_lookup(zp->z_sa_hdl,
		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
			break;

		if (parent == szp->z_id) {
			/* tdzp lives beneath szp: the rename would cycle. */
			error = SET_ERROR(EINVAL);
			break;
		}
		if (parent == zfsvfs->z_root)
			break;
		if (parent == sdzp->z_id)
			break;

		error = zfs_zget(zfsvfs, parent, &zp1);
		if (error != 0)
			break;

		if (zp != tdzp)
			VN_RELE_ASYNC(ZTOV(zp),
			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
		zp = zp1;
	}

	if (error == ENOTDIR)
		panic("checkpath: .. not a directory\n");
	if (zp != tdzp)
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
	return (error);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 * IN:	sdvp	- Source directory containing the "old entry".
 *	snm	- Old entry name.
 *	tdvp	- Target directory to contain the "new entry".
 *	tnm	- New entry name.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	zilog_t		*zilog = NULL;
	dmu_tx_t	*tx;
	char		*snm = scnp->cn_nameptr;
	char		*tnm = tcnp->cn_nameptr;
	int		error = 0;

	/* Reject renames across filesystems. */
	if ((*svpp)->v_mount != tdvp->v_mount ||
	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	if (zfsctl_is_node(tdvp)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Lock all four vnodes to ensure safety and semantics of renaming.
	 */
	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
	if (error != 0) {
		/* no vnodes are locked in the case of error here */
		return (error);
	}

	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);
	zfsvfs = tdzp->z_zfsvfs;
	zilog = zfsvfs->z_log;

	/*
	 * After we re-enter ZFS_ENTER() we will have to revalidate all
	 * znodes involved.
	 */
	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		error = SET_ERROR(EILSEQ);
		goto unlockout;
	}

	/* If source and target are the same file, there is nothing to do. */
	if ((*svpp) == (*tvpp)) {
		error = 0;
		goto unlockout;
	}

	/* Refuse to move a mount point (either endpoint). */
	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
	    (*tvpp)->v_mountedhere != NULL)) {
		error = SET_ERROR(EXDEV);
		goto unlockout;
	}

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	szp = VTOZ(*svpp);
	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		error = SET_ERROR(EINVAL);
		goto unlockout;
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */
	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto unlockout;

	if ((*svpp)->v_type == VDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */
		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
		    sdzp == szp ||
		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
			error = EINVAL;
			goto unlockout;
		}

		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_check(szp, sdzp, tdzp))
			goto unlockout;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if ((*svpp)->v_type == VDIR) {
			if ((*tvpp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto unlockout;
			} else {
				cache_purge(tdvp);
				if (sdvp != tdvp)
					cache_purge(sdvp);
			}
		} else {
			if ((*tvpp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto unlockout;
			}
		}
	}

	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
	if (tzp)
		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	/*
	 * Hold every object the rename may dirty (both directory ZAPs,
	 * all involved znode SAs and the unlinked set) in one transaction.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto unlockout;
	}


	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);

	if (error == 0) {
		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
			    NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
				    snm, tdzp, tnm, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
		if (error == 0) {
			cache_purge(*svpp);
			if (*tvpp != NULL)
				cache_purge(*tvpp);
			cache_purge_negative(tdvp);
		}
	}

	dmu_tx_commit(tx);

unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(*svpp, 0);
	VOP_UNLOCK(sdvp, 0);

out:				/* original two vnodes are locked */
	/*
	 * NOTE(review): zfsvfs/zilog are only assigned after a successful
	 * relock; the early "goto out" paths reach here with error != 0,
	 * so the short-circuit below never dereferences them — confirm.
	 */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	if (*tvpp != NULL)
		VOP_UNLOCK(*tvpp, 0);
	if (tdvp != *tvpp)
		VOP_UNLOCK(tdvp, 0);
	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 * IN:	dvp	- Directory to contain new symbolic link.
 *	link	- Name for new symlink entry.
 *	vap	- Attributes of new entry.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
    cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	len = strlen(link);
	int		error;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	uint64_t	txtype = TX_SYMLINK;
	int		flags = 0;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENAMETOOLONG));
	}

	if ((error = zfs_acl_ids_create(dzp, 0,
	    vap, cr, NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/* Reserve a vnode before entering the transaction. */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * for version 4 ZPL datsets the symlink will be an SA attribute
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Store the link target either as an SA attribute or the old way. */
	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);

	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);

	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
	*vpp = ZTOV(zp);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 * IN:	vp	- vnode of symbolic link.
 *	uio	- structure to contain the link path.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	uio	- structure containing the link path.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Read the target from the SA attribute or the legacy location. */
	if (zp->z_is_sa)
		error = sa_lookup_uio(zp->z_sa_hdl,
		    SA_ZPL_SYMLINK(zfsvfs), uio);
	else
		error = zfs_sa_readlink(zp, uio);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 * IN:	tdvp	- Directory to contain new entry.
 *	svp	- vnode of new entry.
 *	name	- name of new entry.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	tdvp - ctime|mtime updated
 *	 svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;
	uint64_t	parent;
	uid_t		owner;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	szp = VTOZ(svp);
	ZFS_VERIFY_ZP(szp);

	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}


	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, dzp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dzp, name, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
	}

	dmu_tx_commit(tx);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}


/*
 * VOP_INACTIVE counterpart: flush a dirty atime to the SA if the file
 * is still live, or recycle the vnode when the znode is gone (forced
 * unmount, rollback/receive) or the file has been unlinked.
 */
/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	if (zp->z_unlinked) {
		/*
		 * Fast path to recycle a vnode of a removed file.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			dmu_tx_commit(tx);
		}
	}
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}


/* Both ZFS fid flavors must fit in the generic struct fid. */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));

/*
 * Fill in a file handle (fid) for vp: object id plus generation,
 * extended with the objset id (long fid) when this is not the
 * parent (root) filesystem.  Multi-byte fields are stored
 * little-endian, byte by byte.
 */
/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	uint32_t	gen;
	uint64_t	gen64;
	uint64_t	object = zp->z_id;
	zfid_short_t	*zfid;
	int		size, i, error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
	    &gen64, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	gen = (uint32_t)gen64;

	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;

#ifdef illumos
	if (fidp->fid_len < size) {
		fidp->fid_len = size;
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSPC));
	}
#else
	fidp->fid_len = size;
#endif

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t	*zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * VOP_PATHCONF: report filesystem limits and feature availability.
 * Several cases are illumos-only and compiled out on FreeBSD.
 */
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp, *xzp;
	zfsvfs_t *zfsvfs;
	int error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = INT_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		*valp = 64;
		return (0);
#ifdef illumos
	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		error = zfs_dirent_lookup(zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED);
		if (error == 0) {
			if (!zfs_dirempty(xzp))
				*valp = 1;
			vrele(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);
		return (0);

	case _PC_ACCESS_FILTERING:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
		    vp->v_type == VDIR;
		return (0);

	case _PC_ACL_ENABLED:
		*valp = _ACL_ACE_ENABLED;
		return (0);
#endif	/* illumos */
	case _PC_MIN_HOLE_SIZE:
		*valp = (int)SPA_MINBLOCKSIZE;
		return (0);
#ifdef illumos
	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */
		*valp = 1L;
		return (0);
#endif
	case _PC_ACL_EXTENDED:
		*valp = 0;
		return (0);

	case _PC_ACL_NFS4:
		*valp = 1;
		return (0);

	case _PC_ACL_PATH_MAX:
		*valp = ACL_MAX_ENTRIES;
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}

/*
 * Fetch the ACL of vp into vsecp, optionally skipping the access
 * check when the caller passed ATTR_NOACLCHECK.
 */
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Replace the ACL of vp with vsecp; commit the ZIL when the dataset
 * is sync=always.
 */
/*ARGSUSED*/
int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	zilog_t	*zilog = zfsvfs->z_log;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	error = zfs_setacl(zp, vsecp, skipaclchk, cr);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Pager "get pages" for ZFS: fill the given VM pages from the DMU.
 * Returns a zfs_vm_pagerret_* code; rbehind/rahead are always set
 * to 0 on success (no read-behind/read-ahead performed here).
 */
static int
zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int *rbehind,
    int *rahead)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_page_t mlast;
	vm_object_t object;
	caddr_t va;
	struct sf_buf *sf;
	off_t startoff, endoff;
	int i, error;
	vm_pindex_t reqstart, reqend;
	int lsize, size;

	object = m[0]->object;
	error = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zfs_vmobject_wlock(object);
	/* If the last page is already valid, trim it; nothing left to do. */
	if (m[count - 1]->valid != 0 && --count == 0) {
		zfs_vmobject_wunlock(object);
		goto out;
	}

	mlast = m[count - 1];

	if (IDX_TO_OFF(mlast->pindex) >=
	    object->un_pager.vnp.vnp_size) {
		zfs_vmobject_wunlock(object);
		ZFS_EXIT(zfsvfs);
		return (zfs_vm_pagerret_bad);
	}

	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, count);

	/* The last page may extend past EOF; read only up to vnp_size. */
	lsize = PAGE_SIZE;
	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
		lsize = object->un_pager.vnp.vnp_size -
		    IDX_TO_OFF(mlast->pindex);
	zfs_vmobject_wunlock(object);

	for (i = 0; i < count; i++) {
		size = PAGE_SIZE;
		if (i == count - 1)
			size = lsize;
		va = zfs_map_page(m[i], &sf);
		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
		    size, va, DMU_READ_PREFETCH);
		/* Zero the tail of a partially filled last page. */
		if (size != PAGE_SIZE)
			bzero(va + size, PAGE_SIZE - size);
		zfs_unmap_page(sf);
		if (error != 0)
			goto out;
	}

	zfs_vmobject_wlock(object);
	for (i = 0; i < count; i++)
		m[i]->valid = VM_PAGE_BITS_ALL;
	zfs_vmobject_wunlock(object);

out:
	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	if (error == 0) {
		if (rbehind)
			*rbehind = 0;
		if (rahead)
			*rahead = 0;
		return (zfs_vm_pagerret_ok);
	} else
		return (zfs_vm_pagerret_error);
}

/* VOP_GETPAGES thunk: unpack the args struct for zfs_getpages(). */
static int
zfs_freebsd_getpages(ap)
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int *a_rbehind;
		int *a_rahead;
	} */ *ap;
{

	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
	    ap->a_rahead));
}

/*
 * Pager "put pages" for ZFS: write the given dirty VM pages to the
 * DMU under a range lock, update mtime/ctime, and log the write.
 * Per-page results go into rtvals[]; the first entry is returned.
 */
static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
    int *rtvals)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	rl_t		*rl;
	dmu_tx_t	*tx;
	struct sf_buf	*sf;
	vm_object_t	object;
	vm_page_t	m;
	caddr_t		va;
	size_t		tocopy;
	size_t		lo_len;
	vm_ooffset_t	lo_off;
	vm_ooffset_t	off;
	uint_t		blksz;
	int		ncount;
	int		pcount;
	int		err;
	int		i;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	object = vp->v_object;
	pcount = btoc(len);
	ncount = pcount;

	KASSERT(ma[0]->object == object, ("mismatching object"));
	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));

	for (i = 0; i < pcount; i++)
		rtvals[i] = zfs_vm_pagerret_error;

	/* Range-lock the whole region, rounded out to block boundaries. */
	off = IDX_TO_OFF(ma[0]->pindex);
	blksz = zp->z_blksz;
	lo_off = rounddown(off, blksz);
	lo_len = roundup(len + (off - lo_off), blksz);
	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);

	zfs_vmobject_wlock(object);
	if (len + off > object->un_pager.vnp.vnp_size) {
		if (object->un_pager.vnp.vnp_size > off) {
			int pgoff;

			/* Clip the request to EOF. */
			len = object->un_pager.vnp.vnp_size - off;
			ncount = btoc(len);
			if ((pgoff = (int)len & PAGE_MASK) != 0) {
				/*
				 * If the object is locked and the following
				 * conditions hold, then the page's dirty
				 * field cannot be concurrently changed by a
				 * pmap operation.
				 */
				m = ma[ncount - 1];
				vm_page_assert_sbusied(m);
				KASSERT(!pmap_page_is_write_mapped(m),
				    ("zfs_putpages: page %p is not read-only", m));
				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
				    pgoff);
			}
		} else {
			len = 0;
			ncount = 0;
		}
		if (ncount < pcount) {
			for (i = ncount; i < pcount; i++) {
				rtvals[i] = zfs_vm_pagerret_bad;
			}
		}
	}
	zfs_vmobject_wunlock(object);

	if (ncount == 0)
		goto out;

	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
		goto out;
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, off, len);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		goto out;
	}

	if (zp->z_blksz < PAGE_SIZE) {
		/* Sub-page block size: copy page by page via sf_buf maps. */
		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
			va = zfs_map_page(ma[i], &sf);
			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
			zfs_unmap_page(sf);
		}
	} else {
		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
	}

	if (err == 0) {
		uint64_t mtime[2], ctime[2];
		sa_bulk_attr_t bulk[3];
		int count = 0;

		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    &mtime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
		    &zp->z_pflags, 8);
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		ASSERT0(err);
		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);

		zfs_vmobject_wlock(object);
		for (i = 0; i < ncount; i++) {
			rtvals[i] = zfs_vm_pagerret_ok;
			vm_page_undirty(ma[i]);
		}
		zfs_vmobject_wunlock(object);
		PCPU_INC(cnt.v_vnodeout);
		PCPU_ADD(cnt.v_vnodepgsout, ncount);
	}
	dmu_tx_commit(tx);

out:
	zfs_range_unlock(rl);
	/* Honor synchronous/invalidating pageouts and sync=always. */
	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (rtvals[0]);
}

/* VOP_PUTPAGES thunk: unpack the args struct for zfs_putpages(). */
int
zfs_freebsd_putpages(ap)
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_sync;
		int *a_rtvals;
	} */ *ap;
{

	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
	    ap->a_rtvals));
}

/*
 * VOP_BMAP: identity mapping with no read-ahead/read-behind runs;
 * ZFS does its own block layout, so the logical block is returned
 * unchanged.
 */
static int
zfs_freebsd_bmap(ap)
	struct vop_bmap_args /* {
		struct vnode *a_vp;
		daddr_t  a_bn;
		struct bufobj **a_bop;
		daddr_t *a_bnp;
		int *a_runp;
		int *a_runb;
	} */ *ap;
{

	if (ap->a_bop != NULL)
		*ap->a_bop = &ap->a_vp->v_bufobj;
	if (ap->a_bnp != NULL)
		*ap->a_bnp = ap->a_bn;
	if (ap->a_runp != NULL)
		*ap->a_runp = 0;
	if (ap->a_runb != NULL)
		*ap->a_runb = 0;

	return (0);
}

/* VOP_OPEN: delegate to zfs_open() and set up the VM object on success. */
static int
zfs_freebsd_open(ap)
	struct vop_open_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	int error;

	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
	if (error == 0)
		vnode_create_vobject(vp, zp->z_size, ap->a_td);
	return (error);
}

/* VOP_CLOSE thunk for zfs_close(). */
static int
zfs_freebsd_close(ap)
	struct vop_close_args /* {
		struct vnode *a_vp;
		int  a_fflag;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
}

/* VOP_IOCTL thunk for zfs_ioctl(). */
static int
zfs_freebsd_ioctl(ap)
	struct vop_ioctl_args /* {
		struct vnode *a_vp;
		u_long a_command;
		caddr_t a_data;
		int a_fflag;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{

	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
	    ap->a_fflag, ap->a_cred, NULL, NULL));
}

/* Translate VOP ioflag bits (IO_*) to the F* flags zfs_read/zfs_write use. */
static int
ioflags(int ioflags)
{
	int flags = 0;

	if (ioflags & IO_APPEND)
		flags |= FAPPEND;
	if (ioflags & IO_NDELAY)
		flags |= FNONBLOCK;
	if (ioflags & IO_SYNC)
		flags |= (FSYNC | FDSYNC | FRSYNC);

	return (flags);
}

/* VOP_READ thunk for zfs_read(), translating ioflags. */
static int
zfs_freebsd_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
	    ap->a_cred, NULL));
}

static int
zfs_freebsd_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
4877 return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), 4878 ap->a_cred, NULL)); 4879} 4880 4881static int 4882zfs_freebsd_access(ap) 4883 struct vop_access_args /* { 4884 struct vnode *a_vp; 4885 accmode_t a_accmode; 4886 struct ucred *a_cred; 4887 struct thread *a_td; 4888 } */ *ap; 4889{ 4890 vnode_t *vp = ap->a_vp; 4891 znode_t *zp = VTOZ(vp); 4892 accmode_t accmode; 4893 int error = 0; 4894 4895 /* 4896 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, 4897 */ 4898 accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); 4899 if (accmode != 0) 4900 error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); 4901 4902 /* 4903 * VADMIN has to be handled by vaccess(). 4904 */ 4905 if (error == 0) { 4906 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); 4907 if (accmode != 0) { 4908 error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, 4909 zp->z_gid, accmode, ap->a_cred, NULL); 4910 } 4911 } 4912 4913 /* 4914 * For VEXEC, ensure that at least one execute bit is set for 4915 * non-directories. 
4916 */ 4917 if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && 4918 (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { 4919 error = EACCES; 4920 } 4921 4922 return (error); 4923} 4924 4925static int 4926zfs_freebsd_lookup(ap) 4927 struct vop_lookup_args /* { 4928 struct vnode *a_dvp; 4929 struct vnode **a_vpp; 4930 struct componentname *a_cnp; 4931 } */ *ap; 4932{ 4933 struct componentname *cnp = ap->a_cnp; 4934 char nm[NAME_MAX + 1]; 4935 4936 ASSERT(cnp->cn_namelen < sizeof(nm)); 4937 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); 4938 4939 return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, 4940 cnp->cn_cred, cnp->cn_thread, 0)); 4941} 4942 4943static int 4944zfs_cache_lookup(ap) 4945 struct vop_lookup_args /* { 4946 struct vnode *a_dvp; 4947 struct vnode **a_vpp; 4948 struct componentname *a_cnp; 4949 } */ *ap; 4950{ 4951 zfsvfs_t *zfsvfs; 4952 4953 zfsvfs = ap->a_dvp->v_mount->mnt_data; 4954 if (zfsvfs->z_use_namecache) 4955 return (vfs_cache_lookup(ap)); 4956 else 4957 return (zfs_freebsd_lookup(ap)); 4958} 4959 4960static int 4961zfs_freebsd_create(ap) 4962 struct vop_create_args /* { 4963 struct vnode *a_dvp; 4964 struct vnode **a_vpp; 4965 struct componentname *a_cnp; 4966 struct vattr *a_vap; 4967 } */ *ap; 4968{ 4969 zfsvfs_t *zfsvfs; 4970 struct componentname *cnp = ap->a_cnp; 4971 vattr_t *vap = ap->a_vap; 4972 int error, mode; 4973 4974 ASSERT(cnp->cn_flags & SAVENAME); 4975 4976 vattr_init_mask(vap); 4977 mode = vap->va_mode & ALLPERMS; 4978 zfsvfs = ap->a_dvp->v_mount->mnt_data; 4979 4980 error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, 4981 ap->a_vpp, cnp->cn_cred, cnp->cn_thread); 4982 if (zfsvfs->z_use_namecache && 4983 error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) 4984 cache_enter(ap->a_dvp, *ap->a_vpp, cnp); 4985 return (error); 4986} 4987 4988static int 4989zfs_freebsd_remove(ap) 4990 struct vop_remove_args /* { 4991 struct vnode *a_dvp; 4992 struct vnode *a_vp; 4993 
struct componentname *a_cnp; 4994 } */ *ap; 4995{ 4996 4997 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 4998 4999 return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, 5000 ap->a_cnp->cn_cred)); 5001} 5002 5003static int 5004zfs_freebsd_mkdir(ap) 5005 struct vop_mkdir_args /* { 5006 struct vnode *a_dvp; 5007 struct vnode **a_vpp; 5008 struct componentname *a_cnp; 5009 struct vattr *a_vap; 5010 } */ *ap; 5011{ 5012 vattr_t *vap = ap->a_vap; 5013 5014 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 5015 5016 vattr_init_mask(vap); 5017 5018 return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, 5019 ap->a_cnp->cn_cred)); 5020} 5021 5022static int 5023zfs_freebsd_rmdir(ap) 5024 struct vop_rmdir_args /* { 5025 struct vnode *a_dvp; 5026 struct vnode *a_vp; 5027 struct componentname *a_cnp; 5028 } */ *ap; 5029{ 5030 struct componentname *cnp = ap->a_cnp; 5031 5032 ASSERT(cnp->cn_flags & SAVENAME); 5033 5034 return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); 5035} 5036 5037static int 5038zfs_freebsd_readdir(ap) 5039 struct vop_readdir_args /* { 5040 struct vnode *a_vp; 5041 struct uio *a_uio; 5042 struct ucred *a_cred; 5043 int *a_eofflag; 5044 int *a_ncookies; 5045 u_long **a_cookies; 5046 } */ *ap; 5047{ 5048 5049 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, 5050 ap->a_ncookies, ap->a_cookies)); 5051} 5052 5053static int 5054zfs_freebsd_fsync(ap) 5055 struct vop_fsync_args /* { 5056 struct vnode *a_vp; 5057 int a_waitfor; 5058 struct thread *a_td; 5059 } */ *ap; 5060{ 5061 5062 vop_stdfsync(ap); 5063 return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); 5064} 5065 5066static int 5067zfs_freebsd_getattr(ap) 5068 struct vop_getattr_args /* { 5069 struct vnode *a_vp; 5070 struct vattr *a_vap; 5071 struct ucred *a_cred; 5072 } */ *ap; 5073{ 5074 vattr_t *vap = ap->a_vap; 5075 xvattr_t xvap; 5076 u_long fflags = 0; 5077 int error; 5078 5079 xva_init(&xvap); 5080 xvap.xva_vattr = *vap; 5081 xvap.xva_vattr.va_mask |= 
AT_XVATTR; 5082 5083 /* Convert chflags into ZFS-type flags. */ 5084 /* XXX: what about SF_SETTABLE?. */ 5085 XVA_SET_REQ(&xvap, XAT_IMMUTABLE); 5086 XVA_SET_REQ(&xvap, XAT_APPENDONLY); 5087 XVA_SET_REQ(&xvap, XAT_NOUNLINK); 5088 XVA_SET_REQ(&xvap, XAT_NODUMP); 5089 XVA_SET_REQ(&xvap, XAT_READONLY); 5090 XVA_SET_REQ(&xvap, XAT_ARCHIVE); 5091 XVA_SET_REQ(&xvap, XAT_SYSTEM); 5092 XVA_SET_REQ(&xvap, XAT_HIDDEN); 5093 XVA_SET_REQ(&xvap, XAT_REPARSE); 5094 XVA_SET_REQ(&xvap, XAT_OFFLINE); 5095 XVA_SET_REQ(&xvap, XAT_SPARSE); 5096 5097 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); 5098 if (error != 0) 5099 return (error); 5100 5101 /* Convert ZFS xattr into chflags. */ 5102#define FLAG_CHECK(fflag, xflag, xfield) do { \ 5103 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ 5104 fflags |= (fflag); \ 5105} while (0) 5106 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, 5107 xvap.xva_xoptattrs.xoa_immutable); 5108 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, 5109 xvap.xva_xoptattrs.xoa_appendonly); 5110 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, 5111 xvap.xva_xoptattrs.xoa_nounlink); 5112 FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, 5113 xvap.xva_xoptattrs.xoa_archive); 5114 FLAG_CHECK(UF_NODUMP, XAT_NODUMP, 5115 xvap.xva_xoptattrs.xoa_nodump); 5116 FLAG_CHECK(UF_READONLY, XAT_READONLY, 5117 xvap.xva_xoptattrs.xoa_readonly); 5118 FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, 5119 xvap.xva_xoptattrs.xoa_system); 5120 FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, 5121 xvap.xva_xoptattrs.xoa_hidden); 5122 FLAG_CHECK(UF_REPARSE, XAT_REPARSE, 5123 xvap.xva_xoptattrs.xoa_reparse); 5124 FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, 5125 xvap.xva_xoptattrs.xoa_offline); 5126 FLAG_CHECK(UF_SPARSE, XAT_SPARSE, 5127 xvap.xva_xoptattrs.xoa_sparse); 5128 5129#undef FLAG_CHECK 5130 *vap = xvap.xva_vattr; 5131 vap->va_flags = fflags; 5132 return (0); 5133} 5134 5135static int 5136zfs_freebsd_setattr(ap) 5137 struct vop_setattr_args /* { 5138 struct vnode *a_vp; 5139 struct vattr *a_vap; 5140 struct ucred *a_cred; 5141 } */ 
*ap; 5142{ 5143 vnode_t *vp = ap->a_vp; 5144 vattr_t *vap = ap->a_vap; 5145 cred_t *cred = ap->a_cred; 5146 xvattr_t xvap; 5147 u_long fflags; 5148 uint64_t zflags; 5149 5150 vattr_init_mask(vap); 5151 vap->va_mask &= ~AT_NOSET; 5152 5153 xva_init(&xvap); 5154 xvap.xva_vattr = *vap; 5155 5156 zflags = VTOZ(vp)->z_pflags; 5157 5158 if (vap->va_flags != VNOVAL) { 5159 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; 5160 int error; 5161 5162 if (zfsvfs->z_use_fuids == B_FALSE) 5163 return (EOPNOTSUPP); 5164 5165 fflags = vap->va_flags; 5166 /* 5167 * XXX KDM 5168 * We need to figure out whether it makes sense to allow 5169 * UF_REPARSE through, since we don't really have other 5170 * facilities to handle reparse points and zfs_setattr() 5171 * doesn't currently allow setting that attribute anyway. 5172 */ 5173 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| 5174 UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| 5175 UF_OFFLINE|UF_SPARSE)) != 0) 5176 return (EOPNOTSUPP); 5177 /* 5178 * Unprivileged processes are not permitted to unset system 5179 * flags, or modify flags if any system flags are set. 5180 * Privileged non-jail processes may not modify system flags 5181 * if securelevel > 0 and any existing system flags are set. 5182 * Privileged jail processes behave like privileged non-jail 5183 * processes if the security.jail.chflags_allowed sysctl is 5184 * is non-zero; otherwise, they behave like unprivileged 5185 * processes. 5186 */ 5187 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || 5188 priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { 5189 if (zflags & 5190 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 5191 error = securelevel_gt(cred, 0); 5192 if (error != 0) 5193 return (error); 5194 } 5195 } else { 5196 /* 5197 * Callers may only modify the file flags on objects they 5198 * have VADMIN rights for. 
5199 */ 5200 if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) 5201 return (error); 5202 if (zflags & 5203 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 5204 return (EPERM); 5205 } 5206 if (fflags & 5207 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { 5208 return (EPERM); 5209 } 5210 } 5211 5212#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ 5213 if (((fflags & (fflag)) && !(zflags & (zflag))) || \ 5214 ((zflags & (zflag)) && !(fflags & (fflag)))) { \ 5215 XVA_SET_REQ(&xvap, (xflag)); \ 5216 (xfield) = ((fflags & (fflag)) != 0); \ 5217 } \ 5218} while (0) 5219 /* Convert chflags into ZFS-type flags. */ 5220 /* XXX: what about SF_SETTABLE?. */ 5221 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, 5222 xvap.xva_xoptattrs.xoa_immutable); 5223 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, 5224 xvap.xva_xoptattrs.xoa_appendonly); 5225 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, 5226 xvap.xva_xoptattrs.xoa_nounlink); 5227 FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, 5228 xvap.xva_xoptattrs.xoa_archive); 5229 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, 5230 xvap.xva_xoptattrs.xoa_nodump); 5231 FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, 5232 xvap.xva_xoptattrs.xoa_readonly); 5233 FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, 5234 xvap.xva_xoptattrs.xoa_system); 5235 FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, 5236 xvap.xva_xoptattrs.xoa_hidden); 5237 FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, 5238 xvap.xva_xoptattrs.xoa_hidden); 5239 FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, 5240 xvap.xva_xoptattrs.xoa_offline); 5241 FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, 5242 xvap.xva_xoptattrs.xoa_sparse); 5243#undef FLAG_CHANGE 5244 } 5245 if (vap->va_birthtime.tv_sec != VNOVAL) { 5246 xvap.xva_vattr.va_mask |= AT_XVATTR; 5247 XVA_SET_REQ(&xvap, XAT_CREATETIME); 5248 } 5249 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); 5250} 5251 5252static int 5253zfs_freebsd_rename(ap) 5254 struct 
vop_rename_args /* { 5255 struct vnode *a_fdvp; 5256 struct vnode *a_fvp; 5257 struct componentname *a_fcnp; 5258 struct vnode *a_tdvp; 5259 struct vnode *a_tvp; 5260 struct componentname *a_tcnp; 5261 } */ *ap; 5262{ 5263 vnode_t *fdvp = ap->a_fdvp; 5264 vnode_t *fvp = ap->a_fvp; 5265 vnode_t *tdvp = ap->a_tdvp; 5266 vnode_t *tvp = ap->a_tvp; 5267 int error; 5268 5269 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); 5270 ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); 5271 5272 error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, 5273 ap->a_tcnp, ap->a_fcnp->cn_cred); 5274 5275 vrele(fdvp); 5276 vrele(fvp); 5277 vrele(tdvp); 5278 if (tvp != NULL) 5279 vrele(tvp); 5280 5281 return (error); 5282} 5283 5284static int 5285zfs_freebsd_symlink(ap) 5286 struct vop_symlink_args /* { 5287 struct vnode *a_dvp; 5288 struct vnode **a_vpp; 5289 struct componentname *a_cnp; 5290 struct vattr *a_vap; 5291 char *a_target; 5292 } */ *ap; 5293{ 5294 struct componentname *cnp = ap->a_cnp; 5295 vattr_t *vap = ap->a_vap; 5296 5297 ASSERT(cnp->cn_flags & SAVENAME); 5298 5299 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. 
*/ 5300 vattr_init_mask(vap); 5301 5302 return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, 5303 ap->a_target, cnp->cn_cred, cnp->cn_thread)); 5304} 5305 5306static int 5307zfs_freebsd_readlink(ap) 5308 struct vop_readlink_args /* { 5309 struct vnode *a_vp; 5310 struct uio *a_uio; 5311 struct ucred *a_cred; 5312 } */ *ap; 5313{ 5314 5315 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); 5316} 5317 5318static int 5319zfs_freebsd_link(ap) 5320 struct vop_link_args /* { 5321 struct vnode *a_tdvp; 5322 struct vnode *a_vp; 5323 struct componentname *a_cnp; 5324 } */ *ap; 5325{ 5326 struct componentname *cnp = ap->a_cnp; 5327 vnode_t *vp = ap->a_vp; 5328 vnode_t *tdvp = ap->a_tdvp; 5329 5330 if (tdvp->v_mount != vp->v_mount) 5331 return (EXDEV); 5332 5333 ASSERT(cnp->cn_flags & SAVENAME); 5334 5335 return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); 5336} 5337 5338static int 5339zfs_freebsd_inactive(ap) 5340 struct vop_inactive_args /* { 5341 struct vnode *a_vp; 5342 struct thread *a_td; 5343 } */ *ap; 5344{ 5345 vnode_t *vp = ap->a_vp; 5346 5347 zfs_inactive(vp, ap->a_td->td_ucred, NULL); 5348 return (0); 5349} 5350 5351static int 5352zfs_freebsd_reclaim(ap) 5353 struct vop_reclaim_args /* { 5354 struct vnode *a_vp; 5355 struct thread *a_td; 5356 } */ *ap; 5357{ 5358 vnode_t *vp = ap->a_vp; 5359 znode_t *zp = VTOZ(vp); 5360 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5361 5362 ASSERT(zp != NULL); 5363 5364 /* Destroy the vm object and flush associated pages. */ 5365 vnode_destroy_vobject(vp); 5366 5367 /* 5368 * z_teardown_inactive_lock protects from a race with 5369 * zfs_znode_dmu_fini in zfsvfs_teardown during 5370 * force unmount. 
5371 */ 5372 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 5373 if (zp->z_sa_hdl == NULL) 5374 zfs_znode_free(zp); 5375 else 5376 zfs_zinactive(zp); 5377 rw_exit(&zfsvfs->z_teardown_inactive_lock); 5378 5379 vp->v_data = NULL; 5380 return (0); 5381} 5382 5383static int 5384zfs_freebsd_fid(ap) 5385 struct vop_fid_args /* { 5386 struct vnode *a_vp; 5387 struct fid *a_fid; 5388 } */ *ap; 5389{ 5390 5391 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); 5392} 5393 5394static int 5395zfs_freebsd_pathconf(ap) 5396 struct vop_pathconf_args /* { 5397 struct vnode *a_vp; 5398 int a_name; 5399 register_t *a_retval; 5400 } */ *ap; 5401{ 5402 ulong_t val; 5403 int error; 5404 5405 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); 5406 if (error == 0) { 5407 *ap->a_retval = val; 5408 return (error); 5409 } 5410 if (error != EOPNOTSUPP) 5411 return (error); 5412 5413 switch (ap->a_name) { 5414 case _PC_NAME_MAX: 5415 *ap->a_retval = NAME_MAX; 5416 return (0); 5417 case _PC_PIPE_BUF: 5418 if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { 5419 *ap->a_retval = PIPE_BUF; 5420 return (0); 5421 } 5422 return (EINVAL); 5423 default: 5424 return (vop_stdpathconf(ap)); 5425 } 5426} 5427 5428/* 5429 * FreeBSD's extended attributes namespace defines file name prefix for ZFS' 5430 * extended attribute name: 5431 * 5432 * NAMESPACE PREFIX 5433 * system freebsd:system: 5434 * user (none, can be used to access ZFS fsattr(5) attributes 5435 * created on Solaris) 5436 */ 5437static int 5438zfs_create_attrname(int attrnamespace, const char *name, char *attrname, 5439 size_t size) 5440{ 5441 const char *namespace, *prefix, *suffix; 5442 5443 /* We don't allow '/' character in attribute name. */ 5444 if (strchr(name, '/') != NULL) 5445 return (EINVAL); 5446 /* We don't allow attribute names that start with "freebsd:" string. 
*/ 5447 if (strncmp(name, "freebsd:", 8) == 0) 5448 return (EINVAL); 5449 5450 bzero(attrname, size); 5451 5452 switch (attrnamespace) { 5453 case EXTATTR_NAMESPACE_USER: 5454#if 0 5455 prefix = "freebsd:"; 5456 namespace = EXTATTR_NAMESPACE_USER_STRING; 5457 suffix = ":"; 5458#else 5459 /* 5460 * This is the default namespace by which we can access all 5461 * attributes created on Solaris. 5462 */ 5463 prefix = namespace = suffix = ""; 5464#endif 5465 break; 5466 case EXTATTR_NAMESPACE_SYSTEM: 5467 prefix = "freebsd:"; 5468 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; 5469 suffix = ":"; 5470 break; 5471 case EXTATTR_NAMESPACE_EMPTY: 5472 default: 5473 return (EINVAL); 5474 } 5475 if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, 5476 name) >= size) { 5477 return (ENAMETOOLONG); 5478 } 5479 return (0); 5480} 5481 5482/* 5483 * Vnode operating to retrieve a named extended attribute. 5484 */ 5485static int 5486zfs_getextattr(struct vop_getextattr_args *ap) 5487/* 5488vop_getextattr { 5489 IN struct vnode *a_vp; 5490 IN int a_attrnamespace; 5491 IN const char *a_name; 5492 INOUT struct uio *a_uio; 5493 OUT size_t *a_size; 5494 IN struct ucred *a_cred; 5495 IN struct thread *a_td; 5496}; 5497*/ 5498{ 5499 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5500 struct thread *td = ap->a_td; 5501 struct nameidata nd; 5502 char attrname[255]; 5503 struct vattr va; 5504 vnode_t *xvp = NULL, *vp; 5505 int error, flags; 5506 5507 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5508 ap->a_cred, ap->a_td, VREAD); 5509 if (error != 0) 5510 return (error); 5511 5512 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5513 sizeof(attrname)); 5514 if (error != 0) 5515 return (error); 5516 5517 ZFS_ENTER(zfsvfs); 5518 5519 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5520 LOOKUP_XATTR); 5521 if (error != 0) { 5522 ZFS_EXIT(zfsvfs); 5523 return (error); 5524 } 5525 5526 flags = FREAD; 5527 NDINIT_ATVP(&nd, LOOKUP, 
NOFOLLOW, UIO_SYSSPACE, attrname, 5528 xvp, td); 5529 error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); 5530 vp = nd.ni_vp; 5531 NDFREE(&nd, NDF_ONLY_PNBUF); 5532 if (error != 0) { 5533 ZFS_EXIT(zfsvfs); 5534 if (error == ENOENT) 5535 error = ENOATTR; 5536 return (error); 5537 } 5538 5539 if (ap->a_size != NULL) { 5540 error = VOP_GETATTR(vp, &va, ap->a_cred); 5541 if (error == 0) 5542 *ap->a_size = (size_t)va.va_size; 5543 } else if (ap->a_uio != NULL) 5544 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); 5545 5546 VOP_UNLOCK(vp, 0); 5547 vn_close(vp, flags, ap->a_cred, td); 5548 ZFS_EXIT(zfsvfs); 5549 5550 return (error); 5551} 5552 5553/* 5554 * Vnode operation to remove a named attribute. 5555 */ 5556int 5557zfs_deleteextattr(struct vop_deleteextattr_args *ap) 5558/* 5559vop_deleteextattr { 5560 IN struct vnode *a_vp; 5561 IN int a_attrnamespace; 5562 IN const char *a_name; 5563 IN struct ucred *a_cred; 5564 IN struct thread *a_td; 5565}; 5566*/ 5567{ 5568 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5569 struct thread *td = ap->a_td; 5570 struct nameidata nd; 5571 char attrname[255]; 5572 struct vattr va; 5573 vnode_t *xvp = NULL, *vp; 5574 int error, flags; 5575 5576 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5577 ap->a_cred, ap->a_td, VWRITE); 5578 if (error != 0) 5579 return (error); 5580 5581 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5582 sizeof(attrname)); 5583 if (error != 0) 5584 return (error); 5585 5586 ZFS_ENTER(zfsvfs); 5587 5588 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5589 LOOKUP_XATTR); 5590 if (error != 0) { 5591 ZFS_EXIT(zfsvfs); 5592 return (error); 5593 } 5594 5595 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, 5596 UIO_SYSSPACE, attrname, xvp, td); 5597 error = namei(&nd); 5598 vp = nd.ni_vp; 5599 if (error != 0) { 5600 ZFS_EXIT(zfsvfs); 5601 NDFREE(&nd, NDF_ONLY_PNBUF); 5602 if (error == ENOENT) 5603 error = ENOATTR; 5604 return (error); 5605 } 
5606 5607 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); 5608 NDFREE(&nd, NDF_ONLY_PNBUF); 5609 5610 vput(nd.ni_dvp); 5611 if (vp == nd.ni_dvp) 5612 vrele(vp); 5613 else 5614 vput(vp); 5615 ZFS_EXIT(zfsvfs); 5616 5617 return (error); 5618} 5619 5620/* 5621 * Vnode operation to set a named attribute. 5622 */ 5623static int 5624zfs_setextattr(struct vop_setextattr_args *ap) 5625/* 5626vop_setextattr { 5627 IN struct vnode *a_vp; 5628 IN int a_attrnamespace; 5629 IN const char *a_name; 5630 INOUT struct uio *a_uio; 5631 IN struct ucred *a_cred; 5632 IN struct thread *a_td; 5633}; 5634*/ 5635{ 5636 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5637 struct thread *td = ap->a_td; 5638 struct nameidata nd; 5639 char attrname[255]; 5640 struct vattr va; 5641 vnode_t *xvp = NULL, *vp; 5642 int error, flags; 5643 5644 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5645 ap->a_cred, ap->a_td, VWRITE); 5646 if (error != 0) 5647 return (error); 5648 5649 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5650 sizeof(attrname)); 5651 if (error != 0) 5652 return (error); 5653 5654 ZFS_ENTER(zfsvfs); 5655 5656 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5657 LOOKUP_XATTR | CREATE_XATTR_DIR); 5658 if (error != 0) { 5659 ZFS_EXIT(zfsvfs); 5660 return (error); 5661 } 5662 5663 flags = FFLAGS(O_WRONLY | O_CREAT); 5664 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, 5665 xvp, td); 5666 error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); 5667 vp = nd.ni_vp; 5668 NDFREE(&nd, NDF_ONLY_PNBUF); 5669 if (error != 0) { 5670 ZFS_EXIT(zfsvfs); 5671 return (error); 5672 } 5673 5674 VATTR_NULL(&va); 5675 va.va_size = 0; 5676 error = VOP_SETATTR(vp, &va, ap->a_cred); 5677 if (error == 0) 5678 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); 5679 5680 VOP_UNLOCK(vp, 0); 5681 vn_close(vp, flags, ap->a_cred, td); 5682 ZFS_EXIT(zfsvfs); 5683 5684 return (error); 5685} 5686 5687/* 5688 * Vnode operation to retrieve extended 
attributes on a vnode. 5689 */ 5690static int 5691zfs_listextattr(struct vop_listextattr_args *ap) 5692/* 5693vop_listextattr { 5694 IN struct vnode *a_vp; 5695 IN int a_attrnamespace; 5696 INOUT struct uio *a_uio; 5697 OUT size_t *a_size; 5698 IN struct ucred *a_cred; 5699 IN struct thread *a_td; 5700}; 5701*/ 5702{ 5703 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5704 struct thread *td = ap->a_td; 5705 struct nameidata nd; 5706 char attrprefix[16]; 5707 u_char dirbuf[sizeof(struct dirent)]; 5708 struct dirent *dp; 5709 struct iovec aiov; 5710 struct uio auio, *uio = ap->a_uio; 5711 size_t *sizep = ap->a_size; 5712 size_t plen; 5713 vnode_t *xvp = NULL, *vp; 5714 int done, error, eof, pos; 5715 5716 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5717 ap->a_cred, ap->a_td, VREAD); 5718 if (error != 0) 5719 return (error); 5720 5721 error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, 5722 sizeof(attrprefix)); 5723 if (error != 0) 5724 return (error); 5725 plen = strlen(attrprefix); 5726 5727 ZFS_ENTER(zfsvfs); 5728 5729 if (sizep != NULL) 5730 *sizep = 0; 5731 5732 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5733 LOOKUP_XATTR); 5734 if (error != 0) { 5735 ZFS_EXIT(zfsvfs); 5736 /* 5737 * ENOATTR means that the EA directory does not yet exist, 5738 * i.e. there are no extended attributes there. 
5739 */ 5740 if (error == ENOATTR) 5741 error = 0; 5742 return (error); 5743 } 5744 5745 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, 5746 UIO_SYSSPACE, ".", xvp, td); 5747 error = namei(&nd); 5748 vp = nd.ni_vp; 5749 NDFREE(&nd, NDF_ONLY_PNBUF); 5750 if (error != 0) { 5751 ZFS_EXIT(zfsvfs); 5752 return (error); 5753 } 5754 5755 auio.uio_iov = &aiov; 5756 auio.uio_iovcnt = 1; 5757 auio.uio_segflg = UIO_SYSSPACE; 5758 auio.uio_td = td; 5759 auio.uio_rw = UIO_READ; 5760 auio.uio_offset = 0; 5761 5762 do { 5763 u_char nlen; 5764 5765 aiov.iov_base = (void *)dirbuf; 5766 aiov.iov_len = sizeof(dirbuf); 5767 auio.uio_resid = sizeof(dirbuf); 5768 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); 5769 done = sizeof(dirbuf) - auio.uio_resid; 5770 if (error != 0) 5771 break; 5772 for (pos = 0; pos < done;) { 5773 dp = (struct dirent *)(dirbuf + pos); 5774 pos += dp->d_reclen; 5775 /* 5776 * XXX: Temporarily we also accept DT_UNKNOWN, as this 5777 * is what we get when attribute was created on Solaris. 5778 */ 5779 if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) 5780 continue; 5781 if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) 5782 continue; 5783 else if (strncmp(dp->d_name, attrprefix, plen) != 0) 5784 continue; 5785 nlen = dp->d_namlen - plen; 5786 if (sizep != NULL) 5787 *sizep += 1 + nlen; 5788 else if (uio != NULL) { 5789 /* 5790 * Format of extattr name entry is one byte for 5791 * length and the rest for name. 
5792 */ 5793 error = uiomove(&nlen, 1, uio->uio_rw, uio); 5794 if (error == 0) { 5795 error = uiomove(dp->d_name + plen, nlen, 5796 uio->uio_rw, uio); 5797 } 5798 if (error != 0) 5799 break; 5800 } 5801 } 5802 } while (!eof && error == 0); 5803 5804 vput(vp); 5805 ZFS_EXIT(zfsvfs); 5806 5807 return (error); 5808} 5809 5810int 5811zfs_freebsd_getacl(ap) 5812 struct vop_getacl_args /* { 5813 struct vnode *vp; 5814 acl_type_t type; 5815 struct acl *aclp; 5816 struct ucred *cred; 5817 struct thread *td; 5818 } */ *ap; 5819{ 5820 int error; 5821 vsecattr_t vsecattr; 5822 5823 if (ap->a_type != ACL_TYPE_NFS4) 5824 return (EINVAL); 5825 5826 vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; 5827 if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) 5828 return (error); 5829 5830 error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); 5831 if (vsecattr.vsa_aclentp != NULL) 5832 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); 5833 5834 return (error); 5835} 5836 5837int 5838zfs_freebsd_setacl(ap) 5839 struct vop_setacl_args /* { 5840 struct vnode *vp; 5841 acl_type_t type; 5842 struct acl *aclp; 5843 struct ucred *cred; 5844 struct thread *td; 5845 } */ *ap; 5846{ 5847 int error; 5848 vsecattr_t vsecattr; 5849 int aclbsize; /* size of acl list in bytes */ 5850 aclent_t *aaclp; 5851 5852 if (ap->a_type != ACL_TYPE_NFS4) 5853 return (EINVAL); 5854 5855 if (ap->a_aclp == NULL) 5856 return (EINVAL); 5857 5858 if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) 5859 return (EINVAL); 5860 5861 /* 5862 * With NFSv4 ACLs, chmod(2) may need to add additional entries, 5863 * splitting every entry into two and appending "canonical six" 5864 * entries at the end. Don't allow for setting an ACL that would 5865 * cause chmod(2) to run out of ACL entries. 
5866 */ 5867 if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) 5868 return (ENOSPC); 5869 5870 error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); 5871 if (error != 0) 5872 return (error); 5873 5874 vsecattr.vsa_mask = VSA_ACE; 5875 aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); 5876 vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); 5877 aaclp = vsecattr.vsa_aclentp; 5878 vsecattr.vsa_aclentsz = aclbsize; 5879 5880 aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); 5881 error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); 5882 kmem_free(aaclp, aclbsize); 5883 5884 return (error); 5885} 5886 5887int 5888zfs_freebsd_aclcheck(ap) 5889 struct vop_aclcheck_args /* { 5890 struct vnode *vp; 5891 acl_type_t type; 5892 struct acl *aclp; 5893 struct ucred *cred; 5894 struct thread *td; 5895 } */ *ap; 5896{ 5897 5898 return (EOPNOTSUPP); 5899} 5900 5901static int 5902zfs_vptocnp(struct vop_vptocnp_args *ap) 5903{ 5904 vnode_t *covered_vp; 5905 vnode_t *vp = ap->a_vp;; 5906 zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; 5907 znode_t *zp = VTOZ(vp); 5908 int ltype; 5909 int error; 5910 5911 ZFS_ENTER(zfsvfs); 5912 ZFS_VERIFY_ZP(zp); 5913 5914 /* 5915 * If we are a snapshot mounted under .zfs, run the operation 5916 * on the covered vnode. 
5917 */ 5918 if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { 5919 char name[MAXNAMLEN + 1]; 5920 znode_t *dzp; 5921 size_t len; 5922 5923 error = zfs_znode_parent_and_name(zp, &dzp, name); 5924 if (error == 0) { 5925 len = strlen(name); 5926 if (*ap->a_buflen < len) 5927 error = SET_ERROR(ENOMEM); 5928 } 5929 if (error == 0) { 5930 *ap->a_buflen -= len; 5931 bcopy(name, ap->a_buf + *ap->a_buflen, len); 5932 *ap->a_vpp = ZTOV(dzp); 5933 } 5934 ZFS_EXIT(zfsvfs); 5935 return (error); 5936 } 5937 ZFS_EXIT(zfsvfs); 5938 5939 covered_vp = vp->v_mount->mnt_vnodecovered; 5940 vhold(covered_vp); 5941 ltype = VOP_ISLOCKED(vp); 5942 VOP_UNLOCK(vp, 0); 5943 error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); 5944 if (error == 0) { 5945 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, 5946 ap->a_buf, ap->a_buflen); 5947 vput(covered_vp); 5948 } 5949 vn_lock(vp, ltype | LK_RETRY); 5950 if ((vp->v_iflag & VI_DOOMED) != 0) 5951 error = SET_ERROR(ENOENT); 5952 return (error); 5953} 5954 5955#ifdef DIAGNOSTIC 5956static int 5957zfs_lock(ap) 5958 struct vop_lock1_args /* { 5959 struct vnode *a_vp; 5960 int a_flags; 5961 char *file; 5962 int line; 5963 } */ *ap; 5964{ 5965 vnode_t *vp; 5966 znode_t *zp; 5967 int err; 5968 5969 err = vop_stdlock(ap); 5970 if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { 5971 vp = ap->a_vp; 5972 zp = vp->v_data; 5973 if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && 5974 zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) 5975 VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); 5976 } 5977 return (err); 5978} 5979#endif 5980 5981struct vop_vector zfs_vnodeops; 5982struct vop_vector zfs_fifoops; 5983struct vop_vector zfs_shareops; 5984 5985struct vop_vector zfs_vnodeops = { 5986 .vop_default = &default_vnodeops, 5987 .vop_inactive = zfs_freebsd_inactive, 5988 .vop_reclaim = zfs_freebsd_reclaim, 5989 .vop_access = zfs_freebsd_access, 5990 .vop_lookup = zfs_cache_lookup, 5991 .vop_cachedlookup = zfs_freebsd_lookup, 
5992 .vop_getattr = zfs_freebsd_getattr, 5993 .vop_setattr = zfs_freebsd_setattr, 5994 .vop_create = zfs_freebsd_create, 5995 .vop_mknod = zfs_freebsd_create, 5996 .vop_mkdir = zfs_freebsd_mkdir, 5997 .vop_readdir = zfs_freebsd_readdir, 5998 .vop_fsync = zfs_freebsd_fsync, 5999 .vop_open = zfs_freebsd_open, 6000 .vop_close = zfs_freebsd_close, 6001 .vop_rmdir = zfs_freebsd_rmdir, 6002 .vop_ioctl = zfs_freebsd_ioctl, 6003 .vop_link = zfs_freebsd_link, 6004 .vop_symlink = zfs_freebsd_symlink, 6005 .vop_readlink = zfs_freebsd_readlink, 6006 .vop_read = zfs_freebsd_read, 6007 .vop_write = zfs_freebsd_write, 6008 .vop_remove = zfs_freebsd_remove, 6009 .vop_rename = zfs_freebsd_rename, 6010 .vop_pathconf = zfs_freebsd_pathconf, 6011 .vop_bmap = zfs_freebsd_bmap, 6012 .vop_fid = zfs_freebsd_fid, 6013 .vop_getextattr = zfs_getextattr, 6014 .vop_deleteextattr = zfs_deleteextattr, 6015 .vop_setextattr = zfs_setextattr, 6016 .vop_listextattr = zfs_listextattr, 6017 .vop_getacl = zfs_freebsd_getacl, 6018 .vop_setacl = zfs_freebsd_setacl, 6019 .vop_aclcheck = zfs_freebsd_aclcheck, 6020 .vop_getpages = zfs_freebsd_getpages, 6021 .vop_putpages = zfs_freebsd_putpages, 6022 .vop_vptocnp = zfs_vptocnp, 6023#ifdef DIAGNOSTIC 6024 .vop_lock1 = zfs_lock, 6025#endif 6026}; 6027 6028struct vop_vector zfs_fifoops = { 6029 .vop_default = &fifo_specops, 6030 .vop_fsync = zfs_freebsd_fsync, 6031 .vop_access = zfs_freebsd_access, 6032 .vop_getattr = zfs_freebsd_getattr, 6033 .vop_inactive = zfs_freebsd_inactive, 6034 .vop_read = VOP_PANIC, 6035 .vop_reclaim = zfs_freebsd_reclaim, 6036 .vop_setattr = zfs_freebsd_setattr, 6037 .vop_write = VOP_PANIC, 6038 .vop_pathconf = zfs_freebsd_pathconf, 6039 .vop_fid = zfs_freebsd_fid, 6040 .vop_getacl = zfs_freebsd_getacl, 6041 .vop_setacl = zfs_freebsd_setacl, 6042 .vop_aclcheck = zfs_freebsd_aclcheck, 6043}; 6044 6045/* 6046 * special share hidden files vnode operations template 6047 */ 6048struct vop_vector zfs_shareops = { 6049 .vop_default = 
&default_vnodeops, 6050 .vop_access = zfs_freebsd_access, 6051 .vop_inactive = zfs_freebsd_inactive, 6052 .vop_reclaim = zfs_freebsd_reclaim, 6053 .vop_fid = zfs_freebsd_fid, 6054 .vop_pathconf = zfs_freebsd_pathconf, 6055}; 6056