zfs_vnops.c revision 330991
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 * Copyright 2017 Nexenta Systems, Inc. 
27 */ 28 29/* Portions Copyright 2007 Jeremy Teo */ 30/* Portions Copyright 2010 Robert Milkowski */ 31 32#include <sys/types.h> 33#include <sys/param.h> 34#include <sys/time.h> 35#include <sys/systm.h> 36#include <sys/sysmacros.h> 37#include <sys/resource.h> 38#include <sys/vfs.h> 39#include <sys/vm.h> 40#include <sys/vnode.h> 41#include <sys/file.h> 42#include <sys/stat.h> 43#include <sys/kmem.h> 44#include <sys/taskq.h> 45#include <sys/uio.h> 46#include <sys/atomic.h> 47#include <sys/namei.h> 48#include <sys/mman.h> 49#include <sys/cmn_err.h> 50#include <sys/errno.h> 51#include <sys/unistd.h> 52#include <sys/zfs_dir.h> 53#include <sys/zfs_ioctl.h> 54#include <sys/fs/zfs.h> 55#include <sys/dmu.h> 56#include <sys/dmu_objset.h> 57#include <sys/spa.h> 58#include <sys/txg.h> 59#include <sys/dbuf.h> 60#include <sys/zap.h> 61#include <sys/sa.h> 62#include <sys/dirent.h> 63#include <sys/policy.h> 64#include <sys/sunddi.h> 65#include <sys/filio.h> 66#include <sys/sid.h> 67#include <sys/zfs_ctldir.h> 68#include <sys/zfs_fuid.h> 69#include <sys/zfs_sa.h> 70#include <sys/zfs_rlock.h> 71#include <sys/extdirent.h> 72#include <sys/kidmap.h> 73#include <sys/bio.h> 74#include <sys/buf.h> 75#include <sys/sched.h> 76#include <sys/acl.h> 77#include <vm/vm_param.h> 78#include <sys/zil.h> 79 80/* 81 * Programming rules. 82 * 83 * Each vnode op performs some logical unit of work. To do this, the ZPL must 84 * properly lock its in-core state, create a DMU transaction, do the work, 85 * record this work in the intent log (ZIL), commit the DMU transaction, 86 * and wait for the intent log to commit if it is a synchronous operation. 87 * Moreover, the vnode ops must work in both normal and log replay context. 88 * The ordering of events is important to avoid deadlocks and references 89 * to freed memory. The example below illustrates the following Big Rules: 90 * 91 * (1) A check must be made in each zfs thread for a mounted file system. 
92 * This is done avoiding races using ZFS_ENTER(zfsvfs). 93 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 94 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 95 * can return EIO from the calling function. 96 * 97 * (2) VN_RELE() should always be the last thing except for zil_commit() 98 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 99 * First, if it's the last reference, the vnode/znode 100 * can be freed, so the zp may point to freed memory. Second, the last 101 * reference will call zfs_zinactive(), which may induce a lot of work -- 102 * pushing cached pages (which acquires range locks) and syncing out 103 * cached atime changes. Third, zfs_zinactive() may require a new tx, 104 * which could deadlock the system if you were already holding one. 105 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 106 * 107 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 108 * as they can span dmu_tx_assign() calls. 109 * 110 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 111 * dmu_tx_assign(). This is critical because we don't want to block 112 * while holding locks. 113 * 114 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This 115 * reduces lock contention and CPU usage when we must wait (note that if 116 * throughput is constrained by the storage, nearly every transaction 117 * must wait). 118 * 119 * Note, in particular, that if a lock is sometimes acquired before 120 * the tx assigns, and sometimes after (e.g. z_lock), then failing 121 * to use a non-blocking assign can deadlock the system. The scenario: 122 * 123 * Thread A has grabbed a lock before calling dmu_tx_assign(). 124 * Thread B is in an already-assigned tx, and blocks for this lock. 125 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 126 * forever, because the previous txg can't quiesce until B's tx commits. 
127 * 128 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 129 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 130 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, 131 * to indicate that this operation has already called dmu_tx_wait(). 132 * This will ensure that we don't retry forever, waiting a short bit 133 * each time. 134 * 135 * (5) If the operation succeeded, generate the intent log entry for it 136 * before dropping locks. This ensures that the ordering of events 137 * in the intent log matches the order in which they actually occurred. 138 * During ZIL replay the zfs_log_* functions will update the sequence 139 * number to indicate the zil transaction has replayed. 140 * 141 * (6) At the end of each vnode op, the DMU tx must always commit, 142 * regardless of whether there were any errors. 143 * 144 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 145 * to ensure that synchronous semantics are provided when necessary. 146 * 147 * In general, this is how things should be ordered in each vnode op: 148 * 149 * ZFS_ENTER(zfsvfs); // exit if unmounted 150 * top: 151 * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) 152 * rw_enter(...); // grab any other locks you need 153 * tx = dmu_tx_create(...); // get DMU tx 154 * dmu_tx_hold_*(); // hold each object you might modify 155 * error = dmu_tx_assign(tx, (waited ? 
 *	    TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/*
 * Open a file.  Denies a write-mode open of an append-only file unless
 * the open itself is in append mode, runs the anti-virus scanner on
 * regular, non-quarantined files when vscan is enabled, and counts
 * synchronous (FSYNC/FDSYNC) opens in the znode.
 */
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Append-only files may only be opened for write in append mode. */
	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Virus scan eligible files on open (if scanning is enabled). */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Close a file: release byte-range and share locks held by this process,
 * drop the synchronous-open count taken in zfs_open(), and virus-scan
 * the file on last close when vscan is enabled.
 */
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* ESRCH from the DMU means "no such hole/data region". */
	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	/* Never move the offset backwards. */
	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/*
 * Vnode ioctl handler.  Supports the seek-hole/seek-data ioctls (via
 * zfs_holey()), the fill-count query on illumos, and fakes out a few
 * legacy ioctls for compatibility.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}

/*
 * Look up and shared-busy the page backing [start, start + PAGESIZE) in
 * vp's VM object, waiting out any exclusive busy holder.  On success the
 * page is write-protected in the pmap and the (DEV_BSIZE-trimmed) range
 * is marked clean; the object pip count is bumped (page_unbusy() drops
 * it).  Returns NULL if there is no valid resident page.  Caller must
 * hold the object write-locked.
 */
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	 * aligned boundaries, if the range is not aligned.  As a result a
	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
	 * the whole page would be considered clean despite having some dirty
	 * data.  For this reason we should shrink the range to DEV_BSIZE
	 * aligned boundaries before calling vm_page_clear_dirty.
	 */
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp != NULL) {
			ASSERT(!pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_object_pip_add(obj, 1);
			pmap_remove_write(pp);
			/* nbytes may have shrunk to 0 after DEV_BSIZE trim. */
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}

/*
 * Undo page_busy(): drop the shared busy and the object pip reference.
 */
static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}

/*
 * Look up and wire-hold the valid resident page backing `start' in vp's
 * VM object, waiting out any exclusive busy holder.  Returns NULL if the
 * page is absent or invalid.  Caller must hold the object write-locked;
 * release the page with page_unhold().
 */
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}

/*
 * Drop the hold taken by page_hold().
 */
static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 *	On Write:	If we find a memory mapped page, we write to *both*
 *			the page and the dmu buffer.
509 */ 510static void 511update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, 512 int segflg, dmu_tx_t *tx) 513{ 514 vm_object_t obj; 515 struct sf_buf *sf; 516 caddr_t va; 517 int off; 518 519 ASSERT(segflg != UIO_NOCOPY); 520 ASSERT(vp->v_mount != NULL); 521 obj = vp->v_object; 522 ASSERT(obj != NULL); 523 524 off = start & PAGEOFFSET; 525 zfs_vmobject_wlock(obj); 526 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 527 vm_page_t pp; 528 int nbytes = imin(PAGESIZE - off, len); 529 530 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { 531 zfs_vmobject_wunlock(obj); 532 533 va = zfs_map_page(pp, &sf); 534 (void) dmu_read(os, oid, start+off, nbytes, 535 va+off, DMU_READ_PREFETCH);; 536 zfs_unmap_page(sf); 537 538 zfs_vmobject_wlock(obj); 539 page_unbusy(pp); 540 } 541 len -= nbytes; 542 off = 0; 543 } 544 vm_object_pip_wakeupn(obj, 0); 545 zfs_vmobject_wunlock(obj); 546} 547 548/* 549 * Read with UIO_NOCOPY flag means that sendfile(2) requests 550 * ZFS to populate a range of page cache pages with data. 551 * 552 * NOTE: this function could be optimized to pre-allocate 553 * all pages in advance, drain exclusive busy on all of them, 554 * map them into contiguous KVA region and populate them 555 * in one single dmu_read() call. 
556 */ 557static int 558mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) 559{ 560 znode_t *zp = VTOZ(vp); 561 objset_t *os = zp->z_zfsvfs->z_os; 562 struct sf_buf *sf; 563 vm_object_t obj; 564 vm_page_t pp; 565 int64_t start; 566 caddr_t va; 567 int len = nbytes; 568 int off; 569 int error = 0; 570 571 ASSERT(uio->uio_segflg == UIO_NOCOPY); 572 ASSERT(vp->v_mount != NULL); 573 obj = vp->v_object; 574 ASSERT(obj != NULL); 575 ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); 576 577 zfs_vmobject_wlock(obj); 578 for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { 579 int bytes = MIN(PAGESIZE, len); 580 581 pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | 582 VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); 583 if (pp->valid == 0) { 584 zfs_vmobject_wunlock(obj); 585 va = zfs_map_page(pp, &sf); 586 error = dmu_read(os, zp->z_id, start, bytes, va, 587 DMU_READ_PREFETCH); 588 if (bytes != PAGESIZE && error == 0) 589 bzero(va + bytes, PAGESIZE - bytes); 590 zfs_unmap_page(sf); 591 zfs_vmobject_wlock(obj); 592 vm_page_sunbusy(pp); 593 vm_page_lock(pp); 594 if (error) { 595 if (pp->wire_count == 0 && pp->valid == 0 && 596 !vm_page_busied(pp)) 597 vm_page_free(pp); 598 } else { 599 pp->valid = VM_PAGE_BITS_ALL; 600 vm_page_activate(pp); 601 } 602 vm_page_unlock(pp); 603 } else { 604 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); 605 vm_page_sunbusy(pp); 606 } 607 if (error) 608 break; 609 uio->uio_resid -= bytes; 610 uio->uio_offset += bytes; 611 len -= bytes; 612 } 613 zfs_vmobject_wunlock(obj); 614 return (error); 615} 616 617/* 618 * When a file is memory mapped, we must keep the IO data synchronized 619 * between the DMU cache and the memory mapped pages. What this means: 620 * 621 * On Read: We "read" preferentially from memory mapped pages, 622 * else we default from the dmu buffer. 623 * 624 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 625 * the file is memory mapped. 
626 */ 627static int 628mappedread(vnode_t *vp, int nbytes, uio_t *uio) 629{ 630 znode_t *zp = VTOZ(vp); 631 vm_object_t obj; 632 int64_t start; 633 caddr_t va; 634 int len = nbytes; 635 int off; 636 int error = 0; 637 638 ASSERT(vp->v_mount != NULL); 639 obj = vp->v_object; 640 ASSERT(obj != NULL); 641 642 start = uio->uio_loffset; 643 off = start & PAGEOFFSET; 644 zfs_vmobject_wlock(obj); 645 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 646 vm_page_t pp; 647 uint64_t bytes = MIN(PAGESIZE - off, len); 648 649 if (pp = page_hold(vp, start)) { 650 struct sf_buf *sf; 651 caddr_t va; 652 653 zfs_vmobject_wunlock(obj); 654 va = zfs_map_page(pp, &sf); 655#ifdef illumos 656 error = uiomove(va + off, bytes, UIO_READ, uio); 657#else 658 error = vn_io_fault_uiomove(va + off, bytes, uio); 659#endif 660 zfs_unmap_page(sf); 661 zfs_vmobject_wlock(obj); 662 page_unhold(pp); 663 } else { 664 zfs_vmobject_wunlock(obj); 665 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 666 uio, bytes); 667 zfs_vmobject_wlock(obj); 668 } 669 len -= bytes; 670 off = 0; 671 if (error) 672 break; 673 } 674 zfs_vmobject_wunlock(obj); 675 return (error); 676} 677 678offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 679 680/* 681 * Read bytes from specified file into supplied buffer. 682 * 683 * IN: vp - vnode of file to be read from. 684 * uio - structure supplying read location, range info, 685 * and return buffer. 686 * ioflag - SYNC flags; used to provide FRSYNC semantics. 687 * cr - credentials of caller. 688 * ct - caller context 689 * 690 * OUT: uio - updated offset and range, buffer filled. 691 * 692 * RETURN: 0 on success, error code on failure. 
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Quarantined files may not be read. */
	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (zfsvfs->z_log &&
	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

#ifdef illumos
	/* Zero-copy (xuio) reads: pre-stage loaned arc buffers. */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}
#endif	/* illumos */

	/* Read in zfs_read_chunk_size chunks, never crossing a chunk edge. */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

#ifdef __FreeBSD__
		if (uio->uio_segflg == UIO_NOCOPY)
			error = mappedread_sf(vp, nbytes, uio);
		else
#endif /* __FreeBSD__ */
		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 * IN:	vp	- vnode of file to be written to.
 *	uio	- structure supplying write location, range info,
 *		  and data buffer.
 *	ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *		  set if in append mode.
 *	cr	- credentials of caller.
839 * ct - caller context (NFS/CIFS fem monitor only) 840 * 841 * OUT: uio - updated offset and range. 842 * 843 * RETURN: 0 on success, error code on failure. 844 * 845 * Timestamps: 846 * vp - ctime|mtime updated if byte count > 0 847 */ 848 849/* ARGSUSED */ 850static int 851zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 852{ 853 znode_t *zp = VTOZ(vp); 854 rlim64_t limit = MAXOFFSET_T; 855 ssize_t start_resid = uio->uio_resid; 856 ssize_t tx_bytes; 857 uint64_t end_size; 858 dmu_tx_t *tx; 859 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 860 zilog_t *zilog; 861 offset_t woff; 862 ssize_t n, nbytes; 863 rl_t *rl; 864 int max_blksz = zfsvfs->z_max_blksz; 865 int error = 0; 866 arc_buf_t *abuf; 867 iovec_t *aiov = NULL; 868 xuio_t *xuio = NULL; 869 int i_iov = 0; 870 int iovcnt = uio->uio_iovcnt; 871 iovec_t *iovp = uio->uio_iov; 872 int write_eof; 873 int count = 0; 874 sa_bulk_attr_t bulk[4]; 875 uint64_t mtime[2], ctime[2]; 876 877 /* 878 * Fasttrack empty write 879 */ 880 n = start_resid; 881 if (n == 0) 882 return (0); 883 884 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 885 limit = MAXOFFSET_T; 886 887 ZFS_ENTER(zfsvfs); 888 ZFS_VERIFY_ZP(zp); 889 890 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 891 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 892 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 893 &zp->z_size, 8); 894 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 895 &zp->z_pflags, 8); 896 897 /* 898 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our 899 * callers might not be able to detect properly that we are read-only, 900 * so check it explicitly here. 901 */ 902 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 903 ZFS_EXIT(zfsvfs); 904 return (SET_ERROR(EROFS)); 905 } 906 907 /* 908 * If immutable or not appending then return EPERM. 909 * Intentionally allow ZFS_READONLY through here. 
910 * See zfs_zaccess_common() 911 */ 912 if ((zp->z_pflags & ZFS_IMMUTABLE) || 913 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 914 (uio->uio_loffset < zp->z_size))) { 915 ZFS_EXIT(zfsvfs); 916 return (SET_ERROR(EPERM)); 917 } 918 919 zilog = zfsvfs->z_log; 920 921 /* 922 * Validate file offset 923 */ 924 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; 925 if (woff < 0) { 926 ZFS_EXIT(zfsvfs); 927 return (SET_ERROR(EINVAL)); 928 } 929 930 /* 931 * Check for mandatory locks before calling zfs_range_lock() 932 * in order to prevent a deadlock with locks set via fcntl(). 933 */ 934 if (MANDMODE((mode_t)zp->z_mode) && 935 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 936 ZFS_EXIT(zfsvfs); 937 return (error); 938 } 939 940#ifdef illumos 941 /* 942 * Pre-fault the pages to ensure slow (eg NFS) pages 943 * don't hold up txg. 944 * Skip this if uio contains loaned arc_buf. 945 */ 946 if ((uio->uio_extflg == UIO_XUIO) && 947 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) 948 xuio = (xuio_t *)uio; 949 else 950 uio_prefaultpages(MIN(n, max_blksz), uio); 951#endif 952 953 /* 954 * If in append mode, set the io offset pointer to eof. 955 */ 956 if (ioflag & FAPPEND) { 957 /* 958 * Obtain an appending range lock to guarantee file append 959 * semantics. We reset the write offset once we have the lock. 960 */ 961 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 962 woff = rl->r_off; 963 if (rl->r_len == UINT64_MAX) { 964 /* 965 * We overlocked the file because this write will cause 966 * the file block size to increase. 967 * Note that zp_size cannot change with this lock held. 968 */ 969 woff = zp->z_size; 970 } 971 uio->uio_loffset = woff; 972 } else { 973 /* 974 * Note that if the file block size will change as a result of 975 * this write, then this range lock will lock the entire file 976 * so that we can re-write the block safely. 
977 */ 978 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 979 } 980 981 if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { 982 zfs_range_unlock(rl); 983 ZFS_EXIT(zfsvfs); 984 return (EFBIG); 985 } 986 987 if (woff >= limit) { 988 zfs_range_unlock(rl); 989 ZFS_EXIT(zfsvfs); 990 return (SET_ERROR(EFBIG)); 991 } 992 993 if ((woff + n) > limit || woff > (limit - n)) 994 n = limit - woff; 995 996 /* Will this write extend the file length? */ 997 write_eof = (woff + n > zp->z_size); 998 999 end_size = MAX(zp->z_size, woff + n); 1000 1001 /* 1002 * Write the file in reasonable size chunks. Each chunk is written 1003 * in a separate transaction; this keeps the intent log records small 1004 * and allows us to do more fine-grained space accounting. 1005 */ 1006 while (n > 0) { 1007 abuf = NULL; 1008 woff = uio->uio_loffset; 1009 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 1010 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 1011 if (abuf != NULL) 1012 dmu_return_arcbuf(abuf); 1013 error = SET_ERROR(EDQUOT); 1014 break; 1015 } 1016 1017 if (xuio && abuf == NULL) { 1018 ASSERT(i_iov < iovcnt); 1019 aiov = &iovp[i_iov]; 1020 abuf = dmu_xuio_arcbuf(xuio, i_iov); 1021 dmu_xuio_clear(xuio, i_iov); 1022 DTRACE_PROBE3(zfs_cp_write, int, i_iov, 1023 iovec_t *, aiov, arc_buf_t *, abuf); 1024 ASSERT((aiov->iov_base == abuf->b_data) || 1025 ((char *)aiov->iov_base - (char *)abuf->b_data + 1026 aiov->iov_len == arc_buf_size(abuf))); 1027 i_iov++; 1028 } else if (abuf == NULL && n >= max_blksz && 1029 woff >= zp->z_size && 1030 P2PHASE(woff, max_blksz) == 0 && 1031 zp->z_blksz == max_blksz) { 1032 /* 1033 * This write covers a full block. "Borrow" a buffer 1034 * from the dmu so that we can fill it before we enter 1035 * a transaction. This avoids the possibility of 1036 * holding up the transaction if the data copy hangs 1037 * up on a pagefault (e.g., from an NFS server mapping). 
1038 */ 1039 size_t cbytes; 1040 1041 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 1042 max_blksz); 1043 ASSERT(abuf != NULL); 1044 ASSERT(arc_buf_size(abuf) == max_blksz); 1045 if (error = uiocopy(abuf->b_data, max_blksz, 1046 UIO_WRITE, uio, &cbytes)) { 1047 dmu_return_arcbuf(abuf); 1048 break; 1049 } 1050 ASSERT(cbytes == max_blksz); 1051 } 1052 1053 /* 1054 * Start a transaction. 1055 */ 1056 tx = dmu_tx_create(zfsvfs->z_os); 1057 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1058 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 1059 zfs_sa_upgrade_txholds(tx, zp); 1060 error = dmu_tx_assign(tx, TXG_WAIT); 1061 if (error) { 1062 dmu_tx_abort(tx); 1063 if (abuf != NULL) 1064 dmu_return_arcbuf(abuf); 1065 break; 1066 } 1067 1068 /* 1069 * If zfs_range_lock() over-locked we grow the blocksize 1070 * and then reduce the lock range. This will only happen 1071 * on the first iteration since zfs_range_reduce() will 1072 * shrink down r_len to the appropriate size. 1073 */ 1074 if (rl->r_len == UINT64_MAX) { 1075 uint64_t new_blksz; 1076 1077 if (zp->z_blksz > max_blksz) { 1078 /* 1079 * File's blocksize is already larger than the 1080 * "recordsize" property. Only let it grow to 1081 * the next power of 2. 1082 */ 1083 ASSERT(!ISP2(zp->z_blksz)); 1084 new_blksz = MIN(end_size, 1085 1 << highbit64(zp->z_blksz)); 1086 } else { 1087 new_blksz = MIN(end_size, max_blksz); 1088 } 1089 zfs_grow_blocksize(zp, new_blksz, tx); 1090 zfs_range_reduce(rl, woff, n); 1091 } 1092 1093 /* 1094 * XXX - should we really limit each write to z_max_blksz? 1095 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 
1096 */ 1097 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 1098 1099 if (woff + nbytes > zp->z_size) 1100 vnode_pager_setsize(vp, woff + nbytes); 1101 1102 if (abuf == NULL) { 1103 tx_bytes = uio->uio_resid; 1104 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), 1105 uio, nbytes, tx); 1106 tx_bytes -= uio->uio_resid; 1107 } else { 1108 tx_bytes = nbytes; 1109 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); 1110 /* 1111 * If this is not a full block write, but we are 1112 * extending the file past EOF and this data starts 1113 * block-aligned, use assign_arcbuf(). Otherwise, 1114 * write via dmu_write(). 1115 */ 1116 if (tx_bytes < max_blksz && (!write_eof || 1117 aiov->iov_base != abuf->b_data)) { 1118 ASSERT(xuio); 1119 dmu_write(zfsvfs->z_os, zp->z_id, woff, 1120 aiov->iov_len, aiov->iov_base, tx); 1121 dmu_return_arcbuf(abuf); 1122 xuio_stat_wbuf_copied(); 1123 } else { 1124 ASSERT(xuio || tx_bytes == max_blksz); 1125 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), 1126 woff, abuf, tx); 1127 } 1128 ASSERT(tx_bytes <= uio->uio_resid); 1129 uioskip(uio, tx_bytes); 1130 } 1131 if (tx_bytes && vn_has_cached_data(vp)) { 1132 update_pages(vp, woff, tx_bytes, zfsvfs->z_os, 1133 zp->z_id, uio->uio_segflg, tx); 1134 } 1135 1136 /* 1137 * If we made no progress, we're done. If we made even 1138 * partial progress, update the znode and ZIL accordingly. 1139 */ 1140 if (tx_bytes == 0) { 1141 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 1142 (void *)&zp->z_size, sizeof (uint64_t), tx); 1143 dmu_tx_commit(tx); 1144 ASSERT(error != 0); 1145 break; 1146 } 1147 1148 /* 1149 * Clear Set-UID/Set-GID bits on successful write if not 1150 * privileged and at least one of the excute bits is set. 1151 * 1152 * It would be nice to to this after all writes have 1153 * been done, but that would still expose the ISUID/ISGID 1154 * to another app after the partial write is committed. 
1155 * 1156 * Note: we don't call zfs_fuid_map_id() here because 1157 * user 0 is not an ephemeral uid. 1158 */ 1159 mutex_enter(&zp->z_acl_lock); 1160 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | 1161 (S_IXUSR >> 6))) != 0 && 1162 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && 1163 secpolicy_vnode_setid_retain(vp, cr, 1164 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { 1165 uint64_t newmode; 1166 zp->z_mode &= ~(S_ISUID | S_ISGID); 1167 newmode = zp->z_mode; 1168 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), 1169 (void *)&newmode, sizeof (uint64_t), tx); 1170 } 1171 mutex_exit(&zp->z_acl_lock); 1172 1173 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 1174 B_TRUE); 1175 1176 /* 1177 * Update the file size (zp_size) if it has changed; 1178 * account for possible concurrent updates. 1179 */ 1180 while ((end_size = zp->z_size) < uio->uio_loffset) { 1181 (void) atomic_cas_64(&zp->z_size, end_size, 1182 uio->uio_loffset); 1183#ifdef illumos 1184 ASSERT(error == 0); 1185#else 1186 ASSERT(error == 0 || error == EFAULT); 1187#endif 1188 } 1189 /* 1190 * If we are replaying and eof is non zero then force 1191 * the file size to the specified eof. Note, there's no 1192 * concurrency during replay. 1193 */ 1194 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) 1195 zp->z_size = zfsvfs->z_replay_eof; 1196 1197 if (error == 0) 1198 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1199 else 1200 (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1201 1202 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 1203 dmu_tx_commit(tx); 1204 1205 if (error != 0) 1206 break; 1207 ASSERT(tx_bytes == nbytes); 1208 n -= nbytes; 1209 1210#ifdef illumos 1211 if (!xuio && n > 0) 1212 uio_prefaultpages(MIN(n, max_blksz), uio); 1213#endif 1214 } 1215 1216 zfs_range_unlock(rl); 1217 1218 /* 1219 * If we're in replay mode, or we made no progress, return error. 1220 * Otherwise, it's at least a partial write, so it's successful. 
*/
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef __FreeBSD__
	/*
	 * EFAULT means that at least one page of the source buffer was not
	 * available.  VFS will re-try remaining I/O upon this error.
	 */
	if (error == EFAULT) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
#endif

	/*
	 * Honor synchronous-write semantics: commit the intent log now if
	 * the caller requested a synchronous write (O_SYNC/O_DSYNC) or the
	 * dataset is configured with sync=always.
	 */
	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Completion callback for the dmu_sync() issued from zfs_get_data(); it is
 * also called directly by zfs_get_data() on its failure paths.  Releases the
 * dbuf (if held) and the range lock taken for the synchronous write, drops
 * the vnode reference, and frees the zgd descriptor.  On success the just
 * written block is recorded in the lwb so the ZIL can track it.
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	/* Only track the block if the write actually succeeded. */
	if (error == 0 && zgd->zgd_bp)
		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
/*
 * Debug knob: when set, the next indirect-write log fill in zfs_get_data()
 * fails with EIO (and the knob self-clears).  Used to exercise the ZIL
 * error paths.
 */
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
*/
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	/* zgd carries everything zfs_get_done() must release. */
	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			/* Blocksize stable while locked: lock is good. */
			if (zp->z_blksz == size)
				break;
			/* Raced with a blocksize change; retry. */
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			/*
			 * EALREADY: the block was already committed by the
			 * syncing context, so log it as a TX_WRITE2 (a
			 * reference to the synced data) instead.
			 */
			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	/* Failure (or immediate-write) path: release everything here. */
	zfs_get_done(zgd, error);

	return (error);
}

/*
 * Check access permissions on a vnode: ACE-style check if the caller
 * supplied ACE mask bits, otherwise a classic rwx-mode check.
 */
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * vn_vget_ino_gen() callback used by the ".." case of zfs_lookup_lock():
 * the target vnode is passed in as 'arg' and only needs to be locked.
 * On lock failure the reference is dropped, per the callback contract.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int error;

	*vpp = arg;
	error = vn_lock(*vpp, lkflags);
	if (error != 0)
		vrele(*vpp);
	return (error);
}

/*
 * Lock the vnode 'vp' found by a lookup in directory 'dvp', honoring the
 * lock-order rules for the three name shapes:
 *   ""/"."  - vp is dvp itself; upgrade/downgrade its existing lock,
 *   ".."    - vp is dvp's parent; use vn_vget_ino_gen() to relock safely,
 *   other   - plain child; just lock it.
 * On failure the reference on vp (or dvp for ".") is released.
 */
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relock for the "." case could leave us with
			 * reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 * IN:	dvp	- vnode of directory to search.
 *	nm	- name of entry to lookup.
 *	pnp	- full pathname to lookup [UNUSED].
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	rdir	- root directory vnode [UNUSED].
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 * RETURN:	0 on success, error code on failure.
1508 * 1509 * Timestamps: 1510 * NA 1511 */ 1512/* ARGSUSED */ 1513static int 1514zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, 1515 int nameiop, cred_t *cr, kthread_t *td, int flags) 1516{ 1517 znode_t *zdp = VTOZ(dvp); 1518 znode_t *zp; 1519 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1520 int error = 0; 1521 1522 /* 1523 * Fast path lookup, however we must skip DNLC lookup 1524 * for case folding or normalizing lookups because the 1525 * DNLC code only stores the passed in name. This means 1526 * creating 'a' and removing 'A' on a case insensitive 1527 * file system would work, but DNLC still thinks 'a' 1528 * exists and won't let you create it again on the next 1529 * pass through fast path. 1530 */ 1531 if (!(flags & LOOKUP_XATTR)) { 1532 if (dvp->v_type != VDIR) { 1533 return (SET_ERROR(ENOTDIR)); 1534 } else if (zdp->z_sa_hdl == NULL) { 1535 return (SET_ERROR(EIO)); 1536 } 1537 } 1538 1539 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); 1540 1541 ZFS_ENTER(zfsvfs); 1542 ZFS_VERIFY_ZP(zdp); 1543 1544 *vpp = NULL; 1545 1546 if (flags & LOOKUP_XATTR) { 1547#ifdef TODO 1548 /* 1549 * If the xattr property is off, refuse the lookup request. 1550 */ 1551 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1552 ZFS_EXIT(zfsvfs); 1553 return (SET_ERROR(EINVAL)); 1554 } 1555#endif 1556 1557 /* 1558 * We don't allow recursive attributes.. 1559 * Maybe someday we will. 1560 */ 1561 if (zdp->z_pflags & ZFS_XATTR) { 1562 ZFS_EXIT(zfsvfs); 1563 return (SET_ERROR(EINVAL)); 1564 } 1565 1566 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1567 ZFS_EXIT(zfsvfs); 1568 return (error); 1569 } 1570 1571 /* 1572 * Do we have permission to get into attribute directory? 1573 */ 1574 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1575 B_FALSE, cr)) { 1576 vrele(*vpp); 1577 *vpp = NULL; 1578 } 1579 1580 ZFS_EXIT(zfsvfs); 1581 return (error); 1582 } 1583 1584 /* 1585 * Check accessibility of directory. 
1586 */ 1587 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1588 ZFS_EXIT(zfsvfs); 1589 return (error); 1590 } 1591 1592 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1593 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1594 ZFS_EXIT(zfsvfs); 1595 return (SET_ERROR(EILSEQ)); 1596 } 1597 1598 1599 /* 1600 * First handle the special cases. 1601 */ 1602 if ((cnp->cn_flags & ISDOTDOT) != 0) { 1603 /* 1604 * If we are a snapshot mounted under .zfs, return 1605 * the vp for the snapshot directory. 1606 */ 1607 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { 1608 struct componentname cn; 1609 vnode_t *zfsctl_vp; 1610 int ltype; 1611 1612 ZFS_EXIT(zfsvfs); 1613 ltype = VOP_ISLOCKED(dvp); 1614 VOP_UNLOCK(dvp, 0); 1615 error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, 1616 &zfsctl_vp); 1617 if (error == 0) { 1618 cn.cn_nameptr = "snapshot"; 1619 cn.cn_namelen = strlen(cn.cn_nameptr); 1620 cn.cn_nameiop = cnp->cn_nameiop; 1621 cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; 1622 cn.cn_lkflags = cnp->cn_lkflags; 1623 error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); 1624 vput(zfsctl_vp); 1625 } 1626 vn_lock(dvp, ltype | LK_RETRY); 1627 return (error); 1628 } 1629 } 1630 if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { 1631 ZFS_EXIT(zfsvfs); 1632 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) 1633 return (SET_ERROR(ENOTSUP)); 1634 error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); 1635 return (error); 1636 } 1637 1638 /* 1639 * The loop is retry the lookup if the parent-child relationship 1640 * changes during the dot-dot locking complexities. 1641 */ 1642 for (;;) { 1643 uint64_t parent; 1644 1645 error = zfs_dirlook(zdp, nm, &zp); 1646 if (error == 0) 1647 *vpp = ZTOV(zp); 1648 1649 ZFS_EXIT(zfsvfs); 1650 if (error != 0) 1651 break; 1652 1653 error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); 1654 if (error != 0) { 1655 /* 1656 * If we've got a locking error, then the vnode 1657 * got reclaimed because of a force unmount. 
1658 * We never enter doomed vnodes into the name cache. 1659 */ 1660 *vpp = NULL; 1661 return (error); 1662 } 1663 1664 if ((cnp->cn_flags & ISDOTDOT) == 0) 1665 break; 1666 1667 ZFS_ENTER(zfsvfs); 1668 if (zdp->z_sa_hdl == NULL) { 1669 error = SET_ERROR(EIO); 1670 } else { 1671 error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1672 &parent, sizeof (parent)); 1673 } 1674 if (error != 0) { 1675 ZFS_EXIT(zfsvfs); 1676 vput(ZTOV(zp)); 1677 break; 1678 } 1679 if (zp->z_id == parent) { 1680 ZFS_EXIT(zfsvfs); 1681 break; 1682 } 1683 vput(ZTOV(zp)); 1684 } 1685 1686out: 1687 if (error != 0) 1688 *vpp = NULL; 1689 1690 /* Translate errors and add SAVENAME when needed. */ 1691 if (cnp->cn_flags & ISLASTCN) { 1692 switch (nameiop) { 1693 case CREATE: 1694 case RENAME: 1695 if (error == ENOENT) { 1696 error = EJUSTRETURN; 1697 cnp->cn_flags |= SAVENAME; 1698 break; 1699 } 1700 /* FALLTHROUGH */ 1701 case DELETE: 1702 if (error == 0) 1703 cnp->cn_flags |= SAVENAME; 1704 break; 1705 } 1706 } 1707 1708 /* Insert name into cache (as non-existent) if appropriate. */ 1709 if (zfsvfs->z_use_namecache && 1710 error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) 1711 cache_enter(dvp, NULL, cnp); 1712 1713 /* Insert name into cache if appropriate. */ 1714 if (zfsvfs->z_use_namecache && 1715 error == 0 && (cnp->cn_flags & MAKEENTRY)) { 1716 if (!(cnp->cn_flags & ISLASTCN) || 1717 (nameiop != DELETE && nameiop != RENAME)) { 1718 cache_enter(dvp, *vpp, cnp); 1719 } 1720 } 1721 1722 return (error); 1723} 1724 1725/* 1726 * Attempt to create a new entry in a directory. If the entry 1727 * already exists, truncate the file if permissible, else return 1728 * an error. Return the vp of the created or trunc'd file. 1729 * 1730 * IN: dvp - vnode of directory to put new file entry in. 1731 * name - name of new file entry. 1732 * vap - attributes of new file. 1733 * excl - flag indicating exclusive or non-exclusive mode. 1734 * mode - mode to open file with. 
*	cr	- credentials of caller.
 *	flag	- large file flag [UNUSED].
 *	ct	- caller context
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created or trunc'd entry.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	void		*vsecp = NULL;	/* no ACL passed on this platform */
	int		flag = 0;
	uint64_t	txtype;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	*vpp = NULL;

	/* Strip the sticky bit if the caller may not set it. */
	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	/* ZNEW: fail with EEXIST if the name is already present. */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		goto out;
	}

	/*
	 * We only support the creation of regular files in
	 * extended attribute directories.
	 */

	if ((dzp->z_pflags & ZFS_XATTR) &&
	    (vap->va_type != VREG)) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids)) != 0)
		goto out;

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	getnewvnode_reserve(1);

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
	    vsecp, acl_ids.z_fuidp, vap);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

out:
	if (error == 0) {
		*vpp = ZTOV(zp);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 * IN:	dvp	- vnode of directory to remove entry from.
 *	name	- name of entry to remove.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */

/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	znode_t		*xzp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	/* NOTE(review): acl_obj and toobig appear unused in this revision. */
	uint64_t	acl_obj, xattr_obj;
	uint64_t	obj = 0;
	dmu_tx_t	*tx;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;
	zp = VTOZ(vp);	/* NOTE(review): redundant; zp already == VTOZ(vp) */

	xattr_obj = 0;
	xzp = NULL;

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not a parameter of this function; this only
	 * compiles if vnevent_remove() is a macro ignoring it — confirm.
	 */
	vnevent_remove(vp, dvp, name, ct);

	obj = zp->z_id;

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
	}

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);

	if (xzp) {
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Last link went away but the vnode is still held: park the
		 * znode in the unlinked set and stop syncing its pages.
		 */
		zfs_unlinked_add(zp, tx);
		vp->v_vflag |= VV_NOSYNC;
	}

	txtype = TX_REMOVE;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
out:

	if (xzp)
		vrele(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 * IN:	dvp	- vnode of directory to add subdir to.
 *	dirname	- name of new directory.
 *	vap	- attributes of new directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created directory.
 *
 * RETURN:	0 on success, error code on failure.
*
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    ((vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Directories may not be created inside an xattr directory. */
	if (dzp->z_pflags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
	*vpp = NULL;

	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 * IN:	dvp	- vnode of directory to remove from.
 *	name	- name of directory to be removed.
 *	cwd	- vnode of current working directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;


	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not a parameter of this function; this only
	 * compiles if vnevent_rmdir() is a macro ignoring it — confirm.
	 */
	vnevent_rmdir(vp, dvp, name, ct);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Drop stale name-cache entries under the parent before unlink. */
	cache_purge(dvp);

	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
	}

	dmu_tx_commit(tx);

	cache_purge(vp);
out:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 * IN:	vp	- vnode of directory to read.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *	eofp	- set to true if end-of-file detected.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies,
    u_long **cookies)
{
	znode_t *zp = VTOZ(vp);
	iovec_t *iovp;
	edirent_t *eodp;
	dirent64_t *odp;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os;
	caddr_t outbuf;
	size_t bufsize;
	zap_cursor_t zc;
	zap_attribute_t zap;
	uint_t bytes_wanted;
	uint64_t offset; /* must be unsigned; checks for < 1 */
	uint64_t parent;
	int local_eof;
	int outcount;
	int error;
	uint8_t prefetch;
	boolean_t check_sysattrs;
	uint8_t type;
	int ncooks;
	u_long *cooks = NULL;
	/*
	 * NOTE(review): flags is always 0 in this function, so the
	 * V_RDDIR_ENTFLAGS and V_RDDIR_ACCFILTER branches below are
	 * currently dead code kept for parity with the illumos sources.
	 */
	int flags = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* The parent object number backs the synthetic ".." entry below. */
	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.  Offsets 0-2 are reserved for
	 * the synthetic ".", ".." and ".zfs" entries (see the comment
	 * above this function), so any offset <= 3 restarts the ZAP walk
	 * from the beginning; anything larger is a serialized ZAP cursor
	 * returned by a previous call.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 * A single kernel-space iovec can be filled in place; any other
	 * layout is staged in a temporary buffer and uiomove()d out below.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		outbuf = NULL;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file
		 * name, which bounds how many entries (and hence seek
		 * cookies) this request can possibly return.
		 */
		ncooks = uio->uio_resid / (sizeof(struct dirent) -
		    sizeof(((struct dirent *)NULL)->d_name) + 1);
		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
		*cookies = cooks;
		*ncookies = ncooks;
	}
	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
#ifdef TODO
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);
#else
	check_sysattrs = 0;
#endif

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next = NULL;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				/* ENOENT from the cursor means end of dir. */
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);

			if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
#else
				panic("%s:%u: TODO", __func__, __LINE__);
#endif
			}
		}

		if (flags & V_RDDIR_ACCFILTER) {
			/*
			 * If we have no access at all, don't include
			 * this entry in the returned information
			 */
			znode_t	*ezp;
			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
				goto skip_entry;
			if (!zfs_has_access(ezp, cr)) {
				vrele(ZTOV(ezp));
				goto skip_entry;
			}
			vrele(ZTOV(ezp));
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = SET_ERROR(EINVAL);
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			odp->d_namlen = strlen(zap.za_name);
			(void) strlcpy(odp->d_name, zap.za_name,
			    odp->d_namlen + 1);
			odp->d_type = type;
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0, 0,
			    ZIO_PRIORITY_SYNC_READ);

	skip_entry:
		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		/* Record the seek cookie for the entry just emitted. */
		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		/* Entries were written straight into the caller's iovec. */
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	/* On error the caller must not see any cookies; release them. */
	if (error != 0 && cookies != NULL) {
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}

/*
 * Handed to the ZIL through the zfs_fsyncer_key TSD slot below.
 * NOTE(review): the consumer of zfs_fsyncer_key lives in the ZIL code,
 * not visible in this file — confirm semantics there.
 */
ulong_t zfs_fsync_sync_cnt = 4;

/*
 * Push the file's outstanding intent-log records to stable storage.
 * Skips the commit entirely when sync is disabled on the dataset.
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/* Mark this thread as an fsync()er for the ZIL (see above). */
	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		zil_commit(zfsvfs->z_log, zp->z_id);
		ZFS_EXIT(zfsvfs);
	}
	return (0);
}


/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 * IN:	vp - vnode of file.
 *	vap - va_mask identifies requested attributes.
 *		If AT_XVATTR set, then optional attrs are requested
 *	flags - ATTR_NOACLCHECK (CIFS server context)
 *	cr - credentials of caller.
 *	ct - caller context
 *
 * OUT:	vap - attribute values.
 *
 * RETURN:	0 (always succeeds).
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int	error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t links;
	uint64_t mtime[2], ctime[2], crtime[2], rdev;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	sa_bulk_attr_t bulk[4];
	int count = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

	/*
	 * Fetch mtime/ctime/crtime (and rdev for device nodes) from the
	 * system attributes in one bulk lookup; bulk[4] has room for all
	 * four entries.
	 */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
		    &rdev, 8);

	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
	    (vap->va_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	vap->va_type = IFTOVT(zp->z_mode);
	vap->va_mode = zp->z_mode & ~S_IFMT;
#ifdef illumos
	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
#else
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
#endif
	vap->va_nodeid = zp->z_id;
	/* The visible ".zfs" control dir adds a link to the fs root. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = zp->z_links + 1;
	else
		links = zp->z_links;
	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
	vap->va_size = zp->z_size;
#ifdef illumos
	vap->va_rdev = vp->v_rdev;
#else
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		vap->va_rdev = zfs_cmpldev(rdev);
#endif
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
	vap->va_filerev = zp->z_seq;

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 * Each xoa_* value is derived from the corresponding ZFS_* bit in
	 * zp->z_pflags.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((zp->z_pflags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((zp->z_pflags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		/* Scanstamps only exist on regular files. */
		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG) {
			zfs_sa_get_scanstamp(zp, xvap);
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_REPARSE);
		}
		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
			xoap->xoa_generation = zp->z_gen;
			XVA_SET_RTN(xvap, XAT_GEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
			xoap->xoa_offline =
			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
			XVA_SET_RTN(xvap, XAT_OFFLINE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
			xoap->xoa_sparse =
			    ((zp->z_pflags & ZFS_SPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_SPARSE);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);


	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 * IN:	vp - vnode of file to be modified.
 *	vap - new attribute values.
 *		If AT_XVATTR set, then optional attrs are being set
 *	flags - ATTR_UTIME set if non-default time values provided.
 *		- ATTR_NOACLCHECK (CIFS context only).
 *	cr - credentials of caller.
 *	ct - caller context
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime updated, mtime updated if size changed.
 */
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog;
	dmu_tx_t *tx;
	vattr_t oldva;
	xvattr_t tmpxvattr;
	uint_t mask = vap->va_mask;
	uint_t saved_mask = 0;
	uint64_t saved_mode;
	int trim_mask = 0;
	uint64_t new_mode;
	uint64_t new_uid, new_gid;
	uint64_t xattr_obj;
	uint64_t mtime[2], ctime[2];
	znode_t *attrzp;	/* extended attribute dir, if any */
	int need_policy = FALSE;
	int err, err2;
	zfs_fuid_info_t *fuidp = NULL;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap;
	zfs_acl_t *aclp;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	boolean_t fuid_dirtied = B_FALSE;
	sa_bulk_attr_t bulk[7], xattr_bulk[7];
	int count = 0, xattr_count = 0;

	if (mask == 0)
		return (0);

	if (mask & AT_NOSET)
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zilog = zfsvfs->z_log;

	/*
	 * Make sure that if we have ephemeral uid/gid or xvattr specified
	 * that file system is at proper version level
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
	    (mask & AT_XVATTR))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	if (mask & AT_SIZE && vp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EISDIR));
	}

	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * If this is an xvattr_t, then get a pointer to the structure of
	 * optional attributes.  If this is NULL, then we have a vattr_t.
	 */
	xoap = xva_getxoptattr(xvap);

	xva_init(&tmpxvattr);

	/*
	 * Immutable files can only alter immutable bit and atime
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
	 */

	/*
	 * Verify timestamps doesn't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32bit syscalls can't
	 * handle times greater than 2039.  This check should be removed
	 * once large timestamps are fully supported.
	 */
	if (mask & (AT_ATIME | AT_MTIME)) {
		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EOVERFLOW));
		}
	}
	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
	    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EOVERFLOW));
	}

	attrzp = NULL;
	aclp = NULL;

	/* Can this be moved to before the top label? */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * First validate permissions
	 */

	if (mask & AT_SIZE) {
		/*
		 * XXX - Note, we are not providing any open
		 * mode flags here (like FNDELAY), so we may
		 * block if there are locks present... this
		 * should be addressed in openat().
		 */
		/* XXX - would it be OK to generate a log record here? */
		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	if (mask & (AT_ATIME|AT_MTIME) ||
	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
		    skipaclchk, cr);
	}

	if (mask & (AT_UID|AT_GID)) {
		int	idmask = (mask & (AT_UID|AT_GID));
		int	take_owner;
		int	take_group;

		/*
		 * NOTE: even if a new mode is being set,
		 * we may clear S_ISUID/S_ISGID bits.
		 */

		if (!(mask & AT_MODE))
			vap->va_mode = zp->z_mode;

		/*
		 * Take ownership or chgrp to group we are a member of
		 */

		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
		take_group = (mask & AT_GID) &&
		    zfs_groupmember(zfsvfs, vap->va_gid, cr);

		/*
		 * If both AT_UID and AT_GID are set then take_owner and
		 * take_group must both be set in order to allow taking
		 * ownership.
		 *
		 * Otherwise, send the check through secpolicy_vnode_setattr()
		 *
		 */

		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
		    ((idmask == AT_UID) && take_owner) ||
		    ((idmask == AT_GID) && take_group)) {
			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
			    skipaclchk, cr) == 0) {
				/*
				 * Remove setuid/setgid for non-privileged users
				 */
				secpolicy_setid_clear(vap, vp, cr);
				trim_mask = (mask & (AT_UID|AT_GID));
			} else {
				need_policy = TRUE;
			}
		} else {
			need_policy = TRUE;
		}
	}

	oldva.va_mode = zp->z_mode;
	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
	if (mask & AT_XVATTR) {
		/*
		 * Update xvattr mask to include only those attributes
		 * that are actually changing.
		 *
		 * the bits will be restored prior to actually setting
		 * the attributes so the caller thinks they were set.
		 */
		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			if (xoap->xoa_appendonly !=
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			if (xoap->xoa_nounlink !=
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			if (xoap->xoa_immutable !=
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			if (xoap->xoa_nodump !=
			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_NODUMP);
				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			if (xoap->xoa_av_modified !=
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			if ((vp->v_type != VREG &&
			    xoap->xoa_av_quarantined) ||
			    xoap->xoa_av_quarantined !=
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
			}
		}

		/* The reparse flag can never be changed via setattr. */
		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EPERM));
		}

		if (need_policy == FALSE &&
		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
			need_policy = TRUE;
		}
	}

	if (mask & AT_MODE) {
		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
			err = secpolicy_setid_setsticky_clear(vp, vap,
			    &oldva, cr);
			if (err) {
				ZFS_EXIT(zfsvfs);
				return (err);
			}
			trim_mask |= AT_MODE;
		} else {
			need_policy = TRUE;
		}
	}

	if (need_policy) {
		/*
		 * If trim_mask is set then take ownership
		 * has been granted or write_acl is present and user
		 * has the ability to modify mode.  In that case remove
		 * UID|GID and or MODE from mask so that
		 * secpolicy_vnode_setattr() doesn't revoke it.
		 */

		if (trim_mask) {
			saved_mask = vap->va_mask;
			vap->va_mask &= ~trim_mask;
			if (trim_mask & AT_MODE) {
				/*
				 * Save the mode, as secpolicy_vnode_setattr()
				 * will overwrite it with ova.va_mode.
				 */
				saved_mode = vap->va_mode;
			}
		}
		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}

		if (trim_mask) {
			vap->va_mask |= saved_mask;
			if (trim_mask & AT_MODE) {
				/*
				 * Recover the mode after
				 * secpolicy_vnode_setattr().
				 */
				vap->va_mode = saved_mode;
			}
		}
	}

	/*
	 * secpolicy_vnode_setattr, or take ownership may have
	 * changed va_mask
	 */
	mask = vap->va_mask;

	/*
	 * On a chown/chgrp, also locate and lock the extended attribute
	 * directory (if any) so its ownership is updated in lockstep,
	 * and enforce user/group quotas before committing to anything.
	 */
	if ((mask & (AT_UID | AT_GID))) {
		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj, sizeof (xattr_obj));

		if (err == 0 && xattr_obj) {
			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
			if (err == 0) {
				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
				if (err != 0)
					vrele(ZTOV(attrzp));
			}
			if (err)
				goto out2;
		}
		if (mask & AT_UID) {
			new_uid = zfs_fuid_create(zfsvfs,
			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
			if (new_uid != zp->z_uid &&
			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
				if (attrzp)
					vput(ZTOV(attrzp));
				err = SET_ERROR(EDQUOT);
				goto out2;
			}
		}

		if (mask & AT_GID) {
			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
			    cr, ZFS_GROUP, &fuidp);
			if (new_gid != zp->z_gid &&
			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
				if (attrzp)
					vput(ZTOV(attrzp));
				err = SET_ERROR(EDQUOT);
				goto out2;
			}
		}
	}
	/* Build the DMU transaction holds for everything we may touch. */
	tx = dmu_tx_create(zfsvfs->z_os);

	if (mask & AT_MODE) {
		uint64_t pmode = zp->z_mode;
		uint64_t acl_obj;
		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
			err = SET_ERROR(EPERM);
			goto out;
		}

		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
			goto out;

		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
			/*
			 * Are we upgrading ACL from old V0 format
			 * to V1 format?
			 */
			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
			    zfs_znode_acl_version(zp) ==
			    ZFS_ACL_VERSION_INITIAL) {
				dmu_tx_hold_free(tx, acl_obj, 0,
				    DMU_OBJECT_END);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
				    0, aclp->z_acl_bytes);
			} else {
				dmu_tx_hold_write(tx, acl_obj, 0,
				    aclp->z_acl_bytes);
			}
		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, aclp->z_acl_bytes);
		}
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
	} else {
		if ((mask & AT_XVATTR) &&
		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		else
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	}

	if (attrzp) {
		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
	}

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);

	zfs_sa_upgrade_txholds(tx, zp);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err)
		goto out;

	count = 0;
	/*
	 * Set each attribute requested.
	 * We group settings according to the locks they need to acquire.
	 *
	 * Note: you cannot set ctime directly, although it will be
	 * updated as a side-effect of calling this function.
	 */

	if (mask & (AT_UID|AT_GID|AT_MODE))
		mutex_enter(&zp->z_acl_lock);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));

	if (attrzp) {
		if (mask & (AT_UID|AT_GID|AT_MODE))
			mutex_enter(&attrzp->z_acl_lock);
		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
		    sizeof (attrzp->z_pflags));
	}

	if (mask & (AT_UID|AT_GID)) {

		if (mask & AT_UID) {
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
			    &new_uid, sizeof (new_uid));
			zp->z_uid = new_uid;
			if (attrzp) {
				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
				    sizeof (new_uid));
				attrzp->z_uid = new_uid;
			}
		}

		if (mask & AT_GID) {
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
			    NULL, &new_gid, sizeof (new_gid));
			zp->z_gid = new_gid;
			if (attrzp) {
				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
				    sizeof (new_gid));
				attrzp->z_gid = new_gid;
			}
		}
		if (!(mask & AT_MODE)) {
			/*
			 * NOTE(review): &new_mode is registered before
			 * new_mode is assigned; sa_bulk_update() reads it
			 * later, so the value written is the one current
			 * at commit time — confirm against
			 * zfs_acl_chown_setattr() behavior.
			 */
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
			    NULL, &new_mode, sizeof (new_mode));
			new_mode = zp->z_mode;
		}
		err = zfs_acl_chown_setattr(zp);
		ASSERT(err == 0);
		if (attrzp) {
			err = zfs_acl_chown_setattr(attrzp);
			ASSERT(err == 0);
		}
	}

	if (mask & AT_MODE) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
		    &new_mode, sizeof (new_mode));
		zp->z_mode = new_mode;
		/* aclp was populated by zfs_acl_chmod_setattr() above. */
		ASSERT3U((uintptr_t)aclp, !=, 0);
		err = zfs_aclset_common(zp, aclp, cr, tx);
		ASSERT0(err);
		if (zp->z_acl_cached)
			zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = aclp;
		aclp = NULL;
	}


	if (mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
		    &zp->z_atime, sizeof (zp->z_atime));
	}

	if (mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    mtime, sizeof (mtime));
	}

	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
		    NULL, mtime, sizeof (mtime));
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
	} else if (mask != 0) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
		    B_TRUE);
		if (attrzp) {
			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
			    SA_ZPL_CTIME(zfsvfs), NULL,
			    &ctime, sizeof (ctime));
			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
			    mtime, ctime, B_TRUE);
		}
	}
	/*
	 * Do this after setting timestamps to prevent timestamp
	 * update from toggling bit
	 */

	if (xoap && (mask & AT_XVATTR)) {

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
			xoap->xoa_createtime = vap->va_birthtime;
		/*
		 * restore trimmed off masks
		 * so that return masks can be set for caller.
		 */

		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
			XVA_SET_REQ(xvap, XAT_APPENDONLY);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
			XVA_SET_REQ(xvap, XAT_NOUNLINK);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
			XVA_SET_REQ(xvap, XAT_NODUMP);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
			ASSERT(vp->v_type == VREG);

		zfs_xvattr_set(zp, xvap, tx);
	}

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	if (mask != 0)
		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

	if (mask & (AT_UID|AT_GID|AT_MODE))
		mutex_exit(&zp->z_acl_lock);

	if (attrzp) {
		if (mask & (AT_UID|AT_GID|AT_MODE))
			mutex_exit(&attrzp->z_acl_lock);
	}
out:
	/* Flush the xattr dir's pending SA updates in the same tx. */
	if (err == 0 && attrzp) {
		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
		    xattr_count, tx);
		ASSERT(err2 == 0);
	}

	if (attrzp)
		vput(ZTOV(attrzp));

	if (aclp)
		zfs_acl_free(aclp);

	if (fuidp) {
		zfs_fuid_info_free(fuidp);
		fuidp = NULL;
	}

	/* Abort the tx on error; otherwise push the SA changes and commit. */
	if (err) {
		dmu_tx_abort(tx);
	} else {
		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		dmu_tx_commit(tx);
	}

out2:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (err);
}

/*
 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
 * fail to acquire any lock in the path we will drop all held locks,
 * acquire the new lock in a blocking fashion, and then release it and
 * restart the rename.
 * This acquire/release step ensures that we do not
 * spin on a lock waiting for release.  On error release all vnode locks
 * and decrement references the way tmpfs_rename() would do.
 */
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
	zfsvfs_t	*zfsvfs;
	struct vnode	*nvp, *svp, *tvp;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	const char	*snm = scnp->cn_nameptr;
	const char	*tnm = tcnp->cn_nameptr;
	int error;

	/* Start from a clean slate: only sdvp stays locked by the caller. */
	VOP_UNLOCK(tdvp, 0);
	if (*tvpp != NULL && *tvpp != tdvp)
		VOP_UNLOCK(*tvpp, 0);

relock:
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
		goto out;
	sdzp = VTOZ(sdvp);

	/* Try the target directory non-blocking to avoid lock-order deadlock. */
	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		if (error != EBUSY)
			goto out;
		/*
		 * Wait for the contended lock, then immediately drop it
		 * and restart so both locks are re-taken in order.
		 */
		error = vn_lock(tdvp, LK_EXCLUSIVE);
		if (error)
			goto out;
		VOP_UNLOCK(tdvp, 0);
		goto relock;
	}
	tdzp = VTOZ(tdvp);

	/*
	 * Before using sdzp and tdzp we must ensure that they are live.
	 * As a porting legacy from illumos we have two things to worry
	 * about.  One is typical for FreeBSD and it is that the vnode is
	 * not reclaimed (doomed).  The other is that the znode is live.
	 * The current code can invalidate the znode without acquiring the
	 * corresponding vnode lock if the object represented by the znode
	 * and vnode is no longer valid after a rollback or receive operation.
	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
	 * that protects the znodes from the invalidation.
	 */
	zfsvfs = sdzp->z_zfsvfs;
	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
	ZFS_ENTER(zfsvfs);

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		error = SET_ERROR(EIO);
		goto out;
	}

	/*
	 * Re-resolve svp to be certain it still exists and fetch the
	 * correct vnode.
	 */
	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
	if (error != 0) {
		/* Source entry invalid or not there. */
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		/* Renaming "." or ".." is always invalid. */
		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
			error = SET_ERROR(EINVAL);
		goto out;
	}
	svp = ZTOV(szp);

	/*
	 * Re-resolve tvp, if it disappeared we just carry on.
	 */
	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		vrele(svp);
		if ((tcnp->cn_flags & ISDOTDOT) != 0)
			error = SET_ERROR(EINVAL);
		goto out;
	}
	if (tzp != NULL)
		tvp = ZTOV(tzp);
	else
		tvp = NULL;

	/*
	 * At present the vnode locks must be acquired before z_teardown_lock,
	 * although it would be more logical to use the opposite order.
	 */
	ZFS_EXIT(zfsvfs);

	/*
	 * Now try acquire locks on svp and tvp.
	 */
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		/* Drop everything, wait for the lock, then retry the dance. */
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if (tvp != NULL)
			vrele(tvp);
		if (error != EBUSY) {
			vrele(nvp);
			goto out;
		}
		error = vn_lock(nvp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(nvp);
			goto out;
		}
		VOP_UNLOCK(nvp, 0);
		/*
		 * Concurrent rename race.
		 * XXX ?
3610 */ 3611 if (nvp == tdvp) { 3612 vrele(nvp); 3613 error = SET_ERROR(EINVAL); 3614 goto out; 3615 } 3616 vrele(*svpp); 3617 *svpp = nvp; 3618 goto relock; 3619 } 3620 vrele(*svpp); 3621 *svpp = nvp; 3622 3623 if (*tvpp != NULL) 3624 vrele(*tvpp); 3625 *tvpp = NULL; 3626 if (tvp != NULL) { 3627 nvp = tvp; 3628 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); 3629 if (error != 0) { 3630 VOP_UNLOCK(sdvp, 0); 3631 VOP_UNLOCK(tdvp, 0); 3632 VOP_UNLOCK(*svpp, 0); 3633 if (error != EBUSY) { 3634 vrele(nvp); 3635 goto out; 3636 } 3637 error = vn_lock(nvp, LK_EXCLUSIVE); 3638 if (error != 0) { 3639 vrele(nvp); 3640 goto out; 3641 } 3642 vput(nvp); 3643 goto relock; 3644 } 3645 *tvpp = nvp; 3646 } 3647 3648 return (0); 3649 3650out: 3651 return (error); 3652} 3653 3654/* 3655 * Note that we must use VRELE_ASYNC in this function as it walks 3656 * up the directory tree and vrele may need to acquire an exclusive 3657 * lock if a last reference to a vnode is dropped. 3658 */ 3659static int 3660zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) 3661{ 3662 zfsvfs_t *zfsvfs; 3663 znode_t *zp, *zp1; 3664 uint64_t parent; 3665 int error; 3666 3667 zfsvfs = tdzp->z_zfsvfs; 3668 if (tdzp == szp) 3669 return (SET_ERROR(EINVAL)); 3670 if (tdzp == sdzp) 3671 return (0); 3672 if (tdzp->z_id == zfsvfs->z_root) 3673 return (0); 3674 zp = tdzp; 3675 for (;;) { 3676 ASSERT(!zp->z_unlinked); 3677 if ((error = sa_lookup(zp->z_sa_hdl, 3678 SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) 3679 break; 3680 3681 if (parent == szp->z_id) { 3682 error = SET_ERROR(EINVAL); 3683 break; 3684 } 3685 if (parent == zfsvfs->z_root) 3686 break; 3687 if (parent == sdzp->z_id) 3688 break; 3689 3690 error = zfs_zget(zfsvfs, parent, &zp1); 3691 if (error != 0) 3692 break; 3693 3694 if (zp != tdzp) 3695 VN_RELE_ASYNC(ZTOV(zp), 3696 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); 3697 zp = zp1; 3698 } 3699 3700 if (error == ENOTDIR) 3701 panic("checkpath: .. 
not a directory\n");
	if (zp != tdzp)
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
	return (error);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdvp	- Source directory containing the "old entry".
 *		snm	- Old entry name.
 *		tdvp	- Target directory to contain the "new entry".
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	zilog_t		*zilog = NULL;
	dmu_tx_t	*tx;
	char		*snm = scnp->cn_nameptr;
	char		*tnm = tcnp->cn_nameptr;
	int		error = 0;

	/* Reject renames across filesystems. */
	if ((*svpp)->v_mount != tdvp->v_mount ||
	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	if (zfsctl_is_node(tdvp)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Lock all four vnodes to ensure safety and semantics of renaming.
	 */
	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
	if (error != 0) {
		/* no vnodes are locked in the case of error here */
		return (error);
	}

	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);
	zfsvfs = tdzp->z_zfsvfs;
	zilog = zfsvfs->z_log;

	/*
	 * After we re-enter ZFS_ENTER() we will have to revalidate all
	 * znodes involved.
	 */
	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		error = SET_ERROR(EILSEQ);
		goto unlockout;
	}

	/* If source and target are the same file, there is nothing to do. */
	if ((*svpp) == (*tvpp)) {
		error = 0;
		goto unlockout;
	}

	/* Never rename a mount point (in either direction). */
	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
	    (*tvpp)->v_mountedhere != NULL)) {
		error = SET_ERROR(EXDEV);
		goto unlockout;
	}

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	szp = VTOZ(*svpp);
	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		error = SET_ERROR(EINVAL);
		goto unlockout;
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */
	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto unlockout;

	if ((*svpp)->v_type == VDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */
		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
		    sdzp == szp ||
		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
			error = EINVAL;
			goto unlockout;
		}

		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_check(szp, sdzp, tdzp))
			goto unlockout;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if ((*svpp)->v_type == VDIR) {
			if ((*tvpp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto unlockout;
			} else {
				cache_purge(tdvp);
				if (sdvp != tdvp)
					cache_purge(sdvp);
			}
		} else {
			if ((*tvpp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto unlockout;
			}
		}
	}

	/*
	 * NOTE(review): 'ct' is not a parameter of this function; the
	 * vnevent_*() invocations below presumably expand to no-ops on
	 * FreeBSD (otherwise this would not compile) -- confirm before
	 * reusing 'ct' here.
	 */
	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
	if (tzp)
		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	/*
	 * A single transaction covers both directory ZAP updates and every
	 * affected znode (source, target, both directories, unlinked set).
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto unlockout;
	}


	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);

	if (error == 0) {
		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
			    NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
				    snm, tdzp, tnm, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
		if (error == 0) {
			cache_purge(*svpp);
			if (*tvpp != NULL)
				cache_purge(*tvpp);
			cache_purge_negative(tdvp);
		}
	}

	dmu_tx_commit(tx);

unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(*svpp, 0);
	VOP_UNLOCK(sdvp, 0);

out:				/* original two vnodes are locked */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	if (*tvpp != NULL)
		VOP_UNLOCK(*tvpp, 0);
	if (tdvp != *tvpp)
		VOP_UNLOCK(tdvp, 0);
	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 *	IN:	dvp	- Directory to contain new symbolic link.
 *		link	- Name for new symlink entry.
 *		vap	- Attributes of new entry.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
    cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	len = strlen(link);
	int		error;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	uint64_t	txtype = TX_SYMLINK;
	int		flags = 0;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Reject names that are not valid UTF-8 when the fs requires it. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENAMETOOLONG));
	}

	if ((error = zfs_acl_ids_create(dzp, 0,
	    vap, cr, NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * The vnode reservation is paired with getnewvnode_drop_reserve()
	 * on the tx-assign failure path below and after the tx commits.
	 */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * for version 4 ZPL datsets the symlink will be an SA attribute
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);

	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);

	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
	*vpp = ZTOV(zp);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 *	IN:	vp	- vnode of symbolic link.
 *		uio	- structure to contain the link path.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- structure containing the link path.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * SA-backed znodes keep the link target in the SA_ZPL_SYMLINK
	 * attribute; older znodes go through zfs_sa_readlink().
	 */
	if (zp->z_is_sa)
		error = sa_lookup_uio(zp->z_sa_hdl,
		    SA_ZPL_SYMLINK(zfsvfs), uio);
	else
		error = zfs_sa_readlink(zp, uio);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 *	IN:	tdvp	- Directory to contain new entry.
 *		svp	- vnode of new entry.
 *		name	- name of new entry.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	tdvp - ctime|mtime updated
 *	 svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;
	uint64_t	parent;
	uid_t		owner;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	szp = VTOZ(svp);
	ZFS_VERIFY_ZP(szp);

	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/*
	 * u8_validate() stores into 'error' as a side effect; only the
	 * EILSEQ result is actually returned to the caller.
	 */
	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}


	/*
	 * Only the file's owner may create a new link to it unless the
	 * caller passes the secpolicy_basic_link() privilege check.
	 */
	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, dzp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dzp, name, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
	}

	dmu_tx_commit(tx);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}


/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	if (zp->z_unlinked) {
		/*
		 * Fast path to recycle a vnode of a removed file.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	/* Flush a deferred atime update before the vnode goes inactive. */
	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			dmu_tx_commit(tx);
		}
	}
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}


/* Compile-time checks: both ZFS fid variants must fit in a generic fid. */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));

/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	uint32_t	gen;
	uint64_t	gen64;
	uint64_t	object = zp->z_id;
	zfid_short_t	*zfid;
	int		size, i, error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
	    &gen64, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	gen = (uint32_t)gen64;

	/* Snapshots (non-parent datasets) need the long fid with objset id. */
	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;

#ifdef illumos
	if (fidp->fid_len < size) {
		fidp->fid_len = size;
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSPC));
	}
#else
	fidp->fid_len = size;
#endif

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t	*zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp, *xzp;
	zfsvfs_t *zfsvfs;
	int error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = INT_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		*valp = 64;
		return (0);
#ifdef illumos
	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		error = zfs_dirent_lookup(zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED);
		if (error == 0) {
			if (!zfs_dirempty(xzp))
				*valp = 1;
			vrele(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);
		return (0);

	case _PC_ACCESS_FILTERING:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
		    vp->v_type == VDIR;
		return (0);

	case _PC_ACL_ENABLED:
		*valp = _ACL_ACE_ENABLED;
		return (0);
#endif	/* illumos */
	case _PC_MIN_HOLE_SIZE:
		*valp = (int)SPA_MINBLOCKSIZE;
		return (0);
#ifdef illumos
	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */
		*valp = 1L;
		return (0);
#endif
	case _PC_ACL_EXTENDED:
		*valp = 0;
		return (0);

	case _PC_ACL_NFS4:
		*valp = 1;
		return (0);

	case _PC_ACL_PATH_MAX:
		*valp = ACL_MAX_ENTRIES;
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}

/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*ARGSUSED*/
int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	zilog_t	*zilog = zfsvfs->z_log;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	error = zfs_setacl(zp, vsecp, skipaclchk, cr);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

static int
zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
    int *rahead)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zp->z_zfsvfs->z_os;
	rl_t *rl;
	vm_object_t object;
	off_t start, end, obj_size;
	uint_t blksz;
	int pgsin_b, pgsin_a;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	start = IDX_TO_OFF(ma[0]->pindex);
	end = IDX_TO_OFF(ma[count - 1]->pindex + 1);

	/*
	 * Lock a range covering all required and optional pages.
	 * Note that we need to handle the case of the block size growing.
	 * (If z_blksz changed while we slept on the range lock, retry with
	 * the new block size so the locked range stays block-aligned.)
	 */
	for (;;) {
		blksz = zp->z_blksz;
		rl = zfs_range_lock(zp, rounddown(start, blksz),
		    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
		if (blksz == zp->z_blksz)
			break;
		zfs_range_unlock(rl);
	}

	/* Snapshot the VM object size; pages entirely past EOF are invalid. */
	object = ma[0]->object;
	zfs_vmobject_wlock(object);
	obj_size = object->un_pager.vnp.vnp_size;
	zfs_vmobject_wunlock(object);
	if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (zfs_vm_pagerret_bad);
	}

	pgsin_b = 0;
	if (rbehind != NULL) {
		pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
		pgsin_b = MIN(*rbehind, pgsin_b);
	}

	pgsin_a = 0;
	if (rahead != NULL) {
		pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
		if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
			pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
		pgsin_a = MIN(*rahead, pgsin_a);
	}

	/*
	 * NB: we need to pass the exact byte size of the data that we expect
	 * to read after accounting for the file size.  This is required because
	 * ZFS will panic if we request DMU to read beyond the end of the last
	 * allocated block.
	 */
	error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
	    MIN(end, obj_size) - (end - PAGE_SIZE));

	zfs_range_unlock(rl);
	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);

	if (error != 0)
		return (zfs_vm_pagerret_error);

	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, count + pgsin_b + pgsin_a);
	if (rbehind != NULL)
		*rbehind = pgsin_b;
	if (rahead != NULL)
		*rahead = pgsin_a;
	return (zfs_vm_pagerret_ok);
}

static int
zfs_freebsd_getpages(ap)
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int *a_rbehind;
		int *a_rahead;
	} */ *ap;
{

	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
	    ap->a_rahead));
}

static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
    int *rtvals)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	rl_t		*rl;
	dmu_tx_t	*tx;
	struct sf_buf	*sf;
	vm_object_t	object;
	vm_page_t	m;
	caddr_t		va;
	size_t		tocopy;
	size_t		lo_len;
	vm_ooffset_t	lo_off;
	vm_ooffset_t	off;
	uint_t		blksz;
	int		ncount;
	int		pcount;
	int		err;
	int		i;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	object = vp->v_object;
	pcount = btoc(len);
	ncount = pcount;

	KASSERT(ma[0]->object == object, ("mismatching object"));
	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));

	/* Assume failure for every page until proven otherwise below. */
	for (i = 0; i < pcount; i++)
		rtvals[i] = zfs_vm_pagerret_error;

	/* Range-lock the whole block-aligned span covering the pages. */
	off = IDX_TO_OFF(ma[0]->pindex);
	blksz = zp->z_blksz;
	lo_off = rounddown(off, blksz);
	lo_len = roundup(len + (off - lo_off), blksz);
	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);

	zfs_vmobject_wlock(object);
	if (len + off > object->un_pager.vnp.vnp_size) {
		if (object->un_pager.vnp.vnp_size > off) {
			int pgoff;

			len = object->un_pager.vnp.vnp_size - off;
			ncount = btoc(len);
			if ((pgoff = (int)len & PAGE_MASK) != 0) {
				/*
				 * If the object is locked and the following
				 * conditions hold, then the page's dirty
				 * field cannot be concurrently changed by a
				 * pmap operation.
				 */
				m = ma[ncount - 1];
				vm_page_assert_sbusied(m);
				KASSERT(!pmap_page_is_write_mapped(m),
				    ("zfs_putpages: page %p is not read-only", m));
				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
				    pgoff);
			}
		} else {
			len = 0;
			ncount = 0;
		}
		if (ncount < pcount) {
			for (i = ncount; i < pcount; i++) {
				rtvals[i] = zfs_vm_pagerret_bad;
			}
		}
	}
	zfs_vmobject_wunlock(object);

	if (ncount == 0)
		goto out;

	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
		goto out;
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, off, len);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		goto out;
	}

	if (zp->z_blksz < PAGE_SIZE) {
		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
			va = zfs_map_page(ma[i], &sf);
			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
			zfs_unmap_page(sf);
		}
	} else {
		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
	}

	if (err == 0) {
		uint64_t mtime[2], ctime[2];
		sa_bulk_attr_t bulk[3];
		int count = 0;

		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    &mtime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
		    &zp->z_pflags, 8);
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		ASSERT0(err);
		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);

		zfs_vmobject_wlock(object);
		for (i = 0; i < ncount; i++) {
			rtvals[i] = zfs_vm_pagerret_ok;
			vm_page_undirty(ma[i]);
		}
		zfs_vmobject_wunlock(object);
		PCPU_INC(cnt.v_vnodeout);
		PCPU_ADD(cnt.v_vnodepgsout, ncount);
	}
	dmu_tx_commit(tx);

out:
	zfs_range_unlock(rl);
	/*
	 * Push the ZIL when the pageout is synchronous/invalidating or the
	 * dataset is sync=always.
	 */
	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (rtvals[0]);
}

int
zfs_freebsd_putpages(ap)
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_sync;
		int *a_rtvals;
	} */ *ap;
{

	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
	    ap->a_rtvals));
}

static int
zfs_freebsd_bmap(ap)
	struct vop_bmap_args /* {
		struct vnode *a_vp;
		daddr_t a_bn;
		struct bufobj **a_bop;
		daddr_t *a_bnp;
		int *a_runp;
		int *a_runb;
	} */ *ap;
{

	if (ap->a_bop != NULL)
		*ap->a_bop = &ap->a_vp->v_bufobj;
	if (ap->a_bnp != NULL)
		*ap->a_bnp = ap->a_bn;
	if (ap->a_runp != NULL)
		*ap->a_runp = 0;
	if (ap->a_runb != NULL)
		*ap->a_runb = 0;

	return (0);
}

/*
 * VOP_OPEN(9): open the file and make sure a VM object is attached so the
 * file can be mmap'ed.
 */
static int
zfs_freebsd_open(ap)
	struct vop_open_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	int error;

	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
	if (error == 0)
		vnode_create_vobject(vp, zp->z_size, ap->a_td);
	return (error);
}

/*
 * VOP_CLOSE(9) entry point.
 */
static int
zfs_freebsd_close(ap)
	struct vop_close_args /* {
		struct vnode *a_vp;
		int a_fflag;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
}

/*
 * VOP_IOCTL(9) entry point.
 */
static int
zfs_freebsd_ioctl(ap)
	struct vop_ioctl_args /* {
		struct vnode *a_vp;
		u_long a_command;
		caddr_t a_data;
		int a_fflag;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{

	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
	    ap->a_fflag, ap->a_cred, NULL, NULL));
}

/*
 * Translate FreeBSD IO_* ioflags into the Solaris-style F* flags that
 * zfs_read()/zfs_write() expect.
 */
static int
ioflags(int ioflags)
{
	int flags = 0;

	if (ioflags & IO_APPEND)
		flags |= FAPPEND;
	if (ioflags & IO_NDELAY)
		flags |= FNONBLOCK;
	if (ioflags & IO_SYNC)
		flags |= (FSYNC | FDSYNC | FRSYNC);

	return (flags);
}

/*
 * VOP_READ(9) entry point.
 */
static int
zfs_freebsd_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
	    ap->a_cred, NULL));
}

/*
 * VOP_WRITE(9) entry point.
 */
static int
zfs_freebsd_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
	    ap->a_cred, NULL));
}

/*
 * VOP_ACCESS(9): split the check between ZFS (VREAD/VWRITE/VEXEC/VAPPEND)
 * and the generic vaccess() for the remaining bits (e.g. VADMIN).
 */
static int
zfs_freebsd_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		accmode_t a_accmode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	accmode_t accmode;
	int error = 0;

	/*
	 * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND,
	 */
	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
	if (accmode != 0)
		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);

	/*
	 * VADMIN has to be handled by vaccess().
	 */
	if (error == 0) {
		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
		if (accmode != 0) {
			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
			    zp->z_gid, accmode, ap->a_cred, NULL);
		}
	}

	/*
	 * For VEXEC, ensure that at least one execute bit is set for
	 * non-directories.
	 */
	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
		error = EACCES;
	}

	return (error);
}

/*
 * VOP_CACHEDLOOKUP(9): copy the (not NUL-terminated) component name into a
 * local buffer and hand it to zfs_lookup().
 */
static int
zfs_freebsd_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	char nm[NAME_MAX + 1];

	ASSERT(cnp->cn_namelen < sizeof(nm));
	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));

	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
	    cnp->cn_cred, cnp->cn_thread, 0));
}

/*
 * VOP_LOOKUP(9): go through the name cache when the dataset allows it,
 * otherwise fall back to an uncached lookup.
 */
static int
zfs_cache_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	zfsvfs_t *zfsvfs;

	zfsvfs = ap->a_dvp->v_mount->mnt_data;
	if (zfsvfs->z_use_namecache)
		return (vfs_cache_lookup(ap));
	else
		return (zfs_freebsd_lookup(ap));
}

/*
 * VOP_CREATE(9): create a regular file and, when the name cache is in use,
 * enter the new name.
 */
static int
zfs_freebsd_create(ap)
	struct vop_create_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
	} */ *ap;
{
	zfsvfs_t *zfsvfs;
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;
	int error, mode;

	ASSERT(cnp->cn_flags & SAVENAME);

	vattr_init_mask(vap);
	mode = vap->va_mode & ALLPERMS;
	zfsvfs = ap->a_dvp->v_mount->mnt_data;

	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
	if (zfsvfs->z_use_namecache &&
	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
	return (error);
}

/*
 * VOP_REMOVE(9): remove a file.
 */
static int
zfs_freebsd_remove(ap)
	struct vop_remove_args /* {
		struct vnode *a_dvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
	    ap->a_cnp->cn_cred));
}

/*
 * VOP_MKDIR(9): create a directory.
 */
static int
zfs_freebsd_mkdir(ap)
	struct vop_mkdir_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	vattr_init_mask(vap);

	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
	    ap->a_cnp->cn_cred));
}

/*
 * VOP_RMDIR(9): remove a directory.
 */
static int
zfs_freebsd_rmdir(ap)
	struct vop_rmdir_args /* {
		struct vnode *a_dvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;

	ASSERT(cnp->cn_flags & SAVENAME);

	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
}

/*
 * VOP_READDIR(9): read directory entries.
 */
static int
zfs_freebsd_readdir(ap)
	struct vop_readdir_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
		int *a_eofflag;
		int *a_ncookies;
		u_long **a_cookies;
	} */ *ap;
{

	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
	    ap->a_ncookies, ap->a_cookies));
}

/*
 * VOP_FSYNC(9): flush dirty buffers through the standard path, then commit
 * the file's data in the ZIL.
 */
static int
zfs_freebsd_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		int a_waitfor;
		struct thread *a_td;
	} */ *ap;
{

	vop_stdfsync(ap);
	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
}

/*
 * VOP_GETATTR(9): fetch attributes, translating ZFS extended attributes
 * (xoptattrs) into BSD chflags(2) bits in va_flags.
 */
static int
zfs_freebsd_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;
	xvattr_t xvap;
	u_long fflags = 0;
	int error;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;
	xvap.xva_vattr.va_mask |= AT_XVATTR;

	/* Convert chflags into ZFS-type flags. */
	/* XXX: what about SF_SETTABLE?. */
	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
	XVA_SET_REQ(&xvap, XAT_NODUMP);
	XVA_SET_REQ(&xvap, XAT_READONLY);
	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
	XVA_SET_REQ(&xvap, XAT_SYSTEM);
	XVA_SET_REQ(&xvap, XAT_HIDDEN);
	XVA_SET_REQ(&xvap, XAT_REPARSE);
	XVA_SET_REQ(&xvap, XAT_OFFLINE);
	XVA_SET_REQ(&xvap, XAT_SPARSE);

	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
	if (error != 0)
		return (error);

	/* Convert ZFS xattr into chflags. */
#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
		fflags |= (fflag);					\
} while (0)
	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
	    xvap.xva_xoptattrs.xoa_immutable);
	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
	    xvap.xva_xoptattrs.xoa_appendonly);
	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
	    xvap.xva_xoptattrs.xoa_nounlink);
	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
	    xvap.xva_xoptattrs.xoa_archive);
	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
	    xvap.xva_xoptattrs.xoa_nodump);
	FLAG_CHECK(UF_READONLY, XAT_READONLY,
	    xvap.xva_xoptattrs.xoa_readonly);
	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
	    xvap.xva_xoptattrs.xoa_system);
	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
	    xvap.xva_xoptattrs.xoa_hidden);
	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
	    xvap.xva_xoptattrs.xoa_reparse);
	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
	    xvap.xva_xoptattrs.xoa_offline);
	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
	    xvap.xva_xoptattrs.xoa_sparse);

#undef	FLAG_CHECK
	*vap = xvap.xva_vattr;
	vap->va_flags = fflags;
	return (0);
}

/*
 * VOP_SETATTR(9): set attributes, translating BSD chflags(2) bits into
 * ZFS extended attributes.
 */
static int
zfs_freebsd_setattr(ap)
	struct vop_setattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
	} */
*ap; 5146{ 5147 vnode_t *vp = ap->a_vp; 5148 vattr_t *vap = ap->a_vap; 5149 cred_t *cred = ap->a_cred; 5150 xvattr_t xvap; 5151 u_long fflags; 5152 uint64_t zflags; 5153 5154 vattr_init_mask(vap); 5155 vap->va_mask &= ~AT_NOSET; 5156 5157 xva_init(&xvap); 5158 xvap.xva_vattr = *vap; 5159 5160 zflags = VTOZ(vp)->z_pflags; 5161 5162 if (vap->va_flags != VNOVAL) { 5163 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; 5164 int error; 5165 5166 if (zfsvfs->z_use_fuids == B_FALSE) 5167 return (EOPNOTSUPP); 5168 5169 fflags = vap->va_flags; 5170 /* 5171 * XXX KDM 5172 * We need to figure out whether it makes sense to allow 5173 * UF_REPARSE through, since we don't really have other 5174 * facilities to handle reparse points and zfs_setattr() 5175 * doesn't currently allow setting that attribute anyway. 5176 */ 5177 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| 5178 UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| 5179 UF_OFFLINE|UF_SPARSE)) != 0) 5180 return (EOPNOTSUPP); 5181 /* 5182 * Unprivileged processes are not permitted to unset system 5183 * flags, or modify flags if any system flags are set. 5184 * Privileged non-jail processes may not modify system flags 5185 * if securelevel > 0 and any existing system flags are set. 5186 * Privileged jail processes behave like privileged non-jail 5187 * processes if the security.jail.chflags_allowed sysctl is 5188 * is non-zero; otherwise, they behave like unprivileged 5189 * processes. 5190 */ 5191 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || 5192 priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { 5193 if (zflags & 5194 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 5195 error = securelevel_gt(cred, 0); 5196 if (error != 0) 5197 return (error); 5198 } 5199 } else { 5200 /* 5201 * Callers may only modify the file flags on objects they 5202 * have VADMIN rights for. 
5203 */ 5204 if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) 5205 return (error); 5206 if (zflags & 5207 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 5208 return (EPERM); 5209 } 5210 if (fflags & 5211 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { 5212 return (EPERM); 5213 } 5214 } 5215 5216#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ 5217 if (((fflags & (fflag)) && !(zflags & (zflag))) || \ 5218 ((zflags & (zflag)) && !(fflags & (fflag)))) { \ 5219 XVA_SET_REQ(&xvap, (xflag)); \ 5220 (xfield) = ((fflags & (fflag)) != 0); \ 5221 } \ 5222} while (0) 5223 /* Convert chflags into ZFS-type flags. */ 5224 /* XXX: what about SF_SETTABLE?. */ 5225 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, 5226 xvap.xva_xoptattrs.xoa_immutable); 5227 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, 5228 xvap.xva_xoptattrs.xoa_appendonly); 5229 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, 5230 xvap.xva_xoptattrs.xoa_nounlink); 5231 FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, 5232 xvap.xva_xoptattrs.xoa_archive); 5233 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, 5234 xvap.xva_xoptattrs.xoa_nodump); 5235 FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, 5236 xvap.xva_xoptattrs.xoa_readonly); 5237 FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, 5238 xvap.xva_xoptattrs.xoa_system); 5239 FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, 5240 xvap.xva_xoptattrs.xoa_hidden); 5241 FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, 5242 xvap.xva_xoptattrs.xoa_hidden); 5243 FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, 5244 xvap.xva_xoptattrs.xoa_offline); 5245 FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, 5246 xvap.xva_xoptattrs.xoa_sparse); 5247#undef FLAG_CHANGE 5248 } 5249 if (vap->va_birthtime.tv_sec != VNOVAL) { 5250 xvap.xva_vattr.va_mask |= AT_XVATTR; 5251 XVA_SET_REQ(&xvap, XAT_CREATETIME); 5252 } 5253 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); 5254} 5255 5256static int 5257zfs_freebsd_rename(ap) 5258 struct 
	vop_rename_args /* {
		struct vnode *a_fdvp;
		struct vnode *a_fvp;
		struct componentname *a_fcnp;
		struct vnode *a_tdvp;
		struct vnode *a_tvp;
		struct componentname *a_tcnp;
	} */ *ap;
{
	vnode_t *fdvp = ap->a_fdvp;
	vnode_t *fvp = ap->a_fvp;
	vnode_t *tdvp = ap->a_tdvp;
	vnode_t *tvp = ap->a_tvp;
	int error;

	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));

	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
	    ap->a_tcnp, ap->a_fcnp->cn_cred);

	/* VOP_RENAME consumes the caller's references on all four vnodes. */
	vrele(fdvp);
	vrele(fvp);
	vrele(tdvp);
	if (tvp != NULL)
		vrele(tvp);

	return (error);
}

/*
 * VOP_SYMLINK(9): create a symbolic link.
 */
static int
zfs_freebsd_symlink(ap)
	struct vop_symlink_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
		char *a_target;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;

	ASSERT(cnp->cn_flags & SAVENAME);

	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
	vattr_init_mask(vap);

	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
}

/*
 * VOP_READLINK(9): read the target of a symbolic link.
 */
static int
zfs_freebsd_readlink(ap)
	struct vop_readlink_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
}

/*
 * VOP_LINK(9): create a hard link; cross-mount links are rejected.
 */
static int
zfs_freebsd_link(ap)
	struct vop_link_args /* {
		struct vnode *a_tdvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	vnode_t *vp = ap->a_vp;
	vnode_t *tdvp = ap->a_tdvp;

	if (tdvp->v_mount != vp->v_mount)
		return (EXDEV);

	ASSERT(cnp->cn_flags & SAVENAME);

	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
}

/*
 * VOP_INACTIVE(9): called when the last reference is dropped.
 */
static int
zfs_freebsd_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;

	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
	return (0);
}

/*
 * VOP_RECLAIM(9): tear down the znode when the vnode is recycled.
 */
static int
zfs_freebsd_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ASSERT(zp != NULL);

	/* Destroy the vm object and flush associated pages. */
	vnode_destroy_vobject(vp);

	/*
	 * z_teardown_inactive_lock protects from a race with
	 * zfs_znode_dmu_fini in zfsvfs_teardown during
	 * force unmount.
	 */
	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL)
		zfs_znode_free(zp);
	else
		zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);

	vp->v_data = NULL;
	return (0);
}

/*
 * VOP_FID(9): return the NFS-style file handle identifier.
 */
static int
zfs_freebsd_fid(ap)
	struct vop_fid_args /* {
		struct vnode *a_vp;
		struct fid *a_fid;
	} */ *ap;
{

	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
}

/*
 * VOP_PATHCONF(9): query limits; supply generic values for names that
 * zfs_pathconf() does not implement.
 */
static int
zfs_freebsd_pathconf(ap)
	struct vop_pathconf_args /* {
		struct vnode *a_vp;
		int a_name;
		register_t *a_retval;
	} */ *ap;
{
	ulong_t val;
	int error;

	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
	if (error == 0) {
		*ap->a_retval = val;
		return (error);
	}
	if (error != EOPNOTSUPP)
		return (error);

	switch (ap->a_name) {
	case _PC_NAME_MAX:
		*ap->a_retval = NAME_MAX;
		return (0);
	case _PC_PIPE_BUF:
		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
			*ap->a_retval = PIPE_BUF;
			return (0);
		}
		return (EINVAL);
	default:
		return (vop_stdpathconf(ap));
	}
}

/*
 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
 * extended attribute name:
 *
 * NAMESPACE	PREFIX
 * system	freebsd:system:
 * user		(none, can be used to access ZFS fsattr(5) attributes
 *		created on Solaris)
 */
static int
zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
    size_t size)
{
	const char *namespace, *prefix, *suffix;

	/* We don't allow '/' character in attribute name. */
	if (strchr(name, '/') != NULL)
		return (EINVAL);
	/* We don't allow attribute names that start with "freebsd:" string.
	 */
	if (strncmp(name, "freebsd:", 8) == 0)
		return (EINVAL);

	bzero(attrname, size);

	switch (attrnamespace) {
	case EXTATTR_NAMESPACE_USER:
#if 0
		prefix = "freebsd:";
		namespace = EXTATTR_NAMESPACE_USER_STRING;
		suffix = ":";
#else
		/*
		 * This is the default namespace by which we can access all
		 * attributes created on Solaris.
		 */
		prefix = namespace = suffix = "";
#endif
		break;
	case EXTATTR_NAMESPACE_SYSTEM:
		prefix = "freebsd:";
		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
		suffix = ":";
		break;
	case EXTATTR_NAMESPACE_EMPTY:
	default:
		return (EINVAL);
	}
	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
	    name) >= size) {
		return (ENAMETOOLONG);
	}
	return (0);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
zfs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrname[255];
	struct vattr va;
	vnode_t *xvp = NULL, *vp;
	int error, flags;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);

	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	    sizeof(attrname));
	if (error != 0)
		return (error);

	ZFS_ENTER(zfsvfs);

	/* Attributes live as files in the hidden xattr directory. */
	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	flags = FREAD;
	NDINIT_ATVP(&nd, LOOKUP,
NOFOLLOW, UIO_SYSSPACE, attrname, 5532 xvp, td); 5533 error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); 5534 vp = nd.ni_vp; 5535 NDFREE(&nd, NDF_ONLY_PNBUF); 5536 if (error != 0) { 5537 ZFS_EXIT(zfsvfs); 5538 if (error == ENOENT) 5539 error = ENOATTR; 5540 return (error); 5541 } 5542 5543 if (ap->a_size != NULL) { 5544 error = VOP_GETATTR(vp, &va, ap->a_cred); 5545 if (error == 0) 5546 *ap->a_size = (size_t)va.va_size; 5547 } else if (ap->a_uio != NULL) 5548 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); 5549 5550 VOP_UNLOCK(vp, 0); 5551 vn_close(vp, flags, ap->a_cred, td); 5552 ZFS_EXIT(zfsvfs); 5553 5554 return (error); 5555} 5556 5557/* 5558 * Vnode operation to remove a named attribute. 5559 */ 5560int 5561zfs_deleteextattr(struct vop_deleteextattr_args *ap) 5562/* 5563vop_deleteextattr { 5564 IN struct vnode *a_vp; 5565 IN int a_attrnamespace; 5566 IN const char *a_name; 5567 IN struct ucred *a_cred; 5568 IN struct thread *a_td; 5569}; 5570*/ 5571{ 5572 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5573 struct thread *td = ap->a_td; 5574 struct nameidata nd; 5575 char attrname[255]; 5576 struct vattr va; 5577 vnode_t *xvp = NULL, *vp; 5578 int error, flags; 5579 5580 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5581 ap->a_cred, ap->a_td, VWRITE); 5582 if (error != 0) 5583 return (error); 5584 5585 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5586 sizeof(attrname)); 5587 if (error != 0) 5588 return (error); 5589 5590 ZFS_ENTER(zfsvfs); 5591 5592 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5593 LOOKUP_XATTR); 5594 if (error != 0) { 5595 ZFS_EXIT(zfsvfs); 5596 return (error); 5597 } 5598 5599 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, 5600 UIO_SYSSPACE, attrname, xvp, td); 5601 error = namei(&nd); 5602 vp = nd.ni_vp; 5603 if (error != 0) { 5604 ZFS_EXIT(zfsvfs); 5605 NDFREE(&nd, NDF_ONLY_PNBUF); 5606 if (error == ENOENT) 5607 error = ENOATTR; 5608 return (error); 5609 } 
5610 5611 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); 5612 NDFREE(&nd, NDF_ONLY_PNBUF); 5613 5614 vput(nd.ni_dvp); 5615 if (vp == nd.ni_dvp) 5616 vrele(vp); 5617 else 5618 vput(vp); 5619 ZFS_EXIT(zfsvfs); 5620 5621 return (error); 5622} 5623 5624/* 5625 * Vnode operation to set a named attribute. 5626 */ 5627static int 5628zfs_setextattr(struct vop_setextattr_args *ap) 5629/* 5630vop_setextattr { 5631 IN struct vnode *a_vp; 5632 IN int a_attrnamespace; 5633 IN const char *a_name; 5634 INOUT struct uio *a_uio; 5635 IN struct ucred *a_cred; 5636 IN struct thread *a_td; 5637}; 5638*/ 5639{ 5640 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5641 struct thread *td = ap->a_td; 5642 struct nameidata nd; 5643 char attrname[255]; 5644 struct vattr va; 5645 vnode_t *xvp = NULL, *vp; 5646 int error, flags; 5647 5648 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5649 ap->a_cred, ap->a_td, VWRITE); 5650 if (error != 0) 5651 return (error); 5652 5653 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5654 sizeof(attrname)); 5655 if (error != 0) 5656 return (error); 5657 5658 ZFS_ENTER(zfsvfs); 5659 5660 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5661 LOOKUP_XATTR | CREATE_XATTR_DIR); 5662 if (error != 0) { 5663 ZFS_EXIT(zfsvfs); 5664 return (error); 5665 } 5666 5667 flags = FFLAGS(O_WRONLY | O_CREAT); 5668 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, 5669 xvp, td); 5670 error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); 5671 vp = nd.ni_vp; 5672 NDFREE(&nd, NDF_ONLY_PNBUF); 5673 if (error != 0) { 5674 ZFS_EXIT(zfsvfs); 5675 return (error); 5676 } 5677 5678 VATTR_NULL(&va); 5679 va.va_size = 0; 5680 error = VOP_SETATTR(vp, &va, ap->a_cred); 5681 if (error == 0) 5682 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); 5683 5684 VOP_UNLOCK(vp, 0); 5685 vn_close(vp, flags, ap->a_cred, td); 5686 ZFS_EXIT(zfsvfs); 5687 5688 return (error); 5689} 5690 5691/* 5692 * Vnode operation to retrieve extended 
 attributes on a vnode.
 */
static int
zfs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrprefix[16];
	u_char dirbuf[sizeof(struct dirent)];
	struct dirent *dp;
	struct iovec aiov;
	struct uio auio, *uio = ap->a_uio;
	size_t *sizep = ap->a_size;
	size_t plen;
	vnode_t *xvp = NULL, *vp;
	int done, error, eof, pos;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);

	/* Only names carrying this namespace's prefix are reported. */
	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
	    sizeof(attrprefix));
	if (error != 0)
		return (error);
	plen = strlen(attrprefix);

	ZFS_ENTER(zfsvfs);

	if (sizep != NULL)
		*sizep = 0;

	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		/*
		 * ENOATTR means that the EA directory does not yet exist,
		 * i.e. there are no extended attributes there.
		 */
		if (error == ENOATTR)
			error = 0;
		return (error);
	}

	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
	    UIO_SYSSPACE, ".", xvp, td);
	error = namei(&nd);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;

	/* Scan the xattr directory one dirent buffer at a time. */
	do {
		u_char nlen;

		aiov.iov_base = (void *)dirbuf;
		aiov.iov_len = sizeof(dirbuf);
		auio.uio_resid = sizeof(dirbuf);
		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
		done = sizeof(dirbuf) - auio.uio_resid;
		if (error != 0)
			break;
		for (pos = 0; pos < done;) {
			dp = (struct dirent *)(dirbuf + pos);
			pos += dp->d_reclen;
			/*
			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
			 * is what we get when attribute was created on Solaris.
			 */
			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
				continue;
			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
				continue;
			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
				continue;
			nlen = dp->d_namlen - plen;
			if (sizep != NULL)
				*sizep += 1 + nlen;
			else if (uio != NULL) {
				/*
				 * Format of extattr name entry is one byte for
				 * length and the rest for name.
				 */
				error = uiomove(&nlen, 1, uio->uio_rw, uio);
				if (error == 0) {
					error = uiomove(dp->d_name + plen, nlen,
					    uio->uio_rw, uio);
				}
				if (error != 0)
					break;
			}
		}
	} while (!eof && error == 0);

	vput(vp);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * VOP_GETACL(9): fetch the NFSv4 ACL; other ACL types are not supported.
 */
int
zfs_freebsd_getacl(ap)
	struct vop_getacl_args /* {
		struct vnode *vp;
		acl_type_t type;
		struct acl *aclp;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{
	int error;
	vsecattr_t vsecattr;

	if (ap->a_type != ACL_TYPE_NFS4)
		return (EINVAL);

	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
		return (error);

	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
	if (vsecattr.vsa_aclentp != NULL)
		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);

	return (error);
}

/*
 * VOP_SETACL(9): validate and install an NFSv4 ACL.
 */
int
zfs_freebsd_setacl(ap)
	struct vop_setacl_args /* {
		struct vnode *vp;
		acl_type_t type;
		struct acl *aclp;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{
	int error;
	vsecattr_t vsecattr;
	int aclbsize;	/* size of acl list in bytes */
	aclent_t *aaclp;

	if (ap->a_type != ACL_TYPE_NFS4)
		return (EINVAL);

	if (ap->a_aclp == NULL)
		return (EINVAL);

	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
		return (EINVAL);

	/*
	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
	 * splitting every entry into two and appending "canonical six"
	 * entries at the end.  Don't allow for setting an ACL that would
	 * cause chmod(2) to run out of ACL entries.
5870 */ 5871 if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) 5872 return (ENOSPC); 5873 5874 error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); 5875 if (error != 0) 5876 return (error); 5877 5878 vsecattr.vsa_mask = VSA_ACE; 5879 aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); 5880 vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); 5881 aaclp = vsecattr.vsa_aclentp; 5882 vsecattr.vsa_aclentsz = aclbsize; 5883 5884 aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); 5885 error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); 5886 kmem_free(aaclp, aclbsize); 5887 5888 return (error); 5889} 5890 5891int 5892zfs_freebsd_aclcheck(ap) 5893 struct vop_aclcheck_args /* { 5894 struct vnode *vp; 5895 acl_type_t type; 5896 struct acl *aclp; 5897 struct ucred *cred; 5898 struct thread *td; 5899 } */ *ap; 5900{ 5901 5902 return (EOPNOTSUPP); 5903} 5904 5905static int 5906zfs_vptocnp(struct vop_vptocnp_args *ap) 5907{ 5908 vnode_t *covered_vp; 5909 vnode_t *vp = ap->a_vp;; 5910 zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; 5911 znode_t *zp = VTOZ(vp); 5912 int ltype; 5913 int error; 5914 5915 ZFS_ENTER(zfsvfs); 5916 ZFS_VERIFY_ZP(zp); 5917 5918 /* 5919 * If we are a snapshot mounted under .zfs, run the operation 5920 * on the covered vnode. 
5921 */ 5922 if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { 5923 char name[MAXNAMLEN + 1]; 5924 znode_t *dzp; 5925 size_t len; 5926 5927 error = zfs_znode_parent_and_name(zp, &dzp, name); 5928 if (error == 0) { 5929 len = strlen(name); 5930 if (*ap->a_buflen < len) 5931 error = SET_ERROR(ENOMEM); 5932 } 5933 if (error == 0) { 5934 *ap->a_buflen -= len; 5935 bcopy(name, ap->a_buf + *ap->a_buflen, len); 5936 *ap->a_vpp = ZTOV(dzp); 5937 } 5938 ZFS_EXIT(zfsvfs); 5939 return (error); 5940 } 5941 ZFS_EXIT(zfsvfs); 5942 5943 covered_vp = vp->v_mount->mnt_vnodecovered; 5944 vhold(covered_vp); 5945 ltype = VOP_ISLOCKED(vp); 5946 VOP_UNLOCK(vp, 0); 5947 error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); 5948 if (error == 0) { 5949 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, 5950 ap->a_buf, ap->a_buflen); 5951 vput(covered_vp); 5952 } 5953 vn_lock(vp, ltype | LK_RETRY); 5954 if ((vp->v_iflag & VI_DOOMED) != 0) 5955 error = SET_ERROR(ENOENT); 5956 return (error); 5957} 5958 5959#ifdef DIAGNOSTIC 5960static int 5961zfs_lock(ap) 5962 struct vop_lock1_args /* { 5963 struct vnode *a_vp; 5964 int a_flags; 5965 char *file; 5966 int line; 5967 } */ *ap; 5968{ 5969 vnode_t *vp; 5970 znode_t *zp; 5971 int err; 5972 5973 err = vop_stdlock(ap); 5974 if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { 5975 vp = ap->a_vp; 5976 zp = vp->v_data; 5977 if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && 5978 zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) 5979 VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); 5980 } 5981 return (err); 5982} 5983#endif 5984 5985struct vop_vector zfs_vnodeops; 5986struct vop_vector zfs_fifoops; 5987struct vop_vector zfs_shareops; 5988 5989struct vop_vector zfs_vnodeops = { 5990 .vop_default = &default_vnodeops, 5991 .vop_inactive = zfs_freebsd_inactive, 5992 .vop_reclaim = zfs_freebsd_reclaim, 5993 .vop_access = zfs_freebsd_access, 5994 .vop_lookup = zfs_cache_lookup, 5995 .vop_cachedlookup = zfs_freebsd_lookup, 
5996 .vop_getattr = zfs_freebsd_getattr, 5997 .vop_setattr = zfs_freebsd_setattr, 5998 .vop_create = zfs_freebsd_create, 5999 .vop_mknod = zfs_freebsd_create, 6000 .vop_mkdir = zfs_freebsd_mkdir, 6001 .vop_readdir = zfs_freebsd_readdir, 6002 .vop_fsync = zfs_freebsd_fsync, 6003 .vop_open = zfs_freebsd_open, 6004 .vop_close = zfs_freebsd_close, 6005 .vop_rmdir = zfs_freebsd_rmdir, 6006 .vop_ioctl = zfs_freebsd_ioctl, 6007 .vop_link = zfs_freebsd_link, 6008 .vop_symlink = zfs_freebsd_symlink, 6009 .vop_readlink = zfs_freebsd_readlink, 6010 .vop_read = zfs_freebsd_read, 6011 .vop_write = zfs_freebsd_write, 6012 .vop_remove = zfs_freebsd_remove, 6013 .vop_rename = zfs_freebsd_rename, 6014 .vop_pathconf = zfs_freebsd_pathconf, 6015 .vop_bmap = zfs_freebsd_bmap, 6016 .vop_fid = zfs_freebsd_fid, 6017 .vop_getextattr = zfs_getextattr, 6018 .vop_deleteextattr = zfs_deleteextattr, 6019 .vop_setextattr = zfs_setextattr, 6020 .vop_listextattr = zfs_listextattr, 6021 .vop_getacl = zfs_freebsd_getacl, 6022 .vop_setacl = zfs_freebsd_setacl, 6023 .vop_aclcheck = zfs_freebsd_aclcheck, 6024 .vop_getpages = zfs_freebsd_getpages, 6025 .vop_putpages = zfs_freebsd_putpages, 6026 .vop_vptocnp = zfs_vptocnp, 6027#ifdef DIAGNOSTIC 6028 .vop_lock1 = zfs_lock, 6029#endif 6030}; 6031 6032struct vop_vector zfs_fifoops = { 6033 .vop_default = &fifo_specops, 6034 .vop_fsync = zfs_freebsd_fsync, 6035 .vop_access = zfs_freebsd_access, 6036 .vop_getattr = zfs_freebsd_getattr, 6037 .vop_inactive = zfs_freebsd_inactive, 6038 .vop_read = VOP_PANIC, 6039 .vop_reclaim = zfs_freebsd_reclaim, 6040 .vop_setattr = zfs_freebsd_setattr, 6041 .vop_write = VOP_PANIC, 6042 .vop_pathconf = zfs_freebsd_pathconf, 6043 .vop_fid = zfs_freebsd_fid, 6044 .vop_getacl = zfs_freebsd_getacl, 6045 .vop_setacl = zfs_freebsd_setacl, 6046 .vop_aclcheck = zfs_freebsd_aclcheck, 6047}; 6048 6049/* 6050 * special share hidden files vnode operations template 6051 */ 6052struct vop_vector zfs_shareops = { 6053 .vop_default = 
&default_vnodeops, 6054 .vop_access = zfs_freebsd_access, 6055 .vop_inactive = zfs_freebsd_inactive, 6056 .vop_reclaim = zfs_freebsd_reclaim, 6057 .vop_fid = zfs_freebsd_fid, 6058 .vop_pathconf = zfs_freebsd_pathconf, 6059}; 6060