/* zfs_vnops.c — FreeBSD revision 323748 */
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 * Copyright 2017 Nexenta Systems, Inc. 
27 */ 28 29/* Portions Copyright 2007 Jeremy Teo */ 30/* Portions Copyright 2010 Robert Milkowski */ 31 32#include <sys/types.h> 33#include <sys/param.h> 34#include <sys/time.h> 35#include <sys/systm.h> 36#include <sys/sysmacros.h> 37#include <sys/resource.h> 38#include <sys/vfs.h> 39#include <sys/vm.h> 40#include <sys/vnode.h> 41#include <sys/file.h> 42#include <sys/stat.h> 43#include <sys/kmem.h> 44#include <sys/taskq.h> 45#include <sys/uio.h> 46#include <sys/atomic.h> 47#include <sys/namei.h> 48#include <sys/mman.h> 49#include <sys/cmn_err.h> 50#include <sys/errno.h> 51#include <sys/unistd.h> 52#include <sys/zfs_dir.h> 53#include <sys/zfs_ioctl.h> 54#include <sys/fs/zfs.h> 55#include <sys/dmu.h> 56#include <sys/dmu_objset.h> 57#include <sys/spa.h> 58#include <sys/txg.h> 59#include <sys/dbuf.h> 60#include <sys/zap.h> 61#include <sys/sa.h> 62#include <sys/dirent.h> 63#include <sys/policy.h> 64#include <sys/sunddi.h> 65#include <sys/filio.h> 66#include <sys/sid.h> 67#include <sys/zfs_ctldir.h> 68#include <sys/zfs_fuid.h> 69#include <sys/zfs_sa.h> 70#include <sys/zfs_rlock.h> 71#include <sys/extdirent.h> 72#include <sys/kidmap.h> 73#include <sys/bio.h> 74#include <sys/buf.h> 75#include <sys/sched.h> 76#include <sys/acl.h> 77#include <vm/vm_param.h> 78 79/* 80 * Programming rules. 81 * 82 * Each vnode op performs some logical unit of work. To do this, the ZPL must 83 * properly lock its in-core state, create a DMU transaction, do the work, 84 * record this work in the intent log (ZIL), commit the DMU transaction, 85 * and wait for the intent log to commit if it is a synchronous operation. 86 * Moreover, the vnode ops must work in both normal and log replay context. 87 * The ordering of events is important to avoid deadlocks and references 88 * to freed memory. The example below illustrates the following Big Rules: 89 * 90 * (1) A check must be made in each zfs thread for a mounted file system. 91 * This is done avoiding races using ZFS_ENTER(zfsvfs). 
92 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 93 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 94 * can return EIO from the calling function. 95 * 96 * (2) VN_RELE() should always be the last thing except for zil_commit() 97 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 98 * First, if it's the last reference, the vnode/znode 99 * can be freed, so the zp may point to freed memory. Second, the last 100 * reference will call zfs_zinactive(), which may induce a lot of work -- 101 * pushing cached pages (which acquires range locks) and syncing out 102 * cached atime changes. Third, zfs_zinactive() may require a new tx, 103 * which could deadlock the system if you were already holding one. 104 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 105 * 106 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 107 * as they can span dmu_tx_assign() calls. 108 * 109 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 110 * dmu_tx_assign(). This is critical because we don't want to block 111 * while holding locks. 112 * 113 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This 114 * reduces lock contention and CPU usage when we must wait (note that if 115 * throughput is constrained by the storage, nearly every transaction 116 * must wait). 117 * 118 * Note, in particular, that if a lock is sometimes acquired before 119 * the tx assigns, and sometimes after (e.g. z_lock), then failing 120 * to use a non-blocking assign can deadlock the system. The scenario: 121 * 122 * Thread A has grabbed a lock before calling dmu_tx_assign(). 123 * Thread B is in an already-assigned tx, and blocks for this lock. 124 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 125 * forever, because the previous txg can't quiesce until B's tx commits. 
126 * 127 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 128 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 129 * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, 130 * to indicate that this operation has already called dmu_tx_wait(). 131 * This will ensure that we don't retry forever, waiting a short bit 132 * each time. 133 * 134 * (5) If the operation succeeded, generate the intent log entry for it 135 * before dropping locks. This ensures that the ordering of events 136 * in the intent log matches the order in which they actually occurred. 137 * During ZIL replay the zfs_log_* functions will update the sequence 138 * number to indicate the zil transaction has replayed. 139 * 140 * (6) At the end of each vnode op, the DMU tx must always commit, 141 * regardless of whether there were any errors. 142 * 143 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 144 * to ensure that synchronous semantics are provided when necessary. 145 * 146 * In general, this is how things should be ordered in each vnode op: 147 * 148 * ZFS_ENTER(zfsvfs); // exit if unmounted 149 * top: 150 * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) 151 * rw_enter(...); // grab any other locks you need 152 * tx = dmu_tx_create(...); // get DMU tx 153 * dmu_tx_hold_*(); // hold each object you might modify 154 * error = dmu_tx_assign(tx, waited ? 
 *	    TXG_WAITED : TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/*
 * Open a file: reject a non-append write-open of an append-only file,
 * run the anti-virus scanner (if enabled) and veto the open on failure,
 * and count synchronous (FSYNC/FDSYNC) opens in the znode.
 */
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Append-only files may only be opened for appending writes. */
	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Virus scan eligible files on open */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Close a file: release byte-range/share locks held by the caller,
 * decrement the synchronous-open count taken in zfs_open(), and run the
 * anti-virus scanner (if enabled) on the closed file.
 */
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* ESRCH from the DMU means "no such hole/data past noff". */
	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/*
 * ioctl entry point.  Handles the bfu compatibility no-ops, SEEK_HOLE /
 * SEEK_DATA via zfs_holey(), and (illumos only) _FIO_COUNT_FILLED.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}

/*
 * Look up the page at (vp->v_object, start) and shared-busy it for a
 * write-back, retrying (with the object lock dropped) while the page is
 * exclusive-busy.  Returns NULL if there is no valid resident page.
 * [off, off+nbytes) is the sub-range being written; its DEV_BSIZE-aligned
 * interior is marked clean.
 */
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	 * aligned boundaries, if the range is not aligned.  As a result a
	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
	 * the whole page would be considered clean despite having some dirty
	 * data.  For this reason we should shrink the range to DEV_BSIZE
	 * aligned boundaries before calling vm_page_clear_dirty.
	 */
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp != NULL) {
			ASSERT(!pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			/* Balanced by page_unbusy() after the DMU copy. */
			vm_object_pip_add(obj, 1);
			pmap_remove_write(pp);
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}

/* Drop the shared-busy and paging-in-progress taken by page_busy(). */
static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}

/*
 * Look up and wire-hold the valid resident page at (vp->v_object, start),
 * sleeping while it is exclusive-busy.  Returns NULL if no valid page.
 */
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}

/* Release the hold taken by page_hold(). */
static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
508 */ 509static void 510update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, 511 int segflg, dmu_tx_t *tx) 512{ 513 vm_object_t obj; 514 struct sf_buf *sf; 515 caddr_t va; 516 int off; 517 518 ASSERT(segflg != UIO_NOCOPY); 519 ASSERT(vp->v_mount != NULL); 520 obj = vp->v_object; 521 ASSERT(obj != NULL); 522 523 off = start & PAGEOFFSET; 524 zfs_vmobject_wlock(obj); 525 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 526 vm_page_t pp; 527 int nbytes = imin(PAGESIZE - off, len); 528 529 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { 530 zfs_vmobject_wunlock(obj); 531 532 va = zfs_map_page(pp, &sf); 533 (void) dmu_read(os, oid, start+off, nbytes, 534 va+off, DMU_READ_PREFETCH);; 535 zfs_unmap_page(sf); 536 537 zfs_vmobject_wlock(obj); 538 page_unbusy(pp); 539 } 540 len -= nbytes; 541 off = 0; 542 } 543 vm_object_pip_wakeupn(obj, 0); 544 zfs_vmobject_wunlock(obj); 545} 546 547/* 548 * Read with UIO_NOCOPY flag means that sendfile(2) requests 549 * ZFS to populate a range of page cache pages with data. 550 * 551 * NOTE: this function could be optimized to pre-allocate 552 * all pages in advance, drain exclusive busy on all of them, 553 * map them into contiguous KVA region and populate them 554 * in one single dmu_read() call. 
 */
static int
mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	struct sf_buf *sf;
	vm_object_t obj;
	vm_page_t pp;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(uio->uio_segflg == UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);
	/* sendfile always starts page-aligned */
	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);

	zfs_vmobject_wlock(obj);
	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
		int bytes = MIN(PAGESIZE, len);

		/* Grab (allocating if needed) the page shared-busied. */
		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
		if (pp->valid == 0) {
			/*
			 * Newly-allocated (invalid) page: fill it from the
			 * DMU with the object lock dropped, zero any tail
			 * past EOF, then mark it fully valid on success or
			 * free it (if still unreferenced) on error.
			 */
			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
			error = dmu_read(os, zp->z_id, start, bytes, va,
			    DMU_READ_PREFETCH);
			if (bytes != PAGESIZE && error == 0)
				bzero(va + bytes, PAGESIZE - bytes);
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			vm_page_sunbusy(pp);
			vm_page_lock(pp);
			if (error) {
				if (pp->wire_count == 0 && pp->valid == 0 &&
				    !vm_page_busied(pp))
					vm_page_free(pp);
			} else {
				pp->valid = VM_PAGE_BITS_ALL;
				vm_page_activate(pp);
			}
			vm_page_unlock(pp);
		} else {
			/* Page already valid; nothing to fill. */
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_sunbusy(pp);
		}
		if (error)
			break;
		/* UIO_NOCOPY: advance the uio by hand, no data copy. */
		uio->uio_resid -= bytes;
		uio->uio_offset += bytes;
		len -= bytes;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
625 */ 626static int 627mappedread(vnode_t *vp, int nbytes, uio_t *uio) 628{ 629 znode_t *zp = VTOZ(vp); 630 vm_object_t obj; 631 int64_t start; 632 caddr_t va; 633 int len = nbytes; 634 int off; 635 int error = 0; 636 637 ASSERT(vp->v_mount != NULL); 638 obj = vp->v_object; 639 ASSERT(obj != NULL); 640 641 start = uio->uio_loffset; 642 off = start & PAGEOFFSET; 643 zfs_vmobject_wlock(obj); 644 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 645 vm_page_t pp; 646 uint64_t bytes = MIN(PAGESIZE - off, len); 647 648 if (pp = page_hold(vp, start)) { 649 struct sf_buf *sf; 650 caddr_t va; 651 652 zfs_vmobject_wunlock(obj); 653 va = zfs_map_page(pp, &sf); 654#ifdef illumos 655 error = uiomove(va + off, bytes, UIO_READ, uio); 656#else 657 error = vn_io_fault_uiomove(va + off, bytes, uio); 658#endif 659 zfs_unmap_page(sf); 660 zfs_vmobject_wlock(obj); 661 page_unhold(pp); 662 } else { 663 zfs_vmobject_wunlock(obj); 664 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 665 uio, bytes); 666 zfs_vmobject_wlock(obj); 667 } 668 len -= bytes; 669 off = 0; 670 if (error) 671 break; 672 } 673 zfs_vmobject_wunlock(obj); 674 return (error); 675} 676 677offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 678 679/* 680 * Read bytes from specified file into supplied buffer. 681 * 682 * IN: vp - vnode of file to be read from. 683 * uio - structure supplying read location, range info, 684 * and return buffer. 685 * ioflag - SYNC flags; used to provide FRSYNC semantics. 686 * cr - credentials of caller. 687 * ct - caller context 688 * 689 * OUT: uio - updated offset and range, buffer filled. 690 * 691 * RETURN: 0 on success, error code on failure. 
692 * 693 * Side Effects: 694 * vp - atime updated if byte count > 0 695 */ 696/* ARGSUSED */ 697static int 698zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 699{ 700 znode_t *zp = VTOZ(vp); 701 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 702 ssize_t n, nbytes; 703 int error = 0; 704 rl_t *rl; 705 xuio_t *xuio = NULL; 706 707 ZFS_ENTER(zfsvfs); 708 ZFS_VERIFY_ZP(zp); 709 710 if (zp->z_pflags & ZFS_AV_QUARANTINED) { 711 ZFS_EXIT(zfsvfs); 712 return (SET_ERROR(EACCES)); 713 } 714 715 /* 716 * Validate file offset 717 */ 718 if (uio->uio_loffset < (offset_t)0) { 719 ZFS_EXIT(zfsvfs); 720 return (SET_ERROR(EINVAL)); 721 } 722 723 /* 724 * Fasttrack empty reads 725 */ 726 if (uio->uio_resid == 0) { 727 ZFS_EXIT(zfsvfs); 728 return (0); 729 } 730 731 /* 732 * Check for mandatory locks 733 */ 734 if (MANDMODE(zp->z_mode)) { 735 if (error = chklock(vp, FREAD, 736 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 737 ZFS_EXIT(zfsvfs); 738 return (error); 739 } 740 } 741 742 /* 743 * If we're in FRSYNC mode, sync out this znode before reading it. 744 */ 745 if (zfsvfs->z_log && 746 (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) 747 zil_commit(zfsvfs->z_log, zp->z_id); 748 749 /* 750 * Lock the range against changes. 751 */ 752 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); 753 754 /* 755 * If we are reading past end-of-file we can skip 756 * to the end; but we might still need to set atime. 
757 */ 758 if (uio->uio_loffset >= zp->z_size) { 759 error = 0; 760 goto out; 761 } 762 763 ASSERT(uio->uio_loffset < zp->z_size); 764 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); 765 766#ifdef illumos 767 if ((uio->uio_extflg == UIO_XUIO) && 768 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { 769 int nblk; 770 int blksz = zp->z_blksz; 771 uint64_t offset = uio->uio_loffset; 772 773 xuio = (xuio_t *)uio; 774 if ((ISP2(blksz))) { 775 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, 776 blksz)) / blksz; 777 } else { 778 ASSERT(offset + n <= blksz); 779 nblk = 1; 780 } 781 (void) dmu_xuio_init(xuio, nblk); 782 783 if (vn_has_cached_data(vp)) { 784 /* 785 * For simplicity, we always allocate a full buffer 786 * even if we only expect to read a portion of a block. 787 */ 788 while (--nblk >= 0) { 789 (void) dmu_xuio_add(xuio, 790 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 791 blksz), 0, blksz); 792 } 793 } 794 } 795#endif /* illumos */ 796 797 while (n > 0) { 798 nbytes = MIN(n, zfs_read_chunk_size - 799 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 800 801#ifdef __FreeBSD__ 802 if (uio->uio_segflg == UIO_NOCOPY) 803 error = mappedread_sf(vp, nbytes, uio); 804 else 805#endif /* __FreeBSD__ */ 806 if (vn_has_cached_data(vp)) { 807 error = mappedread(vp, nbytes, uio); 808 } else { 809 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 810 uio, nbytes); 811 } 812 if (error) { 813 /* convert checksum errors into IO errors */ 814 if (error == ECKSUM) 815 error = SET_ERROR(EIO); 816 break; 817 } 818 819 n -= nbytes; 820 } 821out: 822 zfs_range_unlock(rl); 823 824 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 825 ZFS_EXIT(zfsvfs); 826 return (error); 827} 828 829/* 830 * Write the bytes to a file. 831 * 832 * IN: vp - vnode of file to be written to. 833 * uio - structure supplying write location, range info, 834 * and data buffer. 835 * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is 836 * set if in append mode. 837 * cr - credentials of caller. 
838 * ct - caller context (NFS/CIFS fem monitor only) 839 * 840 * OUT: uio - updated offset and range. 841 * 842 * RETURN: 0 on success, error code on failure. 843 * 844 * Timestamps: 845 * vp - ctime|mtime updated if byte count > 0 846 */ 847 848/* ARGSUSED */ 849static int 850zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 851{ 852 znode_t *zp = VTOZ(vp); 853 rlim64_t limit = MAXOFFSET_T; 854 ssize_t start_resid = uio->uio_resid; 855 ssize_t tx_bytes; 856 uint64_t end_size; 857 dmu_tx_t *tx; 858 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 859 zilog_t *zilog; 860 offset_t woff; 861 ssize_t n, nbytes; 862 rl_t *rl; 863 int max_blksz = zfsvfs->z_max_blksz; 864 int error = 0; 865 arc_buf_t *abuf; 866 iovec_t *aiov = NULL; 867 xuio_t *xuio = NULL; 868 int i_iov = 0; 869 int iovcnt = uio->uio_iovcnt; 870 iovec_t *iovp = uio->uio_iov; 871 int write_eof; 872 int count = 0; 873 sa_bulk_attr_t bulk[4]; 874 uint64_t mtime[2], ctime[2]; 875 876 /* 877 * Fasttrack empty write 878 */ 879 n = start_resid; 880 if (n == 0) 881 return (0); 882 883 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 884 limit = MAXOFFSET_T; 885 886 ZFS_ENTER(zfsvfs); 887 ZFS_VERIFY_ZP(zp); 888 889 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 890 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 891 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 892 &zp->z_size, 8); 893 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 894 &zp->z_pflags, 8); 895 896 /* 897 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our 898 * callers might not be able to detect properly that we are read-only, 899 * so check it explicitly here. 900 */ 901 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 902 ZFS_EXIT(zfsvfs); 903 return (SET_ERROR(EROFS)); 904 } 905 906 /* 907 * If immutable or not appending then return EPERM. 908 * Intentionally allow ZFS_READONLY through here. 
909 * See zfs_zaccess_common() 910 */ 911 if ((zp->z_pflags & ZFS_IMMUTABLE) || 912 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 913 (uio->uio_loffset < zp->z_size))) { 914 ZFS_EXIT(zfsvfs); 915 return (SET_ERROR(EPERM)); 916 } 917 918 zilog = zfsvfs->z_log; 919 920 /* 921 * Validate file offset 922 */ 923 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; 924 if (woff < 0) { 925 ZFS_EXIT(zfsvfs); 926 return (SET_ERROR(EINVAL)); 927 } 928 929 /* 930 * Check for mandatory locks before calling zfs_range_lock() 931 * in order to prevent a deadlock with locks set via fcntl(). 932 */ 933 if (MANDMODE((mode_t)zp->z_mode) && 934 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 935 ZFS_EXIT(zfsvfs); 936 return (error); 937 } 938 939#ifdef illumos 940 /* 941 * Pre-fault the pages to ensure slow (eg NFS) pages 942 * don't hold up txg. 943 * Skip this if uio contains loaned arc_buf. 944 */ 945 if ((uio->uio_extflg == UIO_XUIO) && 946 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) 947 xuio = (xuio_t *)uio; 948 else 949 uio_prefaultpages(MIN(n, max_blksz), uio); 950#endif 951 952 /* 953 * If in append mode, set the io offset pointer to eof. 954 */ 955 if (ioflag & FAPPEND) { 956 /* 957 * Obtain an appending range lock to guarantee file append 958 * semantics. We reset the write offset once we have the lock. 959 */ 960 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 961 woff = rl->r_off; 962 if (rl->r_len == UINT64_MAX) { 963 /* 964 * We overlocked the file because this write will cause 965 * the file block size to increase. 966 * Note that zp_size cannot change with this lock held. 967 */ 968 woff = zp->z_size; 969 } 970 uio->uio_loffset = woff; 971 } else { 972 /* 973 * Note that if the file block size will change as a result of 974 * this write, then this range lock will lock the entire file 975 * so that we can re-write the block safely. 
976 */ 977 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 978 } 979 980 if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { 981 zfs_range_unlock(rl); 982 ZFS_EXIT(zfsvfs); 983 return (EFBIG); 984 } 985 986 if (woff >= limit) { 987 zfs_range_unlock(rl); 988 ZFS_EXIT(zfsvfs); 989 return (SET_ERROR(EFBIG)); 990 } 991 992 if ((woff + n) > limit || woff > (limit - n)) 993 n = limit - woff; 994 995 /* Will this write extend the file length? */ 996 write_eof = (woff + n > zp->z_size); 997 998 end_size = MAX(zp->z_size, woff + n); 999 1000 /* 1001 * Write the file in reasonable size chunks. Each chunk is written 1002 * in a separate transaction; this keeps the intent log records small 1003 * and allows us to do more fine-grained space accounting. 1004 */ 1005 while (n > 0) { 1006 abuf = NULL; 1007 woff = uio->uio_loffset; 1008 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 1009 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 1010 if (abuf != NULL) 1011 dmu_return_arcbuf(abuf); 1012 error = SET_ERROR(EDQUOT); 1013 break; 1014 } 1015 1016 if (xuio && abuf == NULL) { 1017 ASSERT(i_iov < iovcnt); 1018 aiov = &iovp[i_iov]; 1019 abuf = dmu_xuio_arcbuf(xuio, i_iov); 1020 dmu_xuio_clear(xuio, i_iov); 1021 DTRACE_PROBE3(zfs_cp_write, int, i_iov, 1022 iovec_t *, aiov, arc_buf_t *, abuf); 1023 ASSERT((aiov->iov_base == abuf->b_data) || 1024 ((char *)aiov->iov_base - (char *)abuf->b_data + 1025 aiov->iov_len == arc_buf_size(abuf))); 1026 i_iov++; 1027 } else if (abuf == NULL && n >= max_blksz && 1028 woff >= zp->z_size && 1029 P2PHASE(woff, max_blksz) == 0 && 1030 zp->z_blksz == max_blksz) { 1031 /* 1032 * This write covers a full block. "Borrow" a buffer 1033 * from the dmu so that we can fill it before we enter 1034 * a transaction. This avoids the possibility of 1035 * holding up the transaction if the data copy hangs 1036 * up on a pagefault (e.g., from an NFS server mapping). 
1037 */ 1038#ifdef illumos 1039 size_t cbytes; 1040#endif 1041 1042 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 1043 max_blksz); 1044 ASSERT(abuf != NULL); 1045 ASSERT(arc_buf_size(abuf) == max_blksz); 1046#ifdef illumos 1047 if (error = uiocopy(abuf->b_data, max_blksz, 1048 UIO_WRITE, uio, &cbytes)) { 1049 dmu_return_arcbuf(abuf); 1050 break; 1051 } 1052 ASSERT(cbytes == max_blksz); 1053#else 1054 ssize_t resid = uio->uio_resid; 1055 error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio); 1056 if (error != 0) { 1057 uio->uio_offset -= resid - uio->uio_resid; 1058 uio->uio_resid = resid; 1059 dmu_return_arcbuf(abuf); 1060 break; 1061 } 1062#endif 1063 } 1064 1065 /* 1066 * Start a transaction. 1067 */ 1068 tx = dmu_tx_create(zfsvfs->z_os); 1069 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1070 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 1071 zfs_sa_upgrade_txholds(tx, zp); 1072 error = dmu_tx_assign(tx, TXG_WAIT); 1073 if (error) { 1074 dmu_tx_abort(tx); 1075 if (abuf != NULL) 1076 dmu_return_arcbuf(abuf); 1077 break; 1078 } 1079 1080 /* 1081 * If zfs_range_lock() over-locked we grow the blocksize 1082 * and then reduce the lock range. This will only happen 1083 * on the first iteration since zfs_range_reduce() will 1084 * shrink down r_len to the appropriate size. 1085 */ 1086 if (rl->r_len == UINT64_MAX) { 1087 uint64_t new_blksz; 1088 1089 if (zp->z_blksz > max_blksz) { 1090 /* 1091 * File's blocksize is already larger than the 1092 * "recordsize" property. Only let it grow to 1093 * the next power of 2. 1094 */ 1095 ASSERT(!ISP2(zp->z_blksz)); 1096 new_blksz = MIN(end_size, 1097 1 << highbit64(zp->z_blksz)); 1098 } else { 1099 new_blksz = MIN(end_size, max_blksz); 1100 } 1101 zfs_grow_blocksize(zp, new_blksz, tx); 1102 zfs_range_reduce(rl, woff, n); 1103 } 1104 1105 /* 1106 * XXX - should we really limit each write to z_max_blksz? 1107 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 
1108 */ 1109 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 1110 1111 if (woff + nbytes > zp->z_size) 1112 vnode_pager_setsize(vp, woff + nbytes); 1113 1114 if (abuf == NULL) { 1115 tx_bytes = uio->uio_resid; 1116 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), 1117 uio, nbytes, tx); 1118 tx_bytes -= uio->uio_resid; 1119 } else { 1120 tx_bytes = nbytes; 1121 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); 1122 /* 1123 * If this is not a full block write, but we are 1124 * extending the file past EOF and this data starts 1125 * block-aligned, use assign_arcbuf(). Otherwise, 1126 * write via dmu_write(). 1127 */ 1128 if (tx_bytes < max_blksz && (!write_eof || 1129 aiov->iov_base != abuf->b_data)) { 1130 ASSERT(xuio); 1131 dmu_write(zfsvfs->z_os, zp->z_id, woff, 1132 aiov->iov_len, aiov->iov_base, tx); 1133 dmu_return_arcbuf(abuf); 1134 xuio_stat_wbuf_copied(); 1135 } else { 1136 ASSERT(xuio || tx_bytes == max_blksz); 1137 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), 1138 woff, abuf, tx); 1139 } 1140#ifdef illumos 1141 ASSERT(tx_bytes <= uio->uio_resid); 1142 uioskip(uio, tx_bytes); 1143#endif 1144 } 1145 if (tx_bytes && vn_has_cached_data(vp)) { 1146 update_pages(vp, woff, tx_bytes, zfsvfs->z_os, 1147 zp->z_id, uio->uio_segflg, tx); 1148 } 1149 1150 /* 1151 * If we made no progress, we're done. If we made even 1152 * partial progress, update the znode and ZIL accordingly. 1153 */ 1154 if (tx_bytes == 0) { 1155 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 1156 (void *)&zp->z_size, sizeof (uint64_t), tx); 1157 dmu_tx_commit(tx); 1158 ASSERT(error != 0); 1159 break; 1160 } 1161 1162 /* 1163 * Clear Set-UID/Set-GID bits on successful write if not 1164 * privileged and at least one of the excute bits is set. 1165 * 1166 * It would be nice to to this after all writes have 1167 * been done, but that would still expose the ISUID/ISGID 1168 * to another app after the partial write is committed. 
1169 * 1170 * Note: we don't call zfs_fuid_map_id() here because 1171 * user 0 is not an ephemeral uid. 1172 */ 1173 mutex_enter(&zp->z_acl_lock); 1174 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | 1175 (S_IXUSR >> 6))) != 0 && 1176 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && 1177 secpolicy_vnode_setid_retain(vp, cr, 1178 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { 1179 uint64_t newmode; 1180 zp->z_mode &= ~(S_ISUID | S_ISGID); 1181 newmode = zp->z_mode; 1182 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), 1183 (void *)&newmode, sizeof (uint64_t), tx); 1184 } 1185 mutex_exit(&zp->z_acl_lock); 1186 1187 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 1188 B_TRUE); 1189 1190 /* 1191 * Update the file size (zp_size) if it has changed; 1192 * account for possible concurrent updates. 1193 */ 1194 while ((end_size = zp->z_size) < uio->uio_loffset) { 1195 (void) atomic_cas_64(&zp->z_size, end_size, 1196 uio->uio_loffset); 1197#ifdef illumos 1198 ASSERT(error == 0); 1199#else 1200 ASSERT(error == 0 || error == EFAULT); 1201#endif 1202 } 1203 /* 1204 * If we are replaying and eof is non zero then force 1205 * the file size to the specified eof. Note, there's no 1206 * concurrency during replay. 1207 */ 1208 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) 1209 zp->z_size = zfsvfs->z_replay_eof; 1210 1211 if (error == 0) 1212 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1213 else 1214 (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1215 1216 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 1217 dmu_tx_commit(tx); 1218 1219 if (error != 0) 1220 break; 1221 ASSERT(tx_bytes == nbytes); 1222 n -= nbytes; 1223 1224#ifdef illumos 1225 if (!xuio && n > 0) 1226 uio_prefaultpages(MIN(n, max_blksz), uio); 1227#endif 1228 } 1229 1230 zfs_range_unlock(rl); 1231 1232 /* 1233 * If we're in replay mode, or we made no progress, return error. 1234 * Otherwise, it's at least a partial write, so it's successful. 
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef __FreeBSD__
	/*
	 * EFAULT means that at least one page of the source buffer was not
	 * available.  VFS will re-try remaining I/O upon this error.
	 */
	if (error == EFAULT) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
#endif

	/*
	 * Partial progress counts as success; commit the ZIL now if the
	 * caller asked for synchronous semantics or the dataset is
	 * sync=always.
	 */
	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Completion callback for zfs_get_data()/dmu_sync().  Releases the dbuf
 * (if held) and the range lock acquired in zfs_get_data(), drops the
 * vnode hold, records the newly written block in the ZIL on success,
 * and frees the zgd.
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
/* Debug knob: force the next indirect-write fill to fail with EIO. */
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 *
 * Called when the ZIL commits a synchronous write.  For an immediate
 * write (buf != NULL) the file data is copied into the log record
 * itself; for an indirect write the dirty block is pushed out via
 * dmu_sync() and only a block pointer is logged.  Returns 0 on success
 * (indirect writes complete asynchronously in zfs_get_done()), ENOENT
 * if the file or the logged range has since gone away, or an I/O error.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) :
			    offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			/* Blocksize changed under us; drop lock and retry. */
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			/*
			 * EALREADY means the block has already been synced
			 * by the DMU; log a TX_WRITE2 record, which simply
			 * references the existing block.
			 */
			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

/*
 * Check access permissions on a znode (VOP_ACCESS).  If V_ACE_MASK is
 * set in 'flag', 'mode' carries ACE access bits; otherwise traditional
 * rwx bits are checked via zfs_zaccess_rwx().
 */
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * vn_vget_ino_gen() callback used by zfs_lookup_lock() for ".."
 * lookups: locks the already-referenced parent vnode passed in 'arg',
 * dropping the reference on lock failure.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int error;

	*vpp = arg;
	error = vn_lock(*vpp, lkflags);
	if (error != 0)
		vrele(*vpp);
	return (error);
}

/*
 * Lock the vnode 'vp' produced by a lookup of 'name' in directory
 * 'dvp', observing the VFS lock-order rules for "", ".", ".." and
 * ordinary entries.  On failure the reference on the vnode is dropped.
 * Returns 0 or an errno value.
 */
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		/* "" or "." - the entry is the directory itself. */
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relock for the "." case could leave us with
			 * reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		/* Ordinary entry: plain lock, drop the hold on failure. */
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 *	IN:	dvp	- vnode of directory to search.
 *		nm	- name of entry to lookup.
 *		pnp	- full pathname to lookup [UNUSED].
 *		flags	- LOOKUP_XATTR set if looking for an attribute.
 *		rdir	- root directory vnode [UNUSED].
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	znode_t *zp;
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error = 0;

	/*
	 * Fast path lookup, however we must skip DNLC lookup
	 * for case folding or normalizing lookups because the
	 * DNLC code only stores the passed in name.  This means
	 * creating 'a' and removing 'A' on a case insensitive
	 * file system would work, but DNLC still thinks 'a'
	 * exists and won't let you create it again on the next
	 * pass through fast path.
	 */
	if (!(flags & LOOKUP_XATTR)) {
		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			/* Znode has already been torn down. */
			return (SET_ERROR(EIO));
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
#endif

		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */
		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			vrele(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Check accessibility of directory.
	 */
	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}


	/*
	 * First handle the special cases.
	 */
	if ((cnp->cn_flags & ISDOTDOT) != 0) {
		/*
		 * If we are a snapshot mounted under .zfs, return
		 * the vp for the snapshot directory.
		 */
		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
			struct componentname cn;
			vnode_t *zfsctl_vp;
			int ltype;

			ZFS_EXIT(zfsvfs);
			ltype = VOP_ISLOCKED(dvp);
			VOP_UNLOCK(dvp, 0);
			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
			    &zfsctl_vp);
			if (error == 0) {
				cn.cn_nameptr = "snapshot";
				cn.cn_namelen = strlen(cn.cn_nameptr);
				cn.cn_nameiop = cnp->cn_nameiop;
				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
				cn.cn_lkflags = cnp->cn_lkflags;
				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
				vput(zfsctl_vp);
			}
			vn_lock(dvp, ltype | LK_RETRY);
			return (error);
		}
	}
	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
		ZFS_EXIT(zfsvfs);
		/* .zfs is virtual; creating/renaming it is not supported. */
		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
			return (SET_ERROR(ENOTSUP));
		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
		return (error);
	}

	/*
	 * The loop retries the lookup if the parent-child relationship
	 * changes during the dot-dot locking complexities.
	 */
	for (;;) {
		uint64_t parent;

		error = zfs_dirlook(zdp, nm, &zp);
		if (error == 0)
			*vpp = ZTOV(zp);

		ZFS_EXIT(zfsvfs);
		if (error != 0)
			break;

		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
		if (error != 0) {
			/*
			 * If we've got a locking error, then the vnode
			 * got reclaimed because of a force unmount.
			 * We never enter doomed vnodes into the name cache.
			 */
			*vpp = NULL;
			return (error);
		}

		if ((cnp->cn_flags & ISDOTDOT) == 0)
			break;

		/*
		 * Dot-dot case: re-check that zp is still the parent of
		 * zdp now that both vnodes are locked; retry otherwise.
		 */
		ZFS_ENTER(zfsvfs);
		if (zdp->z_sa_hdl == NULL) {
			error = SET_ERROR(EIO);
		} else {
			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
			    &parent, sizeof (parent));
		}
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			vput(ZTOV(zp));
			break;
		}
		if (zp->z_id == parent) {
			ZFS_EXIT(zfsvfs);
			break;
		}
		vput(ZTOV(zp));
	}

/* NOTE(review): nothing in the visible code jumps to this label. */
out:
	if (error != 0)
		*vpp = NULL;

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}

	/* Insert name into cache (as non-existent) if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(dvp, NULL, cnp);

	/* Insert name into cache if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}

	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 *	IN:	dvp	- vnode of directory to put new file entry in.
 *		name	- name of new file entry.
 *		vap	- attributes of new file.
 *		excl	- flag indicating exclusive or non-exclusive mode.
 *		mode	- mode to open file with.
 *		cr	- credentials of caller.
 *		flag	- large file flag [UNUSED].
 *		ct	- caller context
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created or trunc'd entry.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	void		*vsecp = NULL;	/* no ACL is passed on FreeBSD */
	int		flag = 0;
	uint64_t	txtype;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	*vpp = NULL;

	/* Strip the sticky bit unless the caller is privileged. */
	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	/* ZNEW: fail with EEXIST if the name already exists. */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		goto out;
	}

	/*
	 * We only support the creation of regular files in
	 * extended attribute directories.
	 */

	if ((dzp->z_pflags & ZFS_XATTR) &&
	    (vap->va_type != VREG)) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids)) != 0)
		goto out;

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	getnewvnode_reserve(1);

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
	    vsecp, acl_ids.z_fuidp, vap);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

out:
	if (error == 0) {
		*vpp = ZTOV(zp);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 *	IN:	dvp	- vnode of directory to remove entry from.
 *		name	- name of entry to remove.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */

/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	znode_t		*xzp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	/* NOTE(review): acl_obj and toobig are not used in this revision. */
	uint64_t	acl_obj, xattr_obj;
	uint64_t	obj = 0;
	dmu_tx_t	*tx;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;
	zp = VTOZ(vp);	/* redundant; zp already initialized above */

	xattr_obj = 0;
	xzp = NULL;

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not a parameter of this FreeBSD signature;
	 * presumably vnevent_remove() is a no-op macro here so the name is
	 * never evaluated -- confirm against sys/vnode.h.
	 */
	vnevent_remove(vp, dvp, name, ct);

	obj = zp->z_id;

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
	}

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);

	if (xzp) {
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Last link removed: park the znode on the unlinked set
		 * and stop syncing its pages.
		 */
		zfs_unlinked_add(zp, tx);
		vp->v_vflag |= VV_NOSYNC;
	}

	txtype = TX_REMOVE;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
out:

	if (xzp)
		vrele(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 *	IN:	dvp	- vnode of directory to add subdir to.
 *		dirname	- name of new directory.
 *		vap	- attributes of new directory.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created directory.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    ((vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Directories may not be created inside an xattr directory. */
	if (dzp->z_pflags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
	*vpp = NULL;

	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 *	IN:	dvp	- vnode of directory to remove from.
 *		name	- name of directory to be removed.
 *		cwd	- vnode of current working directory.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;


	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not a parameter of this FreeBSD signature;
	 * presumably vnevent_rmdir() is a no-op macro here so the name is
	 * never evaluated -- confirm against sys/vnode.h.
	 */
	vnevent_rmdir(vp, dvp, name, ct);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	cache_purge(dvp);

	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
	}

	dmu_tx_commit(tx);

	cache_purge(vp);
out:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 *	IN:	vp	- vnode of directory to read.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *		eofp	- set to true if end-of-file detected.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
2309 */ 2310/* ARGSUSED */ 2311static int 2312zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) 2313{ 2314 znode_t *zp = VTOZ(vp); 2315 iovec_t *iovp; 2316 edirent_t *eodp; 2317 dirent64_t *odp; 2318 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2319 objset_t *os; 2320 caddr_t outbuf; 2321 size_t bufsize; 2322 zap_cursor_t zc; 2323 zap_attribute_t zap; 2324 uint_t bytes_wanted; 2325 uint64_t offset; /* must be unsigned; checks for < 1 */ 2326 uint64_t parent; 2327 int local_eof; 2328 int outcount; 2329 int error; 2330 uint8_t prefetch; 2331 boolean_t check_sysattrs; 2332 uint8_t type; 2333 int ncooks; 2334 u_long *cooks = NULL; 2335 int flags = 0; 2336 2337 ZFS_ENTER(zfsvfs); 2338 ZFS_VERIFY_ZP(zp); 2339 2340 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 2341 &parent, sizeof (parent))) != 0) { 2342 ZFS_EXIT(zfsvfs); 2343 return (error); 2344 } 2345 2346 /* 2347 * If we are not given an eof variable, 2348 * use a local one. 2349 */ 2350 if (eofp == NULL) 2351 eofp = &local_eof; 2352 2353 /* 2354 * Check for valid iov_len. 2355 */ 2356 if (uio->uio_iov->iov_len <= 0) { 2357 ZFS_EXIT(zfsvfs); 2358 return (SET_ERROR(EINVAL)); 2359 } 2360 2361 /* 2362 * Quit if directory has been removed (posix) 2363 */ 2364 if ((*eofp = zp->z_unlinked) != 0) { 2365 ZFS_EXIT(zfsvfs); 2366 return (0); 2367 } 2368 2369 error = 0; 2370 os = zfsvfs->z_os; 2371 offset = uio->uio_loffset; 2372 prefetch = zp->z_zn_prefetch; 2373 2374 /* 2375 * Initialize the iterator cursor. 2376 */ 2377 if (offset <= 3) { 2378 /* 2379 * Start iteration from the beginning of the directory. 2380 */ 2381 zap_cursor_init(&zc, os, zp->z_id); 2382 } else { 2383 /* 2384 * The offset is a serialized cursor. 2385 */ 2386 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 2387 } 2388 2389 /* 2390 * Get space to change directory entries into fs independent format. 
2391 */ 2392 iovp = uio->uio_iov; 2393 bytes_wanted = iovp->iov_len; 2394 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 2395 bufsize = bytes_wanted; 2396 outbuf = kmem_alloc(bufsize, KM_SLEEP); 2397 odp = (struct dirent64 *)outbuf; 2398 } else { 2399 bufsize = bytes_wanted; 2400 outbuf = NULL; 2401 odp = (struct dirent64 *)iovp->iov_base; 2402 } 2403 eodp = (struct edirent *)odp; 2404 2405 if (ncookies != NULL) { 2406 /* 2407 * Minimum entry size is dirent size and 1 byte for a file name. 2408 */ 2409 ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); 2410 cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); 2411 *cookies = cooks; 2412 *ncookies = ncooks; 2413 } 2414 /* 2415 * If this VFS supports the system attribute view interface; and 2416 * we're looking at an extended attribute directory; and we care 2417 * about normalization conflicts on this vfs; then we must check 2418 * for normalization conflicts with the sysattr name space. 2419 */ 2420#ifdef TODO 2421 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 2422 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 2423 (flags & V_RDDIR_ENTFLAGS); 2424#else 2425 check_sysattrs = 0; 2426#endif 2427 2428 /* 2429 * Transform to file-system independent format 2430 */ 2431 outcount = 0; 2432 while (outcount < bytes_wanted) { 2433 ino64_t objnum; 2434 ushort_t reclen; 2435 off64_t *next = NULL; 2436 2437 /* 2438 * Special case `.', `..', and `.zfs'. 
2439 */ 2440 if (offset == 0) { 2441 (void) strcpy(zap.za_name, "."); 2442 zap.za_normalization_conflict = 0; 2443 objnum = zp->z_id; 2444 type = DT_DIR; 2445 } else if (offset == 1) { 2446 (void) strcpy(zap.za_name, ".."); 2447 zap.za_normalization_conflict = 0; 2448 objnum = parent; 2449 type = DT_DIR; 2450 } else if (offset == 2 && zfs_show_ctldir(zp)) { 2451 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 2452 zap.za_normalization_conflict = 0; 2453 objnum = ZFSCTL_INO_ROOT; 2454 type = DT_DIR; 2455 } else { 2456 /* 2457 * Grab next entry. 2458 */ 2459 if (error = zap_cursor_retrieve(&zc, &zap)) { 2460 if ((*eofp = (error == ENOENT)) != 0) 2461 break; 2462 else 2463 goto update; 2464 } 2465 2466 if (zap.za_integer_length != 8 || 2467 zap.za_num_integers != 1) { 2468 cmn_err(CE_WARN, "zap_readdir: bad directory " 2469 "entry, obj = %lld, offset = %lld\n", 2470 (u_longlong_t)zp->z_id, 2471 (u_longlong_t)offset); 2472 error = SET_ERROR(ENXIO); 2473 goto update; 2474 } 2475 2476 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 2477 /* 2478 * MacOS X can extract the object type here such as: 2479 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2480 */ 2481 type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2482 2483 if (check_sysattrs && !zap.za_normalization_conflict) { 2484#ifdef TODO 2485 zap.za_normalization_conflict = 2486 xattr_sysattr_casechk(zap.za_name); 2487#else 2488 panic("%s:%u: TODO", __func__, __LINE__); 2489#endif 2490 } 2491 } 2492 2493 if (flags & V_RDDIR_ACCFILTER) { 2494 /* 2495 * If we have no access at all, don't include 2496 * this entry in the returned information 2497 */ 2498 znode_t *ezp; 2499 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) 2500 goto skip_entry; 2501 if (!zfs_has_access(ezp, cr)) { 2502 vrele(ZTOV(ezp)); 2503 goto skip_entry; 2504 } 2505 vrele(ZTOV(ezp)); 2506 } 2507 2508 if (flags & V_RDDIR_ENTFLAGS) 2509 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 2510 else 2511 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 2512 2513 /* 
2514 * Will this entry fit in the buffer? 2515 */ 2516 if (outcount + reclen > bufsize) { 2517 /* 2518 * Did we manage to fit anything in the buffer? 2519 */ 2520 if (!outcount) { 2521 error = SET_ERROR(EINVAL); 2522 goto update; 2523 } 2524 break; 2525 } 2526 if (flags & V_RDDIR_ENTFLAGS) { 2527 /* 2528 * Add extended flag entry: 2529 */ 2530 eodp->ed_ino = objnum; 2531 eodp->ed_reclen = reclen; 2532 /* NOTE: ed_off is the offset for the *next* entry */ 2533 next = &(eodp->ed_off); 2534 eodp->ed_eflags = zap.za_normalization_conflict ? 2535 ED_CASE_CONFLICT : 0; 2536 (void) strncpy(eodp->ed_name, zap.za_name, 2537 EDIRENT_NAMELEN(reclen)); 2538 eodp = (edirent_t *)((intptr_t)eodp + reclen); 2539 } else { 2540 /* 2541 * Add normal entry: 2542 */ 2543 odp->d_ino = objnum; 2544 odp->d_reclen = reclen; 2545 odp->d_namlen = strlen(zap.za_name); 2546 (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); 2547 odp->d_type = type; 2548 odp = (dirent64_t *)((intptr_t)odp + reclen); 2549 } 2550 outcount += reclen; 2551 2552 ASSERT(outcount <= bufsize); 2553 2554 /* Prefetch znode */ 2555 if (prefetch) 2556 dmu_prefetch(os, objnum, 0, 0, 0, 2557 ZIO_PRIORITY_SYNC_READ); 2558 2559 skip_entry: 2560 /* 2561 * Move to the next entry, fill in the previous offset. 
2562 */ 2563 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2564 zap_cursor_advance(&zc); 2565 offset = zap_cursor_serialize(&zc); 2566 } else { 2567 offset += 1; 2568 } 2569 2570 if (cooks != NULL) { 2571 *cooks++ = offset; 2572 ncooks--; 2573 KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); 2574 } 2575 } 2576 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2577 2578 /* Subtract unused cookies */ 2579 if (ncookies != NULL) 2580 *ncookies -= ncooks; 2581 2582 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 2583 iovp->iov_base += outcount; 2584 iovp->iov_len -= outcount; 2585 uio->uio_resid -= outcount; 2586 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 2587 /* 2588 * Reset the pointer. 2589 */ 2590 offset = uio->uio_loffset; 2591 } 2592 2593update: 2594 zap_cursor_fini(&zc); 2595 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) 2596 kmem_free(outbuf, bufsize); 2597 2598 if (error == ENOENT) 2599 error = 0; 2600 2601 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2602 2603 uio->uio_loffset = offset; 2604 ZFS_EXIT(zfsvfs); 2605 if (error != 0 && cookies != NULL) { 2606 free(*cookies, M_TEMP); 2607 *cookies = NULL; 2608 *ncookies = 0; 2609 } 2610 return (error); 2611} 2612 2613ulong_t zfs_fsync_sync_cnt = 4; 2614 2615static int 2616zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 2617{ 2618 znode_t *zp = VTOZ(vp); 2619 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2620 2621 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); 2622 2623 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 2624 ZFS_ENTER(zfsvfs); 2625 ZFS_VERIFY_ZP(zp); 2626 zil_commit(zfsvfs->z_log, zp->z_id); 2627 ZFS_EXIT(zfsvfs); 2628 } 2629 return (0); 2630} 2631 2632 2633/* 2634 * Get the requested file attributes and place them in the provided 2635 * vattr structure. 2636 * 2637 * IN: vp - vnode of file. 2638 * vap - va_mask identifies requested attributes. 
2639 * If AT_XVATTR set, then optional attrs are requested 2640 * flags - ATTR_NOACLCHECK (CIFS server context) 2641 * cr - credentials of caller. 2642 * ct - caller context 2643 * 2644 * OUT: vap - attribute values. 2645 * 2646 * RETURN: 0 (always succeeds). 2647 */ 2648/* ARGSUSED */ 2649static int 2650zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2651 caller_context_t *ct) 2652{ 2653 znode_t *zp = VTOZ(vp); 2654 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2655 int error = 0; 2656 uint32_t blksize; 2657 u_longlong_t nblocks; 2658 uint64_t links; 2659 uint64_t mtime[2], ctime[2], crtime[2], rdev; 2660 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2661 xoptattr_t *xoap = NULL; 2662 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2663 sa_bulk_attr_t bulk[4]; 2664 int count = 0; 2665 2666 ZFS_ENTER(zfsvfs); 2667 ZFS_VERIFY_ZP(zp); 2668 2669 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); 2670 2671 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 2672 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 2673 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); 2674 if (vp->v_type == VBLK || vp->v_type == VCHR) 2675 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, 2676 &rdev, 8); 2677 2678 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { 2679 ZFS_EXIT(zfsvfs); 2680 return (error); 2681 } 2682 2683 /* 2684 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 2685 * Also, if we are the owner don't bother, since owner should 2686 * always be allowed to read basic attributes of file. 2687 */ 2688 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && 2689 (vap->va_uid != crgetuid(cr))) { 2690 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, 2691 skipaclchk, cr)) { 2692 ZFS_EXIT(zfsvfs); 2693 return (error); 2694 } 2695 } 2696 2697 /* 2698 * Return all attributes. 
It's cheaper to provide the answer 2699 * than to determine whether we were asked the question. 2700 */ 2701 2702 vap->va_type = IFTOVT(zp->z_mode); 2703 vap->va_mode = zp->z_mode & ~S_IFMT; 2704#ifdef illumos 2705 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 2706#else 2707 vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; 2708#endif 2709 vap->va_nodeid = zp->z_id; 2710 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) 2711 links = zp->z_links + 1; 2712 else 2713 links = zp->z_links; 2714 vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */ 2715 vap->va_size = zp->z_size; 2716#ifdef illumos 2717 vap->va_rdev = vp->v_rdev; 2718#else 2719 if (vp->v_type == VBLK || vp->v_type == VCHR) 2720 vap->va_rdev = zfs_cmpldev(rdev); 2721#endif 2722 vap->va_seq = zp->z_seq; 2723 vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ 2724 vap->va_filerev = zp->z_seq; 2725 2726 /* 2727 * Add in any requested optional attributes and the create time. 2728 * Also set the corresponding bits in the returned attribute bitmap. 
2729 */ 2730 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { 2731 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 2732 xoap->xoa_archive = 2733 ((zp->z_pflags & ZFS_ARCHIVE) != 0); 2734 XVA_SET_RTN(xvap, XAT_ARCHIVE); 2735 } 2736 2737 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 2738 xoap->xoa_readonly = 2739 ((zp->z_pflags & ZFS_READONLY) != 0); 2740 XVA_SET_RTN(xvap, XAT_READONLY); 2741 } 2742 2743 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 2744 xoap->xoa_system = 2745 ((zp->z_pflags & ZFS_SYSTEM) != 0); 2746 XVA_SET_RTN(xvap, XAT_SYSTEM); 2747 } 2748 2749 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 2750 xoap->xoa_hidden = 2751 ((zp->z_pflags & ZFS_HIDDEN) != 0); 2752 XVA_SET_RTN(xvap, XAT_HIDDEN); 2753 } 2754 2755 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2756 xoap->xoa_nounlink = 2757 ((zp->z_pflags & ZFS_NOUNLINK) != 0); 2758 XVA_SET_RTN(xvap, XAT_NOUNLINK); 2759 } 2760 2761 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2762 xoap->xoa_immutable = 2763 ((zp->z_pflags & ZFS_IMMUTABLE) != 0); 2764 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 2765 } 2766 2767 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2768 xoap->xoa_appendonly = 2769 ((zp->z_pflags & ZFS_APPENDONLY) != 0); 2770 XVA_SET_RTN(xvap, XAT_APPENDONLY); 2771 } 2772 2773 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2774 xoap->xoa_nodump = 2775 ((zp->z_pflags & ZFS_NODUMP) != 0); 2776 XVA_SET_RTN(xvap, XAT_NODUMP); 2777 } 2778 2779 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 2780 xoap->xoa_opaque = 2781 ((zp->z_pflags & ZFS_OPAQUE) != 0); 2782 XVA_SET_RTN(xvap, XAT_OPAQUE); 2783 } 2784 2785 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2786 xoap->xoa_av_quarantined = 2787 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); 2788 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 2789 } 2790 2791 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2792 xoap->xoa_av_modified = 2793 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); 2794 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 2795 } 2796 2797 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && 2798 vp->v_type == VREG) { 2799 
zfs_sa_get_scanstamp(zp, xvap); 2800 } 2801 2802 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2803 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); 2804 XVA_SET_RTN(xvap, XAT_REPARSE); 2805 } 2806 if (XVA_ISSET_REQ(xvap, XAT_GEN)) { 2807 xoap->xoa_generation = zp->z_gen; 2808 XVA_SET_RTN(xvap, XAT_GEN); 2809 } 2810 2811 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { 2812 xoap->xoa_offline = 2813 ((zp->z_pflags & ZFS_OFFLINE) != 0); 2814 XVA_SET_RTN(xvap, XAT_OFFLINE); 2815 } 2816 2817 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { 2818 xoap->xoa_sparse = 2819 ((zp->z_pflags & ZFS_SPARSE) != 0); 2820 XVA_SET_RTN(xvap, XAT_SPARSE); 2821 } 2822 } 2823 2824 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); 2825 ZFS_TIME_DECODE(&vap->va_mtime, mtime); 2826 ZFS_TIME_DECODE(&vap->va_ctime, ctime); 2827 ZFS_TIME_DECODE(&vap->va_birthtime, crtime); 2828 2829 2830 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 2831 vap->va_blksize = blksize; 2832 vap->va_bytes = nblocks << 9; /* nblocks * 512 */ 2833 2834 if (zp->z_blksz == 0) { 2835 /* 2836 * Block size hasn't been set; suggest maximal I/O transfers. 2837 */ 2838 vap->va_blksize = zfsvfs->z_max_blksz; 2839 } 2840 2841 ZFS_EXIT(zfsvfs); 2842 return (0); 2843} 2844 2845/* 2846 * Set the file attributes to the values contained in the 2847 * vattr structure. 2848 * 2849 * IN: vp - vnode of file to be modified. 2850 * vap - new attribute values. 2851 * If AT_XVATTR set, then optional attrs are being set 2852 * flags - ATTR_UTIME set if non-default time values provided. 2853 * - ATTR_NOACLCHECK (CIFS context only). 2854 * cr - credentials of caller. 2855 * ct - caller context 2856 * 2857 * RETURN: 0 on success, error code on failure. 2858 * 2859 * Timestamps: 2860 * vp - ctime updated, mtime updated if size changed. 
2861 */ 2862/* ARGSUSED */ 2863static int 2864zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2865 caller_context_t *ct) 2866{ 2867 znode_t *zp = VTOZ(vp); 2868 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2869 zilog_t *zilog; 2870 dmu_tx_t *tx; 2871 vattr_t oldva; 2872 xvattr_t tmpxvattr; 2873 uint_t mask = vap->va_mask; 2874 uint_t saved_mask = 0; 2875 uint64_t saved_mode; 2876 int trim_mask = 0; 2877 uint64_t new_mode; 2878 uint64_t new_uid, new_gid; 2879 uint64_t xattr_obj; 2880 uint64_t mtime[2], ctime[2]; 2881 znode_t *attrzp; 2882 int need_policy = FALSE; 2883 int err, err2; 2884 zfs_fuid_info_t *fuidp = NULL; 2885 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2886 xoptattr_t *xoap; 2887 zfs_acl_t *aclp; 2888 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2889 boolean_t fuid_dirtied = B_FALSE; 2890 sa_bulk_attr_t bulk[7], xattr_bulk[7]; 2891 int count = 0, xattr_count = 0; 2892 2893 if (mask == 0) 2894 return (0); 2895 2896 if (mask & AT_NOSET) 2897 return (SET_ERROR(EINVAL)); 2898 2899 ZFS_ENTER(zfsvfs); 2900 ZFS_VERIFY_ZP(zp); 2901 2902 zilog = zfsvfs->z_log; 2903 2904 /* 2905 * Make sure that if we have ephemeral uid/gid or xvattr specified 2906 * that file system is at proper version level 2907 */ 2908 2909 if (zfsvfs->z_use_fuids == B_FALSE && 2910 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 2911 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || 2912 (mask & AT_XVATTR))) { 2913 ZFS_EXIT(zfsvfs); 2914 return (SET_ERROR(EINVAL)); 2915 } 2916 2917 if (mask & AT_SIZE && vp->v_type == VDIR) { 2918 ZFS_EXIT(zfsvfs); 2919 return (SET_ERROR(EISDIR)); 2920 } 2921 2922 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { 2923 ZFS_EXIT(zfsvfs); 2924 return (SET_ERROR(EINVAL)); 2925 } 2926 2927 /* 2928 * If this is an xvattr_t, then get a pointer to the structure of 2929 * optional attributes. If this is NULL, then we have a vattr_t. 
2930 */ 2931 xoap = xva_getxoptattr(xvap); 2932 2933 xva_init(&tmpxvattr); 2934 2935 /* 2936 * Immutable files can only alter immutable bit and atime 2937 */ 2938 if ((zp->z_pflags & ZFS_IMMUTABLE) && 2939 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || 2940 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 2941 ZFS_EXIT(zfsvfs); 2942 return (SET_ERROR(EPERM)); 2943 } 2944 2945 /* 2946 * Note: ZFS_READONLY is handled in zfs_zaccess_common. 2947 */ 2948 2949 /* 2950 * Verify timestamps doesn't overflow 32 bits. 2951 * ZFS can handle large timestamps, but 32bit syscalls can't 2952 * handle times greater than 2039. This check should be removed 2953 * once large timestamps are fully supported. 2954 */ 2955 if (mask & (AT_ATIME | AT_MTIME)) { 2956 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2957 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2958 ZFS_EXIT(zfsvfs); 2959 return (SET_ERROR(EOVERFLOW)); 2960 } 2961 } 2962 if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && 2963 TIMESPEC_OVERFLOW(&vap->va_birthtime)) { 2964 ZFS_EXIT(zfsvfs); 2965 return (SET_ERROR(EOVERFLOW)); 2966 } 2967 2968 attrzp = NULL; 2969 aclp = NULL; 2970 2971 /* Can this be moved to before the top label? */ 2972 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 2973 ZFS_EXIT(zfsvfs); 2974 return (SET_ERROR(EROFS)); 2975 } 2976 2977 /* 2978 * First validate permissions 2979 */ 2980 2981 if (mask & AT_SIZE) { 2982 /* 2983 * XXX - Note, we are not providing any open 2984 * mode flags here (like FNDELAY), so we may 2985 * block if there are locks present... this 2986 * should be addressed in openat(). 2987 */ 2988 /* XXX - would it be OK to generate a log record here? 
*/ 2989 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2990 if (err) { 2991 ZFS_EXIT(zfsvfs); 2992 return (err); 2993 } 2994 } 2995 2996 if (mask & (AT_ATIME|AT_MTIME) || 2997 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2998 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2999 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 3000 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 3001 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 3002 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 3003 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 3004 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 3005 skipaclchk, cr); 3006 } 3007 3008 if (mask & (AT_UID|AT_GID)) { 3009 int idmask = (mask & (AT_UID|AT_GID)); 3010 int take_owner; 3011 int take_group; 3012 3013 /* 3014 * NOTE: even if a new mode is being set, 3015 * we may clear S_ISUID/S_ISGID bits. 3016 */ 3017 3018 if (!(mask & AT_MODE)) 3019 vap->va_mode = zp->z_mode; 3020 3021 /* 3022 * Take ownership or chgrp to group we are a member of 3023 */ 3024 3025 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 3026 take_group = (mask & AT_GID) && 3027 zfs_groupmember(zfsvfs, vap->va_gid, cr); 3028 3029 /* 3030 * If both AT_UID and AT_GID are set then take_owner and 3031 * take_group must both be set in order to allow taking 3032 * ownership. 
3033 * 3034 * Otherwise, send the check through secpolicy_vnode_setattr() 3035 * 3036 */ 3037 3038 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 3039 ((idmask == AT_UID) && take_owner) || 3040 ((idmask == AT_GID) && take_group)) { 3041 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 3042 skipaclchk, cr) == 0) { 3043 /* 3044 * Remove setuid/setgid for non-privileged users 3045 */ 3046 secpolicy_setid_clear(vap, vp, cr); 3047 trim_mask = (mask & (AT_UID|AT_GID)); 3048 } else { 3049 need_policy = TRUE; 3050 } 3051 } else { 3052 need_policy = TRUE; 3053 } 3054 } 3055 3056 oldva.va_mode = zp->z_mode; 3057 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 3058 if (mask & AT_XVATTR) { 3059 /* 3060 * Update xvattr mask to include only those attributes 3061 * that are actually changing. 3062 * 3063 * the bits will be restored prior to actually setting 3064 * the attributes so the caller thinks they were set. 3065 */ 3066 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 3067 if (xoap->xoa_appendonly != 3068 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 3069 need_policy = TRUE; 3070 } else { 3071 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 3072 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); 3073 } 3074 } 3075 3076 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 3077 if (xoap->xoa_nounlink != 3078 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 3079 need_policy = TRUE; 3080 } else { 3081 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 3082 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); 3083 } 3084 } 3085 3086 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 3087 if (xoap->xoa_immutable != 3088 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 3089 need_policy = TRUE; 3090 } else { 3091 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 3092 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); 3093 } 3094 } 3095 3096 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 3097 if (xoap->xoa_nodump != 3098 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 3099 need_policy = TRUE; 3100 } else { 3101 XVA_CLR_REQ(xvap, XAT_NODUMP); 3102 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); 3103 } 3104 } 3105 3106 if 
(XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 3107 if (xoap->xoa_av_modified != 3108 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 3109 need_policy = TRUE; 3110 } else { 3111 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 3112 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); 3113 } 3114 } 3115 3116 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 3117 if ((vp->v_type != VREG && 3118 xoap->xoa_av_quarantined) || 3119 xoap->xoa_av_quarantined != 3120 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 3121 need_policy = TRUE; 3122 } else { 3123 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 3124 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); 3125 } 3126 } 3127 3128 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 3129 ZFS_EXIT(zfsvfs); 3130 return (SET_ERROR(EPERM)); 3131 } 3132 3133 if (need_policy == FALSE && 3134 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 3135 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 3136 need_policy = TRUE; 3137 } 3138 } 3139 3140 if (mask & AT_MODE) { 3141 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 3142 err = secpolicy_setid_setsticky_clear(vp, vap, 3143 &oldva, cr); 3144 if (err) { 3145 ZFS_EXIT(zfsvfs); 3146 return (err); 3147 } 3148 trim_mask |= AT_MODE; 3149 } else { 3150 need_policy = TRUE; 3151 } 3152 } 3153 3154 if (need_policy) { 3155 /* 3156 * If trim_mask is set then take ownership 3157 * has been granted or write_acl is present and user 3158 * has the ability to modify mode. In that case remove 3159 * UID|GID and or MODE from mask so that 3160 * secpolicy_vnode_setattr() doesn't revoke it. 3161 */ 3162 3163 if (trim_mask) { 3164 saved_mask = vap->va_mask; 3165 vap->va_mask &= ~trim_mask; 3166 if (trim_mask & AT_MODE) { 3167 /* 3168 * Save the mode, as secpolicy_vnode_setattr() 3169 * will overwrite it with ova.va_mode. 
3170 */ 3171 saved_mode = vap->va_mode; 3172 } 3173 } 3174 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 3175 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 3176 if (err) { 3177 ZFS_EXIT(zfsvfs); 3178 return (err); 3179 } 3180 3181 if (trim_mask) { 3182 vap->va_mask |= saved_mask; 3183 if (trim_mask & AT_MODE) { 3184 /* 3185 * Recover the mode after 3186 * secpolicy_vnode_setattr(). 3187 */ 3188 vap->va_mode = saved_mode; 3189 } 3190 } 3191 } 3192 3193 /* 3194 * secpolicy_vnode_setattr, or take ownership may have 3195 * changed va_mask 3196 */ 3197 mask = vap->va_mask; 3198 3199 if ((mask & (AT_UID | AT_GID))) { 3200 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 3201 &xattr_obj, sizeof (xattr_obj)); 3202 3203 if (err == 0 && xattr_obj) { 3204 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); 3205 if (err == 0) { 3206 err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); 3207 if (err != 0) 3208 vrele(ZTOV(attrzp)); 3209 } 3210 if (err) 3211 goto out2; 3212 } 3213 if (mask & AT_UID) { 3214 new_uid = zfs_fuid_create(zfsvfs, 3215 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 3216 if (new_uid != zp->z_uid && 3217 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { 3218 if (attrzp) 3219 vput(ZTOV(attrzp)); 3220 err = SET_ERROR(EDQUOT); 3221 goto out2; 3222 } 3223 } 3224 3225 if (mask & AT_GID) { 3226 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, 3227 cr, ZFS_GROUP, &fuidp); 3228 if (new_gid != zp->z_gid && 3229 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { 3230 if (attrzp) 3231 vput(ZTOV(attrzp)); 3232 err = SET_ERROR(EDQUOT); 3233 goto out2; 3234 } 3235 } 3236 } 3237 tx = dmu_tx_create(zfsvfs->z_os); 3238 3239 if (mask & AT_MODE) { 3240 uint64_t pmode = zp->z_mode; 3241 uint64_t acl_obj; 3242 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 3243 3244 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && 3245 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 3246 err = SET_ERROR(EPERM); 3247 goto out; 3248 } 3249 3250 if (err = zfs_acl_chmod_setattr(zp, 
&aclp, new_mode)) 3251 goto out; 3252 3253 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 3254 /* 3255 * Are we upgrading ACL from old V0 format 3256 * to V1 format? 3257 */ 3258 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 3259 zfs_znode_acl_version(zp) == 3260 ZFS_ACL_VERSION_INITIAL) { 3261 dmu_tx_hold_free(tx, acl_obj, 0, 3262 DMU_OBJECT_END); 3263 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3264 0, aclp->z_acl_bytes); 3265 } else { 3266 dmu_tx_hold_write(tx, acl_obj, 0, 3267 aclp->z_acl_bytes); 3268 } 3269 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3270 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3271 0, aclp->z_acl_bytes); 3272 } 3273 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3274 } else { 3275 if ((mask & AT_XVATTR) && 3276 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3277 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3278 else 3279 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3280 } 3281 3282 if (attrzp) { 3283 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 3284 } 3285 3286 fuid_dirtied = zfsvfs->z_fuid_dirty; 3287 if (fuid_dirtied) 3288 zfs_fuid_txhold(zfsvfs, tx); 3289 3290 zfs_sa_upgrade_txholds(tx, zp); 3291 3292 err = dmu_tx_assign(tx, TXG_WAIT); 3293 if (err) 3294 goto out; 3295 3296 count = 0; 3297 /* 3298 * Set each attribute requested. 3299 * We group settings according to the locks they need to acquire. 3300 * 3301 * Note: you cannot set ctime directly, although it will be 3302 * updated as a side-effect of calling this function. 
3303 */ 3304 3305 if (mask & (AT_UID|AT_GID|AT_MODE)) 3306 mutex_enter(&zp->z_acl_lock); 3307 3308 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 3309 &zp->z_pflags, sizeof (zp->z_pflags)); 3310 3311 if (attrzp) { 3312 if (mask & (AT_UID|AT_GID|AT_MODE)) 3313 mutex_enter(&attrzp->z_acl_lock); 3314 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3315 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 3316 sizeof (attrzp->z_pflags)); 3317 } 3318 3319 if (mask & (AT_UID|AT_GID)) { 3320 3321 if (mask & AT_UID) { 3322 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 3323 &new_uid, sizeof (new_uid)); 3324 zp->z_uid = new_uid; 3325 if (attrzp) { 3326 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3327 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 3328 sizeof (new_uid)); 3329 attrzp->z_uid = new_uid; 3330 } 3331 } 3332 3333 if (mask & AT_GID) { 3334 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 3335 NULL, &new_gid, sizeof (new_gid)); 3336 zp->z_gid = new_gid; 3337 if (attrzp) { 3338 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3339 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 3340 sizeof (new_gid)); 3341 attrzp->z_gid = new_gid; 3342 } 3343 } 3344 if (!(mask & AT_MODE)) { 3345 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 3346 NULL, &new_mode, sizeof (new_mode)); 3347 new_mode = zp->z_mode; 3348 } 3349 err = zfs_acl_chown_setattr(zp); 3350 ASSERT(err == 0); 3351 if (attrzp) { 3352 err = zfs_acl_chown_setattr(attrzp); 3353 ASSERT(err == 0); 3354 } 3355 } 3356 3357 if (mask & AT_MODE) { 3358 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 3359 &new_mode, sizeof (new_mode)); 3360 zp->z_mode = new_mode; 3361 ASSERT3U((uintptr_t)aclp, !=, 0); 3362 err = zfs_aclset_common(zp, aclp, cr, tx); 3363 ASSERT0(err); 3364 if (zp->z_acl_cached) 3365 zfs_acl_free(zp->z_acl_cached); 3366 zp->z_acl_cached = aclp; 3367 aclp = NULL; 3368 } 3369 3370 3371 if (mask & AT_ATIME) { 3372 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); 3373 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 3374 
&zp->z_atime, sizeof (zp->z_atime)); 3375 } 3376 3377 if (mask & AT_MTIME) { 3378 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 3379 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 3380 mtime, sizeof (mtime)); 3381 } 3382 3383 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ 3384 if (mask & AT_SIZE && !(mask & AT_MTIME)) { 3385 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 3386 NULL, mtime, sizeof (mtime)); 3387 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3388 &ctime, sizeof (ctime)); 3389 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 3390 B_TRUE); 3391 } else if (mask != 0) { 3392 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3393 &ctime, sizeof (ctime)); 3394 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, 3395 B_TRUE); 3396 if (attrzp) { 3397 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3398 SA_ZPL_CTIME(zfsvfs), NULL, 3399 &ctime, sizeof (ctime)); 3400 zfs_tstamp_update_setup(attrzp, STATE_CHANGED, 3401 mtime, ctime, B_TRUE); 3402 } 3403 } 3404 /* 3405 * Do this after setting timestamps to prevent timestamp 3406 * update from toggling bit 3407 */ 3408 3409 if (xoap && (mask & AT_XVATTR)) { 3410 3411 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) 3412 xoap->xoa_createtime = vap->va_birthtime; 3413 /* 3414 * restore trimmed off masks 3415 * so that return masks can be set for caller. 
3416 */ 3417 3418 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { 3419 XVA_SET_REQ(xvap, XAT_APPENDONLY); 3420 } 3421 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { 3422 XVA_SET_REQ(xvap, XAT_NOUNLINK); 3423 } 3424 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { 3425 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 3426 } 3427 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { 3428 XVA_SET_REQ(xvap, XAT_NODUMP); 3429 } 3430 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { 3431 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 3432 } 3433 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { 3434 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 3435 } 3436 3437 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3438 ASSERT(vp->v_type == VREG); 3439 3440 zfs_xvattr_set(zp, xvap, tx); 3441 } 3442 3443 if (fuid_dirtied) 3444 zfs_fuid_sync(zfsvfs, tx); 3445 3446 if (mask != 0) 3447 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 3448 3449 if (mask & (AT_UID|AT_GID|AT_MODE)) 3450 mutex_exit(&zp->z_acl_lock); 3451 3452 if (attrzp) { 3453 if (mask & (AT_UID|AT_GID|AT_MODE)) 3454 mutex_exit(&attrzp->z_acl_lock); 3455 } 3456out: 3457 if (err == 0 && attrzp) { 3458 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 3459 xattr_count, tx); 3460 ASSERT(err2 == 0); 3461 } 3462 3463 if (attrzp) 3464 vput(ZTOV(attrzp)); 3465 3466 if (aclp) 3467 zfs_acl_free(aclp); 3468 3469 if (fuidp) { 3470 zfs_fuid_info_free(fuidp); 3471 fuidp = NULL; 3472 } 3473 3474 if (err) { 3475 dmu_tx_abort(tx); 3476 } else { 3477 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 3478 dmu_tx_commit(tx); 3479 } 3480 3481out2: 3482 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3483 zil_commit(zilog, 0); 3484 3485 ZFS_EXIT(zfsvfs); 3486 return (err); 3487} 3488 3489/* 3490 * We acquire all but fdvp locks using non-blocking acquisitions. If we 3491 * fail to acquire any lock in the path we will drop all held locks, 3492 * acquire the new lock in a blocking fashion, and then release it and 3493 * restart the rename. 
This acquire/release step ensures that we do not 3494 * spin on a lock waiting for release. On error release all vnode locks 3495 * and decrement references the way tmpfs_rename() would do. 3496 */ 3497static int 3498zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, 3499 struct vnode *tdvp, struct vnode **tvpp, 3500 const struct componentname *scnp, const struct componentname *tcnp) 3501{ 3502 zfsvfs_t *zfsvfs; 3503 struct vnode *nvp, *svp, *tvp; 3504 znode_t *sdzp, *tdzp, *szp, *tzp; 3505 const char *snm = scnp->cn_nameptr; 3506 const char *tnm = tcnp->cn_nameptr; 3507 int error; 3508 3509 VOP_UNLOCK(tdvp, 0); 3510 if (*tvpp != NULL && *tvpp != tdvp) 3511 VOP_UNLOCK(*tvpp, 0); 3512 3513relock: 3514 error = vn_lock(sdvp, LK_EXCLUSIVE); 3515 if (error) 3516 goto out; 3517 sdzp = VTOZ(sdvp); 3518 3519 error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); 3520 if (error != 0) { 3521 VOP_UNLOCK(sdvp, 0); 3522 if (error != EBUSY) 3523 goto out; 3524 error = vn_lock(tdvp, LK_EXCLUSIVE); 3525 if (error) 3526 goto out; 3527 VOP_UNLOCK(tdvp, 0); 3528 goto relock; 3529 } 3530 tdzp = VTOZ(tdvp); 3531 3532 /* 3533 * Before using sdzp and tdzp we must ensure that they are live. 3534 * As a porting legacy from illumos we have two things to worry 3535 * about. One is typical for FreeBSD and it is that the vnode is 3536 * not reclaimed (doomed). The other is that the znode is live. 3537 * The current code can invalidate the znode without acquiring the 3538 * corresponding vnode lock if the object represented by the znode 3539 * and vnode is no longer valid after a rollback or receive operation. 3540 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock 3541 * that protects the znodes from the invalidation. 3542 */ 3543 zfsvfs = sdzp->z_zfsvfs; 3544 ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); 3545 ZFS_ENTER(zfsvfs); 3546 3547 /* 3548 * We can not use ZFS_VERIFY_ZP() here because it could directly return 3549 * bypassing the cleanup code in the case of an error. 
3550 */ 3551 if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { 3552 ZFS_EXIT(zfsvfs); 3553 VOP_UNLOCK(sdvp, 0); 3554 VOP_UNLOCK(tdvp, 0); 3555 error = SET_ERROR(EIO); 3556 goto out; 3557 } 3558 3559 /* 3560 * Re-resolve svp to be certain it still exists and fetch the 3561 * correct vnode. 3562 */ 3563 error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); 3564 if (error != 0) { 3565 /* Source entry invalid or not there. */ 3566 ZFS_EXIT(zfsvfs); 3567 VOP_UNLOCK(sdvp, 0); 3568 VOP_UNLOCK(tdvp, 0); 3569 if ((scnp->cn_flags & ISDOTDOT) != 0 || 3570 (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) 3571 error = SET_ERROR(EINVAL); 3572 goto out; 3573 } 3574 svp = ZTOV(szp); 3575 3576 /* 3577 * Re-resolve tvp, if it disappeared we just carry on. 3578 */ 3579 error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); 3580 if (error != 0) { 3581 ZFS_EXIT(zfsvfs); 3582 VOP_UNLOCK(sdvp, 0); 3583 VOP_UNLOCK(tdvp, 0); 3584 vrele(svp); 3585 if ((tcnp->cn_flags & ISDOTDOT) != 0) 3586 error = SET_ERROR(EINVAL); 3587 goto out; 3588 } 3589 if (tzp != NULL) 3590 tvp = ZTOV(tzp); 3591 else 3592 tvp = NULL; 3593 3594 /* 3595 * At present the vnode locks must be acquired before z_teardown_lock, 3596 * although it would be more logical to use the opposite order. 3597 */ 3598 ZFS_EXIT(zfsvfs); 3599 3600 /* 3601 * Now try acquire locks on svp and tvp. 3602 */ 3603 nvp = svp; 3604 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); 3605 if (error != 0) { 3606 VOP_UNLOCK(sdvp, 0); 3607 VOP_UNLOCK(tdvp, 0); 3608 if (tvp != NULL) 3609 vrele(tvp); 3610 if (error != EBUSY) { 3611 vrele(nvp); 3612 goto out; 3613 } 3614 error = vn_lock(nvp, LK_EXCLUSIVE); 3615 if (error != 0) { 3616 vrele(nvp); 3617 goto out; 3618 } 3619 VOP_UNLOCK(nvp, 0); 3620 /* 3621 * Concurrent rename race. 3622 * XXX ? 
3623 */ 3624 if (nvp == tdvp) { 3625 vrele(nvp); 3626 error = SET_ERROR(EINVAL); 3627 goto out; 3628 } 3629 vrele(*svpp); 3630 *svpp = nvp; 3631 goto relock; 3632 } 3633 vrele(*svpp); 3634 *svpp = nvp; 3635 3636 if (*tvpp != NULL) 3637 vrele(*tvpp); 3638 *tvpp = NULL; 3639 if (tvp != NULL) { 3640 nvp = tvp; 3641 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); 3642 if (error != 0) { 3643 VOP_UNLOCK(sdvp, 0); 3644 VOP_UNLOCK(tdvp, 0); 3645 VOP_UNLOCK(*svpp, 0); 3646 if (error != EBUSY) { 3647 vrele(nvp); 3648 goto out; 3649 } 3650 error = vn_lock(nvp, LK_EXCLUSIVE); 3651 if (error != 0) { 3652 vrele(nvp); 3653 goto out; 3654 } 3655 vput(nvp); 3656 goto relock; 3657 } 3658 *tvpp = nvp; 3659 } 3660 3661 return (0); 3662 3663out: 3664 return (error); 3665} 3666 3667/* 3668 * Note that we must use VRELE_ASYNC in this function as it walks 3669 * up the directory tree and vrele may need to acquire an exclusive 3670 * lock if a last reference to a vnode is dropped. 3671 */ 3672static int 3673zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) 3674{ 3675 zfsvfs_t *zfsvfs; 3676 znode_t *zp, *zp1; 3677 uint64_t parent; 3678 int error; 3679 3680 zfsvfs = tdzp->z_zfsvfs; 3681 if (tdzp == szp) 3682 return (SET_ERROR(EINVAL)); 3683 if (tdzp == sdzp) 3684 return (0); 3685 if (tdzp->z_id == zfsvfs->z_root) 3686 return (0); 3687 zp = tdzp; 3688 for (;;) { 3689 ASSERT(!zp->z_unlinked); 3690 if ((error = sa_lookup(zp->z_sa_hdl, 3691 SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) 3692 break; 3693 3694 if (parent == szp->z_id) { 3695 error = SET_ERROR(EINVAL); 3696 break; 3697 } 3698 if (parent == zfsvfs->z_root) 3699 break; 3700 if (parent == sdzp->z_id) 3701 break; 3702 3703 error = zfs_zget(zfsvfs, parent, &zp1); 3704 if (error != 0) 3705 break; 3706 3707 if (zp != tdzp) 3708 VN_RELE_ASYNC(ZTOV(zp), 3709 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); 3710 zp = zp1; 3711 } 3712 3713 if (error == ENOTDIR) 3714 panic("checkpath: .. 
not a directory\n"); 3715 if (zp != tdzp) 3716 VN_RELE_ASYNC(ZTOV(zp), 3717 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); 3718 return (error); 3719} 3720 3721/* 3722 * Move an entry from the provided source directory to the target 3723 * directory. Change the entry name as indicated. 3724 * 3725 * IN: sdvp - Source directory containing the "old entry". 3726 * snm - Old entry name. 3727 * tdvp - Target directory to contain the "new entry". 3728 * tnm - New entry name. 3729 * cr - credentials of caller. 3730 * ct - caller context 3731 * flags - case flags 3732 * 3733 * RETURN: 0 on success, error code on failure. 3734 * 3735 * Timestamps: 3736 * sdvp,tdvp - ctime|mtime updated 3737 */ 3738/*ARGSUSED*/ 3739static int 3740zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, 3741 vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, 3742 cred_t *cr) 3743{ 3744 zfsvfs_t *zfsvfs; 3745 znode_t *sdzp, *tdzp, *szp, *tzp; 3746 zilog_t *zilog = NULL; 3747 dmu_tx_t *tx; 3748 char *snm = scnp->cn_nameptr; 3749 char *tnm = tcnp->cn_nameptr; 3750 int error = 0; 3751 3752 /* Reject renames across filesystems. */ 3753 if ((*svpp)->v_mount != tdvp->v_mount || 3754 ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { 3755 error = SET_ERROR(EXDEV); 3756 goto out; 3757 } 3758 3759 if (zfsctl_is_node(tdvp)) { 3760 error = SET_ERROR(EXDEV); 3761 goto out; 3762 } 3763 3764 /* 3765 * Lock all four vnodes to ensure safety and semantics of renaming. 3766 */ 3767 error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); 3768 if (error != 0) { 3769 /* no vnodes are locked in the case of error here */ 3770 return (error); 3771 } 3772 3773 tdzp = VTOZ(tdvp); 3774 sdzp = VTOZ(sdvp); 3775 zfsvfs = tdzp->z_zfsvfs; 3776 zilog = zfsvfs->z_log; 3777 3778 /* 3779 * After we re-enter ZFS_ENTER() we will have to revalidate all 3780 * znodes involved. 
3781 */ 3782 ZFS_ENTER(zfsvfs); 3783 3784 if (zfsvfs->z_utf8 && u8_validate(tnm, 3785 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3786 error = SET_ERROR(EILSEQ); 3787 goto unlockout; 3788 } 3789 3790 /* If source and target are the same file, there is nothing to do. */ 3791 if ((*svpp) == (*tvpp)) { 3792 error = 0; 3793 goto unlockout; 3794 } 3795 3796 if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || 3797 ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && 3798 (*tvpp)->v_mountedhere != NULL)) { 3799 error = SET_ERROR(EXDEV); 3800 goto unlockout; 3801 } 3802 3803 /* 3804 * We can not use ZFS_VERIFY_ZP() here because it could directly return 3805 * bypassing the cleanup code in the case of an error. 3806 */ 3807 if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { 3808 error = SET_ERROR(EIO); 3809 goto unlockout; 3810 } 3811 3812 szp = VTOZ(*svpp); 3813 tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); 3814 if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { 3815 error = SET_ERROR(EIO); 3816 goto unlockout; 3817 } 3818 3819 /* 3820 * This is to prevent the creation of links into attribute space 3821 * by renaming a linked file into/outof an attribute directory. 3822 * See the comment in zfs_link() for why this is considered bad. 3823 */ 3824 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 3825 error = SET_ERROR(EINVAL); 3826 goto unlockout; 3827 } 3828 3829 /* 3830 * Must have write access at the source to remove the old entry 3831 * and write access at the target to create the new entry. 3832 * Note that if target and source are the same, this can be 3833 * done in a single check. 3834 */ 3835 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 3836 goto unlockout; 3837 3838 if ((*svpp)->v_type == VDIR) { 3839 /* 3840 * Avoid ".", "..", and aliases of "." for obvious reasons. 
3841 */ 3842 if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || 3843 sdzp == szp || 3844 (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { 3845 error = EINVAL; 3846 goto unlockout; 3847 } 3848 3849 /* 3850 * Check to make sure rename is valid. 3851 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 3852 */ 3853 if (error = zfs_rename_check(szp, sdzp, tdzp)) 3854 goto unlockout; 3855 } 3856 3857 /* 3858 * Does target exist? 3859 */ 3860 if (tzp) { 3861 /* 3862 * Source and target must be the same type. 3863 */ 3864 if ((*svpp)->v_type == VDIR) { 3865 if ((*tvpp)->v_type != VDIR) { 3866 error = SET_ERROR(ENOTDIR); 3867 goto unlockout; 3868 } else { 3869 cache_purge(tdvp); 3870 if (sdvp != tdvp) 3871 cache_purge(sdvp); 3872 } 3873 } else { 3874 if ((*tvpp)->v_type == VDIR) { 3875 error = SET_ERROR(EISDIR); 3876 goto unlockout; 3877 } 3878 } 3879 } 3880 3881 vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); 3882 if (tzp) 3883 vnevent_rename_dest(*tvpp, tdvp, tnm, ct); 3884 3885 /* 3886 * notify the target directory if it is not the same 3887 * as source directory. 
3888 */ 3889 if (tdvp != sdvp) { 3890 vnevent_rename_dest_dir(tdvp, ct); 3891 } 3892 3893 tx = dmu_tx_create(zfsvfs->z_os); 3894 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 3895 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3896 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 3897 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 3898 if (sdzp != tdzp) { 3899 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 3900 zfs_sa_upgrade_txholds(tx, tdzp); 3901 } 3902 if (tzp) { 3903 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 3904 zfs_sa_upgrade_txholds(tx, tzp); 3905 } 3906 3907 zfs_sa_upgrade_txholds(tx, szp); 3908 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3909 error = dmu_tx_assign(tx, TXG_WAIT); 3910 if (error) { 3911 dmu_tx_abort(tx); 3912 goto unlockout; 3913 } 3914 3915 3916 if (tzp) /* Attempt to remove the existing target */ 3917 error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); 3918 3919 if (error == 0) { 3920 error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); 3921 if (error == 0) { 3922 szp->z_pflags |= ZFS_AV_MODIFIED; 3923 3924 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3925 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3926 ASSERT0(error); 3927 3928 error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, 3929 NULL); 3930 if (error == 0) { 3931 zfs_log_rename(zilog, tx, TX_RENAME, sdzp, 3932 snm, tdzp, tnm, szp); 3933 3934 /* 3935 * Update path information for the target vnode 3936 */ 3937 vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); 3938 } else { 3939 /* 3940 * At this point, we have successfully created 3941 * the target name, but have failed to remove 3942 * the source name. Since the create was done 3943 * with the ZRENAMING flag, there are 3944 * complications; for one, the link count is 3945 * wrong. The easiest way to deal with this 3946 * is to remove the newly created target, and 3947 * return the original error. This must 3948 * succeed; fortunately, it is very unlikely to 3949 * fail, since we just created it. 
3950 */ 3951 VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, 3952 ZRENAMING, NULL), ==, 0); 3953 } 3954 } 3955 if (error == 0) { 3956 cache_purge(*svpp); 3957 if (*tvpp != NULL) 3958 cache_purge(*tvpp); 3959 cache_purge_negative(tdvp); 3960 } 3961 } 3962 3963 dmu_tx_commit(tx); 3964 3965unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ 3966 ZFS_EXIT(zfsvfs); 3967 VOP_UNLOCK(*svpp, 0); 3968 VOP_UNLOCK(sdvp, 0); 3969 3970out: /* original two vnodes are locked */ 3971 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3972 zil_commit(zilog, 0); 3973 3974 if (*tvpp != NULL) 3975 VOP_UNLOCK(*tvpp, 0); 3976 if (tdvp != *tvpp) 3977 VOP_UNLOCK(tdvp, 0); 3978 return (error); 3979} 3980 3981/* 3982 * Insert the indicated symbolic reference entry into the directory. 3983 * 3984 * IN: dvp - Directory to contain new symbolic link. 3985 * link - Name for new symlink entry. 3986 * vap - Attributes of new entry. 3987 * cr - credentials of caller. 3988 * ct - caller context 3989 * flags - case flags 3990 * 3991 * RETURN: 0 on success, error code on failure. 
3992 * 3993 * Timestamps: 3994 * dvp - ctime|mtime updated 3995 */ 3996/*ARGSUSED*/ 3997static int 3998zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, 3999 cred_t *cr, kthread_t *td) 4000{ 4001 znode_t *zp, *dzp = VTOZ(dvp); 4002 dmu_tx_t *tx; 4003 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4004 zilog_t *zilog; 4005 uint64_t len = strlen(link); 4006 int error; 4007 zfs_acl_ids_t acl_ids; 4008 boolean_t fuid_dirtied; 4009 uint64_t txtype = TX_SYMLINK; 4010 int flags = 0; 4011 4012 ASSERT(vap->va_type == VLNK); 4013 4014 ZFS_ENTER(zfsvfs); 4015 ZFS_VERIFY_ZP(dzp); 4016 zilog = zfsvfs->z_log; 4017 4018 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 4019 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4020 ZFS_EXIT(zfsvfs); 4021 return (SET_ERROR(EILSEQ)); 4022 } 4023 4024 if (len > MAXPATHLEN) { 4025 ZFS_EXIT(zfsvfs); 4026 return (SET_ERROR(ENAMETOOLONG)); 4027 } 4028 4029 if ((error = zfs_acl_ids_create(dzp, 0, 4030 vap, cr, NULL, &acl_ids)) != 0) { 4031 ZFS_EXIT(zfsvfs); 4032 return (error); 4033 } 4034 4035 /* 4036 * Attempt to lock directory; fail if entry already exists. 
4037 */ 4038 error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); 4039 if (error) { 4040 zfs_acl_ids_free(&acl_ids); 4041 ZFS_EXIT(zfsvfs); 4042 return (error); 4043 } 4044 4045 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4046 zfs_acl_ids_free(&acl_ids); 4047 ZFS_EXIT(zfsvfs); 4048 return (error); 4049 } 4050 4051 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 4052 zfs_acl_ids_free(&acl_ids); 4053 ZFS_EXIT(zfsvfs); 4054 return (SET_ERROR(EDQUOT)); 4055 } 4056 4057 getnewvnode_reserve(1); 4058 tx = dmu_tx_create(zfsvfs->z_os); 4059 fuid_dirtied = zfsvfs->z_fuid_dirty; 4060 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 4061 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4062 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 4063 ZFS_SA_BASE_ATTR_SIZE + len); 4064 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 4065 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 4066 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 4067 acl_ids.z_aclp->z_acl_bytes); 4068 } 4069 if (fuid_dirtied) 4070 zfs_fuid_txhold(zfsvfs, tx); 4071 error = dmu_tx_assign(tx, TXG_WAIT); 4072 if (error) { 4073 zfs_acl_ids_free(&acl_ids); 4074 dmu_tx_abort(tx); 4075 getnewvnode_drop_reserve(); 4076 ZFS_EXIT(zfsvfs); 4077 return (error); 4078 } 4079 4080 /* 4081 * Create a new object for the symlink. 4082 * for version 4 ZPL datsets the symlink will be an SA attribute 4083 */ 4084 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 4085 4086 if (fuid_dirtied) 4087 zfs_fuid_sync(zfsvfs, tx); 4088 4089 if (zp->z_is_sa) 4090 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 4091 link, len, tx); 4092 else 4093 zfs_sa_symlink(zp, link, len, tx); 4094 4095 zp->z_size = len; 4096 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 4097 &zp->z_size, sizeof (zp->z_size), tx); 4098 /* 4099 * Insert the new object into the directory. 
4100 */ 4101 (void) zfs_link_create(dzp, name, zp, tx, ZNEW); 4102 4103 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 4104 *vpp = ZTOV(zp); 4105 4106 zfs_acl_ids_free(&acl_ids); 4107 4108 dmu_tx_commit(tx); 4109 4110 getnewvnode_drop_reserve(); 4111 4112 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4113 zil_commit(zilog, 0); 4114 4115 ZFS_EXIT(zfsvfs); 4116 return (error); 4117} 4118 4119/* 4120 * Return, in the buffer contained in the provided uio structure, 4121 * the symbolic path referred to by vp. 4122 * 4123 * IN: vp - vnode of symbolic link. 4124 * uio - structure to contain the link path. 4125 * cr - credentials of caller. 4126 * ct - caller context 4127 * 4128 * OUT: uio - structure containing the link path. 4129 * 4130 * RETURN: 0 on success, error code on failure. 4131 * 4132 * Timestamps: 4133 * vp - atime updated 4134 */ 4135/* ARGSUSED */ 4136static int 4137zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 4138{ 4139 znode_t *zp = VTOZ(vp); 4140 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4141 int error; 4142 4143 ZFS_ENTER(zfsvfs); 4144 ZFS_VERIFY_ZP(zp); 4145 4146 if (zp->z_is_sa) 4147 error = sa_lookup_uio(zp->z_sa_hdl, 4148 SA_ZPL_SYMLINK(zfsvfs), uio); 4149 else 4150 error = zfs_sa_readlink(zp, uio); 4151 4152 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4153 4154 ZFS_EXIT(zfsvfs); 4155 return (error); 4156} 4157 4158/* 4159 * Insert a new entry into directory tdvp referencing svp. 4160 * 4161 * IN: tdvp - Directory to contain new entry. 4162 * svp - vnode of new entry. 4163 * name - name of new entry. 4164 * cr - credentials of caller. 4165 * ct - caller context 4166 * 4167 * RETURN: 0 on success, error code on failure. 
4168 * 4169 * Timestamps: 4170 * tdvp - ctime|mtime updated 4171 * svp - ctime updated 4172 */ 4173/* ARGSUSED */ 4174static int 4175zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 4176 caller_context_t *ct, int flags) 4177{ 4178 znode_t *dzp = VTOZ(tdvp); 4179 znode_t *tzp, *szp; 4180 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4181 zilog_t *zilog; 4182 dmu_tx_t *tx; 4183 int error; 4184 uint64_t parent; 4185 uid_t owner; 4186 4187 ASSERT(tdvp->v_type == VDIR); 4188 4189 ZFS_ENTER(zfsvfs); 4190 ZFS_VERIFY_ZP(dzp); 4191 zilog = zfsvfs->z_log; 4192 4193 /* 4194 * POSIX dictates that we return EPERM here. 4195 * Better choices include ENOTSUP or EISDIR. 4196 */ 4197 if (svp->v_type == VDIR) { 4198 ZFS_EXIT(zfsvfs); 4199 return (SET_ERROR(EPERM)); 4200 } 4201 4202 szp = VTOZ(svp); 4203 ZFS_VERIFY_ZP(szp); 4204 4205 if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { 4206 ZFS_EXIT(zfsvfs); 4207 return (SET_ERROR(EPERM)); 4208 } 4209 4210 /* Prevent links to .zfs/shares files */ 4211 4212 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 4213 &parent, sizeof (uint64_t))) != 0) { 4214 ZFS_EXIT(zfsvfs); 4215 return (error); 4216 } 4217 if (parent == zfsvfs->z_shares_dir) { 4218 ZFS_EXIT(zfsvfs); 4219 return (SET_ERROR(EPERM)); 4220 } 4221 4222 if (zfsvfs->z_utf8 && u8_validate(name, 4223 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4224 ZFS_EXIT(zfsvfs); 4225 return (SET_ERROR(EILSEQ)); 4226 } 4227 4228 /* 4229 * We do not support links between attributes and non-attributes 4230 * because of the potential security risk of creating links 4231 * into "normal" file space in order to circumvent restrictions 4232 * imposed in attribute space. 
4233 */ 4234 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { 4235 ZFS_EXIT(zfsvfs); 4236 return (SET_ERROR(EINVAL)); 4237 } 4238 4239 4240 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); 4241 if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { 4242 ZFS_EXIT(zfsvfs); 4243 return (SET_ERROR(EPERM)); 4244 } 4245 4246 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4247 ZFS_EXIT(zfsvfs); 4248 return (error); 4249 } 4250 4251 /* 4252 * Attempt to lock directory; fail if entry already exists. 4253 */ 4254 error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); 4255 if (error) { 4256 ZFS_EXIT(zfsvfs); 4257 return (error); 4258 } 4259 4260 tx = dmu_tx_create(zfsvfs->z_os); 4261 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4262 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4263 zfs_sa_upgrade_txholds(tx, szp); 4264 zfs_sa_upgrade_txholds(tx, dzp); 4265 error = dmu_tx_assign(tx, TXG_WAIT); 4266 if (error) { 4267 dmu_tx_abort(tx); 4268 ZFS_EXIT(zfsvfs); 4269 return (error); 4270 } 4271 4272 error = zfs_link_create(dzp, name, szp, tx, 0); 4273 4274 if (error == 0) { 4275 uint64_t txtype = TX_LINK; 4276 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 4277 } 4278 4279 dmu_tx_commit(tx); 4280 4281 if (error == 0) { 4282 vnevent_link(svp, ct); 4283 } 4284 4285 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4286 zil_commit(zilog, 0); 4287 4288 ZFS_EXIT(zfsvfs); 4289 return (error); 4290} 4291 4292 4293/*ARGSUSED*/ 4294void 4295zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4296{ 4297 znode_t *zp = VTOZ(vp); 4298 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4299 int error; 4300 4301 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4302 if (zp->z_sa_hdl == NULL) { 4303 /* 4304 * The fs has been unmounted, or we did a 4305 * suspend/resume and this file no longer exists. 
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	if (zp->z_unlinked) {
		/*
		 * Fast path to recycle a vnode of a removed file.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	/*
	 * Push a dirty atime out to the SA layer before the vnode is
	 * recycled so the access-time update is not lost.  If the
	 * transaction cannot be assigned the update is simply dropped.
	 */
	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			dmu_tx_commit(tx);
		}
	}
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}


/* Both file-id flavors must fit inside the generic struct fid. */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));

/*
 * Build a file identifier (as used in NFS file handles) for vp from the
 * object number and generation number; the long form additionally carries
 * the objset id when this filesystem is not its own parent.
 */
/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	uint32_t	gen;
	uint64_t	gen64;
	uint64_t	object = zp->z_id;
	zfid_short_t	*zfid;
	int		size, i, error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
	    &gen64, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	gen = (uint32_t)gen64;

	/*
	 * A filesystem that is not its own parent (e.g. mounted under
	 * .zfs) needs the long fid form that includes the objset id.
	 */
	size = (zfsvfs->z_parent != zfsvfs) ?
	    LONG_FID_LEN : SHORT_FID_LEN;

#ifdef illumos
	/* illumos reports the required fid length to the caller via ENOSPC. */
	if (fidp->fid_len < size) {
		fidp->fid_len = size;
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSPC));
	}
#else
	/* FreeBSD callers always supply a full-sized fid. */
	fidp->fid_len = size;
#endif

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	/* Encode the object number one byte at a time, least significant first. */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t	*zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * pathconf(2) back-end: report per-file/per-filesystem limits and
 * feature flags.  Unhandled names return EOPNOTSUPP.
 */
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp, *xzp;
	zfsvfs_t *zfsvfs;
	int error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = INT_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		*valp = 64;
		return (0);
#ifdef illumos
	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		error = zfs_dirent_lookup(zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED);
		if (error == 0) {
			if (!zfs_dirempty(xzp))
				*valp = 1;
			vrele(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);
		return (0);

	case _PC_ACCESS_FILTERING:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
		    vp->v_type == VDIR;
		return (0);

	case _PC_ACL_ENABLED:
		*valp = _ACL_ACE_ENABLED;
		return (0);
#endif	/* illumos */
	case _PC_MIN_HOLE_SIZE:
		*valp = (int)SPA_MINBLOCKSIZE;
		return (0);
#ifdef illumos
	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */
		*valp = 1L;
		return (0);
#endif
	case _PC_ACL_EXTENDED:
		*valp = 0;
		return (0);

	case _PC_ACL_NFS4:
		*valp = 1;
		return (0);

	case _PC_ACL_PATH_MAX:
		*valp = ACL_MAX_ENTRIES;
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}

/*
 * Retrieve the ACL of vp into vsecp.  ATTR_NOACLCHECK in flag skips the
 * access check (used by NFSv4 ACL paths).
 */
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Replace the ACL of vp with vsecp; commits the ZIL when sync=always.
 */
/*ARGSUSED*/
int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ?
	    B_TRUE : B_FALSE;
	zilog_t	*zilog = zfsvfs->z_log;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	error = zfs_setacl(zp, vsecp, skipaclchk, cr);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Translate FreeBSD vnode IO_* ioflag bits into the FAPPEND/FNONBLOCK/
 * F*SYNC flag bits expected by zfs_read()/zfs_write().
 */
static int
ioflags(int ioflags)
{
	int flags = 0;

	if (ioflags & IO_APPEND)
		flags |= FAPPEND;
	if (ioflags & IO_NDELAY)
		flags |= FNONBLOCK;
	if (ioflags & IO_SYNC)
		flags |= (FSYNC | FDSYNC | FRSYNC);

	return (flags);
}

/*
 * Fill the supplied array of VM pages with file data read through the
 * DMU.  Returns a zfs_vm_pagerret_* status; on success also zeroes the
 * caller's read-behind/read-ahead counts.
 */
static int
zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int *rbehind,
    int *rahead)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_page_t mlast;
	vm_object_t object;
	caddr_t va;
	struct sf_buf *sf;
	off_t startoff, endoff;		/* NOTE(review): apparently unused here */
	int i, error;
	vm_pindex_t reqstart, reqend;	/* NOTE(review): apparently unused here */
	int lsize, size;

	object = m[0]->object;
	error = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zfs_vmobject_wlock(object);
	/* Drop an already-valid trailing page; done if it was the only one. */
	if (m[count - 1]->valid != 0 && --count == 0) {
		zfs_vmobject_wunlock(object);
		goto out;
	}

	mlast = m[count - 1];

	/* Requests wholly beyond EOF are invalid. */
	if (IDX_TO_OFF(mlast->pindex) >=
	    object->un_pager.vnp.vnp_size) {
		zfs_vmobject_wunlock(object);
		ZFS_EXIT(zfsvfs);
		return (zfs_vm_pagerret_bad);
	}

	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, count);

	/* The last page may be partial if it straddles EOF. */
	lsize = PAGE_SIZE;
	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
		lsize = object->un_pager.vnp.vnp_size -
		    IDX_TO_OFF(mlast->pindex);
	zfs_vmobject_wunlock(object);

	for (i = 0; i < count; i++) {
		size = PAGE_SIZE;
		if (i == count - 1)
			size = lsize;
		va = zfs_map_page(m[i], &sf);
		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
		    size, va, DMU_READ_PREFETCH);
		/* Zero the tail of a partial (EOF-straddling) last page. */
		if (size != PAGE_SIZE)
			bzero(va + size, PAGE_SIZE - size);
		zfs_unmap_page(sf);
		if (error != 0)
			goto out;
	}

	zfs_vmobject_wlock(object);
	for (i = 0; i < count; i++)
		m[i]->valid = VM_PAGE_BITS_ALL;
	zfs_vmobject_wunlock(object);

out:
	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	if (error == 0) {
		if (rbehind)
			*rbehind = 0;
		if (rahead)
			*rahead = 0;
		return (zfs_vm_pagerret_ok);
	} else
		return (zfs_vm_pagerret_error);
}

/*
 * VOP_GETPAGES entry point: thin glue around zfs_getpages().
 */
static int
zfs_freebsd_getpages(ap)
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int *a_rbehind;
		int *a_rahead;
	} */ *ap;
{

	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
	    ap->a_rahead));
}

/*
 * Write out the given dirty pages through the DMU (VOP_PUTPAGES backend).
 * The affected byte range is covered by a ZFS range lock for the duration;
 * per-page pager status is reported in rtvals.
 */
static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
    int *rtvals)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	rl_t		*rl;
	dmu_tx_t	*tx;
	struct sf_buf	*sf;
	vm_object_t	object;
	vm_page_t	m;
	caddr_t		va;
	size_t		tocopy;
	size_t		lo_len;
	vm_ooffset_t	lo_off;
	vm_ooffset_t	off;
	uint_t		blksz;
	int		ncount;
	int		pcount;
	int		err;
	int		i;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	object = vp->v_object;
	pcount = btoc(len);
	ncount = pcount;

	KASSERT(ma[0]->object == object, ("mismatching object"));
	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));

	/* Assume failure for every page until the write succeeds. */
	for (i = 0; i < pcount; i++)
		rtvals[i] = zfs_vm_pagerret_error;

	/* Range-lock whole blocks so partial-block writes are consistent. */
	off = IDX_TO_OFF(ma[0]->pindex);
	blksz = zp->z_blksz;
	lo_off = rounddown(off, blksz);
	lo_len = roundup(len + (off - lo_off), blksz);
	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);

	zfs_vmobject_wlock(object);
	if (len + off > object->un_pager.vnp.vnp_size) {
		if
 (object->un_pager.vnp.vnp_size > off) {
			/* Clip the request to end-of-file. */
			int pgoff;

			len = object->un_pager.vnp.vnp_size - off;
			ncount = btoc(len);
			if ((pgoff = (int)len & PAGE_MASK) != 0) {
				/*
				 * If the object is locked and the following
				 * conditions hold, then the page's dirty
				 * field cannot be concurrently changed by a
				 * pmap operation.
				 */
				m = ma[ncount - 1];
				vm_page_assert_sbusied(m);
				KASSERT(!pmap_page_is_write_mapped(m),
				    ("zfs_putpages: page %p is not read-only", m));
				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
				    pgoff);
			}
		} else {
			/* The entire request lies beyond EOF. */
			len = 0;
			ncount = 0;
		}
		/* Pages past EOF are reported bad rather than written. */
		if (ncount < pcount) {
			for (i = ncount; i < pcount; i++) {
				rtvals[i] = zfs_vm_pagerret_bad;
			}
		}
	}
	zfs_vmobject_wunlock(object);

	if (ncount == 0)
		goto out;

	/*
	 * Over-quota owners get no write; rtvals stay at the error value
	 * set above.
	 */
	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
		goto out;
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, off, len);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		goto out;
	}

	/* Small-block files are copied page by page; otherwise in one call. */
	if (zp->z_blksz < PAGE_SIZE) {
		i = 0;
		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
			tocopy = len > PAGE_SIZE ?
			    PAGE_SIZE : len;
			va = zfs_map_page(ma[i], &sf);
			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
			zfs_unmap_page(sf);
		}
	} else {
		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
	}

	if (err == 0) {
		/* Update mtime/ctime/flags and log the write on success. */
		uint64_t mtime[2], ctime[2];
		sa_bulk_attr_t bulk[3];
		int count = 0;

		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    &mtime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
		    &zp->z_pflags, 8);
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		ASSERT0(err);
		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);

		zfs_vmobject_wlock(object);
		for (i = 0; i < ncount; i++) {
			rtvals[i] = zfs_vm_pagerret_ok;
			vm_page_undirty(ma[i]);
		}
		zfs_vmobject_wunlock(object);
		PCPU_INC(cnt.v_vnodeout);
		PCPU_ADD(cnt.v_vnodepgsout, ncount);
	}
	dmu_tx_commit(tx);

out:
	zfs_range_unlock(rl);
	/*
	 * Synchronous or invalidating pageouts, and sync=always datasets,
	 * force an immediate ZIL commit.
	 */
	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (rtvals[0]);
}

/*
 * VOP_PUTPAGES entry point: the a_sync pager flags are passed straight
 * through as zfs_putpages() flags.
 */
int
zfs_freebsd_putpages(ap)
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_sync;
		int *a_rtvals;
	} */ *ap;
{

	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
	    ap->a_rtvals));
}

/*
 * VOP_BMAP: ZFS has no buffer-cache block mapping; report an identity
 * mapping with no read-ahead/read-behind clustering.
 */
static int
zfs_freebsd_bmap(ap)
	struct vop_bmap_args /* {
		struct vnode *a_vp;
		daddr_t a_bn;
		struct bufobj **a_bop;
		daddr_t *a_bnp;
		int *a_runp;
		int *a_runb;
	} */ *ap;
{

	if (ap->a_bop != NULL)
		*ap->a_bop = &ap->a_vp->v_bufobj;
	if (ap->a_bnp != NULL)
*ap->a_bnp = ap->a_bn; 4811 if (ap->a_runp != NULL) 4812 *ap->a_runp = 0; 4813 if (ap->a_runb != NULL) 4814 *ap->a_runb = 0; 4815 4816 return (0); 4817} 4818 4819static int 4820zfs_freebsd_open(ap) 4821 struct vop_open_args /* { 4822 struct vnode *a_vp; 4823 int a_mode; 4824 struct ucred *a_cred; 4825 struct thread *a_td; 4826 } */ *ap; 4827{ 4828 vnode_t *vp = ap->a_vp; 4829 znode_t *zp = VTOZ(vp); 4830 int error; 4831 4832 error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); 4833 if (error == 0) 4834 vnode_create_vobject(vp, zp->z_size, ap->a_td); 4835 return (error); 4836} 4837 4838static int 4839zfs_freebsd_close(ap) 4840 struct vop_close_args /* { 4841 struct vnode *a_vp; 4842 int a_fflag; 4843 struct ucred *a_cred; 4844 struct thread *a_td; 4845 } */ *ap; 4846{ 4847 4848 return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); 4849} 4850 4851static int 4852zfs_freebsd_ioctl(ap) 4853 struct vop_ioctl_args /* { 4854 struct vnode *a_vp; 4855 u_long a_command; 4856 caddr_t a_data; 4857 int a_fflag; 4858 struct ucred *cred; 4859 struct thread *td; 4860 } */ *ap; 4861{ 4862 4863 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, 4864 ap->a_fflag, ap->a_cred, NULL, NULL)); 4865} 4866 4867static int 4868zfs_freebsd_read(ap) 4869 struct vop_read_args /* { 4870 struct vnode *a_vp; 4871 struct uio *a_uio; 4872 int a_ioflag; 4873 struct ucred *a_cred; 4874 } */ *ap; 4875{ 4876 4877 return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), 4878 ap->a_cred, NULL)); 4879} 4880 4881static int 4882zfs_freebsd_write(ap) 4883 struct vop_write_args /* { 4884 struct vnode *a_vp; 4885 struct uio *a_uio; 4886 int a_ioflag; 4887 struct ucred *a_cred; 4888 } */ *ap; 4889{ 4890 4891 return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), 4892 ap->a_cred, NULL)); 4893} 4894 4895static int 4896zfs_freebsd_access(ap) 4897 struct vop_access_args /* { 4898 struct vnode *a_vp; 4899 accmode_t a_accmode; 4900 struct ucred *a_cred; 4901 struct thread *a_td; 
4902 } */ *ap; 4903{ 4904 vnode_t *vp = ap->a_vp; 4905 znode_t *zp = VTOZ(vp); 4906 accmode_t accmode; 4907 int error = 0; 4908 4909 /* 4910 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, 4911 */ 4912 accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); 4913 if (accmode != 0) 4914 error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); 4915 4916 /* 4917 * VADMIN has to be handled by vaccess(). 4918 */ 4919 if (error == 0) { 4920 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); 4921 if (accmode != 0) { 4922 error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, 4923 zp->z_gid, accmode, ap->a_cred, NULL); 4924 } 4925 } 4926 4927 /* 4928 * For VEXEC, ensure that at least one execute bit is set for 4929 * non-directories. 4930 */ 4931 if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && 4932 (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { 4933 error = EACCES; 4934 } 4935 4936 return (error); 4937} 4938 4939static int 4940zfs_freebsd_lookup(ap) 4941 struct vop_lookup_args /* { 4942 struct vnode *a_dvp; 4943 struct vnode **a_vpp; 4944 struct componentname *a_cnp; 4945 } */ *ap; 4946{ 4947 struct componentname *cnp = ap->a_cnp; 4948 char nm[NAME_MAX + 1]; 4949 4950 ASSERT(cnp->cn_namelen < sizeof(nm)); 4951 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); 4952 4953 return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, 4954 cnp->cn_cred, cnp->cn_thread, 0)); 4955} 4956 4957static int 4958zfs_cache_lookup(ap) 4959 struct vop_lookup_args /* { 4960 struct vnode *a_dvp; 4961 struct vnode **a_vpp; 4962 struct componentname *a_cnp; 4963 } */ *ap; 4964{ 4965 zfsvfs_t *zfsvfs; 4966 4967 zfsvfs = ap->a_dvp->v_mount->mnt_data; 4968 if (zfsvfs->z_use_namecache) 4969 return (vfs_cache_lookup(ap)); 4970 else 4971 return (zfs_freebsd_lookup(ap)); 4972} 4973 4974static int 4975zfs_freebsd_create(ap) 4976 struct vop_create_args /* { 4977 struct vnode *a_dvp; 4978 struct vnode **a_vpp; 4979 struct 
componentname *a_cnp; 4980 struct vattr *a_vap; 4981 } */ *ap; 4982{ 4983 zfsvfs_t *zfsvfs; 4984 struct componentname *cnp = ap->a_cnp; 4985 vattr_t *vap = ap->a_vap; 4986 int error, mode; 4987 4988 ASSERT(cnp->cn_flags & SAVENAME); 4989 4990 vattr_init_mask(vap); 4991 mode = vap->va_mode & ALLPERMS; 4992 zfsvfs = ap->a_dvp->v_mount->mnt_data; 4993 4994 error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, 4995 ap->a_vpp, cnp->cn_cred, cnp->cn_thread); 4996 if (zfsvfs->z_use_namecache && 4997 error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) 4998 cache_enter(ap->a_dvp, *ap->a_vpp, cnp); 4999 return (error); 5000} 5001 5002static int 5003zfs_freebsd_remove(ap) 5004 struct vop_remove_args /* { 5005 struct vnode *a_dvp; 5006 struct vnode *a_vp; 5007 struct componentname *a_cnp; 5008 } */ *ap; 5009{ 5010 5011 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 5012 5013 return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, 5014 ap->a_cnp->cn_cred)); 5015} 5016 5017static int 5018zfs_freebsd_mkdir(ap) 5019 struct vop_mkdir_args /* { 5020 struct vnode *a_dvp; 5021 struct vnode **a_vpp; 5022 struct componentname *a_cnp; 5023 struct vattr *a_vap; 5024 } */ *ap; 5025{ 5026 vattr_t *vap = ap->a_vap; 5027 5028 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 5029 5030 vattr_init_mask(vap); 5031 5032 return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, 5033 ap->a_cnp->cn_cred)); 5034} 5035 5036static int 5037zfs_freebsd_rmdir(ap) 5038 struct vop_rmdir_args /* { 5039 struct vnode *a_dvp; 5040 struct vnode *a_vp; 5041 struct componentname *a_cnp; 5042 } */ *ap; 5043{ 5044 struct componentname *cnp = ap->a_cnp; 5045 5046 ASSERT(cnp->cn_flags & SAVENAME); 5047 5048 return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); 5049} 5050 5051static int 5052zfs_freebsd_readdir(ap) 5053 struct vop_readdir_args /* { 5054 struct vnode *a_vp; 5055 struct uio *a_uio; 5056 struct ucred *a_cred; 5057 int *a_eofflag; 5058 int *a_ncookies; 5059 u_long **a_cookies; 5060 
} */ *ap; 5061{ 5062 5063 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, 5064 ap->a_ncookies, ap->a_cookies)); 5065} 5066 5067static int 5068zfs_freebsd_fsync(ap) 5069 struct vop_fsync_args /* { 5070 struct vnode *a_vp; 5071 int a_waitfor; 5072 struct thread *a_td; 5073 } */ *ap; 5074{ 5075 5076 vop_stdfsync(ap); 5077 return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); 5078} 5079 5080static int 5081zfs_freebsd_getattr(ap) 5082 struct vop_getattr_args /* { 5083 struct vnode *a_vp; 5084 struct vattr *a_vap; 5085 struct ucred *a_cred; 5086 } */ *ap; 5087{ 5088 vattr_t *vap = ap->a_vap; 5089 xvattr_t xvap; 5090 u_long fflags = 0; 5091 int error; 5092 5093 xva_init(&xvap); 5094 xvap.xva_vattr = *vap; 5095 xvap.xva_vattr.va_mask |= AT_XVATTR; 5096 5097 /* Convert chflags into ZFS-type flags. */ 5098 /* XXX: what about SF_SETTABLE?. */ 5099 XVA_SET_REQ(&xvap, XAT_IMMUTABLE); 5100 XVA_SET_REQ(&xvap, XAT_APPENDONLY); 5101 XVA_SET_REQ(&xvap, XAT_NOUNLINK); 5102 XVA_SET_REQ(&xvap, XAT_NODUMP); 5103 XVA_SET_REQ(&xvap, XAT_READONLY); 5104 XVA_SET_REQ(&xvap, XAT_ARCHIVE); 5105 XVA_SET_REQ(&xvap, XAT_SYSTEM); 5106 XVA_SET_REQ(&xvap, XAT_HIDDEN); 5107 XVA_SET_REQ(&xvap, XAT_REPARSE); 5108 XVA_SET_REQ(&xvap, XAT_OFFLINE); 5109 XVA_SET_REQ(&xvap, XAT_SPARSE); 5110 5111 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); 5112 if (error != 0) 5113 return (error); 5114 5115 /* Convert ZFS xattr into chflags. 
*/ 5116#define FLAG_CHECK(fflag, xflag, xfield) do { \ 5117 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ 5118 fflags |= (fflag); \ 5119} while (0) 5120 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, 5121 xvap.xva_xoptattrs.xoa_immutable); 5122 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, 5123 xvap.xva_xoptattrs.xoa_appendonly); 5124 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, 5125 xvap.xva_xoptattrs.xoa_nounlink); 5126 FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, 5127 xvap.xva_xoptattrs.xoa_archive); 5128 FLAG_CHECK(UF_NODUMP, XAT_NODUMP, 5129 xvap.xva_xoptattrs.xoa_nodump); 5130 FLAG_CHECK(UF_READONLY, XAT_READONLY, 5131 xvap.xva_xoptattrs.xoa_readonly); 5132 FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, 5133 xvap.xva_xoptattrs.xoa_system); 5134 FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, 5135 xvap.xva_xoptattrs.xoa_hidden); 5136 FLAG_CHECK(UF_REPARSE, XAT_REPARSE, 5137 xvap.xva_xoptattrs.xoa_reparse); 5138 FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, 5139 xvap.xva_xoptattrs.xoa_offline); 5140 FLAG_CHECK(UF_SPARSE, XAT_SPARSE, 5141 xvap.xva_xoptattrs.xoa_sparse); 5142 5143#undef FLAG_CHECK 5144 *vap = xvap.xva_vattr; 5145 vap->va_flags = fflags; 5146 return (0); 5147} 5148 5149static int 5150zfs_freebsd_setattr(ap) 5151 struct vop_setattr_args /* { 5152 struct vnode *a_vp; 5153 struct vattr *a_vap; 5154 struct ucred *a_cred; 5155 } */ *ap; 5156{ 5157 vnode_t *vp = ap->a_vp; 5158 vattr_t *vap = ap->a_vap; 5159 cred_t *cred = ap->a_cred; 5160 xvattr_t xvap; 5161 u_long fflags; 5162 uint64_t zflags; 5163 5164 vattr_init_mask(vap); 5165 vap->va_mask &= ~AT_NOSET; 5166 5167 xva_init(&xvap); 5168 xvap.xva_vattr = *vap; 5169 5170 zflags = VTOZ(vp)->z_pflags; 5171 5172 if (vap->va_flags != VNOVAL) { 5173 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; 5174 int error; 5175 5176 if (zfsvfs->z_use_fuids == B_FALSE) 5177 return (EOPNOTSUPP); 5178 5179 fflags = vap->va_flags; 5180 /* 5181 * XXX KDM 5182 * We need to figure out whether it makes sense to allow 5183 * UF_REPARSE through, since we don't really have other 5184 * facilities 
to handle reparse points and zfs_setattr() 5185 * doesn't currently allow setting that attribute anyway. 5186 */ 5187 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| 5188 UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| 5189 UF_OFFLINE|UF_SPARSE)) != 0) 5190 return (EOPNOTSUPP); 5191 /* 5192 * Unprivileged processes are not permitted to unset system 5193 * flags, or modify flags if any system flags are set. 5194 * Privileged non-jail processes may not modify system flags 5195 * if securelevel > 0 and any existing system flags are set. 5196 * Privileged jail processes behave like privileged non-jail 5197 * processes if the security.jail.chflags_allowed sysctl is 5198 * is non-zero; otherwise, they behave like unprivileged 5199 * processes. 5200 */ 5201 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || 5202 priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { 5203 if (zflags & 5204 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 5205 error = securelevel_gt(cred, 0); 5206 if (error != 0) 5207 return (error); 5208 } 5209 } else { 5210 /* 5211 * Callers may only modify the file flags on objects they 5212 * have VADMIN rights for. 5213 */ 5214 if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) 5215 return (error); 5216 if (zflags & 5217 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 5218 return (EPERM); 5219 } 5220 if (fflags & 5221 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { 5222 return (EPERM); 5223 } 5224 } 5225 5226#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ 5227 if (((fflags & (fflag)) && !(zflags & (zflag))) || \ 5228 ((zflags & (zflag)) && !(fflags & (fflag)))) { \ 5229 XVA_SET_REQ(&xvap, (xflag)); \ 5230 (xfield) = ((fflags & (fflag)) != 0); \ 5231 } \ 5232} while (0) 5233 /* Convert chflags into ZFS-type flags. */ 5234 /* XXX: what about SF_SETTABLE?. 
*/ 5235 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, 5236 xvap.xva_xoptattrs.xoa_immutable); 5237 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, 5238 xvap.xva_xoptattrs.xoa_appendonly); 5239 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, 5240 xvap.xva_xoptattrs.xoa_nounlink); 5241 FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, 5242 xvap.xva_xoptattrs.xoa_archive); 5243 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, 5244 xvap.xva_xoptattrs.xoa_nodump); 5245 FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, 5246 xvap.xva_xoptattrs.xoa_readonly); 5247 FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, 5248 xvap.xva_xoptattrs.xoa_system); 5249 FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, 5250 xvap.xva_xoptattrs.xoa_hidden); 5251 FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, 5252 xvap.xva_xoptattrs.xoa_hidden); 5253 FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, 5254 xvap.xva_xoptattrs.xoa_offline); 5255 FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, 5256 xvap.xva_xoptattrs.xoa_sparse); 5257#undef FLAG_CHANGE 5258 } 5259 if (vap->va_birthtime.tv_sec != VNOVAL) { 5260 xvap.xva_vattr.va_mask |= AT_XVATTR; 5261 XVA_SET_REQ(&xvap, XAT_CREATETIME); 5262 } 5263 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); 5264} 5265 5266static int 5267zfs_freebsd_rename(ap) 5268 struct vop_rename_args /* { 5269 struct vnode *a_fdvp; 5270 struct vnode *a_fvp; 5271 struct componentname *a_fcnp; 5272 struct vnode *a_tdvp; 5273 struct vnode *a_tvp; 5274 struct componentname *a_tcnp; 5275 } */ *ap; 5276{ 5277 vnode_t *fdvp = ap->a_fdvp; 5278 vnode_t *fvp = ap->a_fvp; 5279 vnode_t *tdvp = ap->a_tdvp; 5280 vnode_t *tvp = ap->a_tvp; 5281 int error; 5282 5283 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); 5284 ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); 5285 5286 error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, 5287 ap->a_tcnp, ap->a_fcnp->cn_cred); 5288 5289 vrele(fdvp); 5290 vrele(fvp); 5291 vrele(tdvp); 5292 if (tvp != NULL) 
5293 vrele(tvp); 5294 5295 return (error); 5296} 5297 5298static int 5299zfs_freebsd_symlink(ap) 5300 struct vop_symlink_args /* { 5301 struct vnode *a_dvp; 5302 struct vnode **a_vpp; 5303 struct componentname *a_cnp; 5304 struct vattr *a_vap; 5305 char *a_target; 5306 } */ *ap; 5307{ 5308 struct componentname *cnp = ap->a_cnp; 5309 vattr_t *vap = ap->a_vap; 5310 5311 ASSERT(cnp->cn_flags & SAVENAME); 5312 5313 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ 5314 vattr_init_mask(vap); 5315 5316 return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, 5317 ap->a_target, cnp->cn_cred, cnp->cn_thread)); 5318} 5319 5320static int 5321zfs_freebsd_readlink(ap) 5322 struct vop_readlink_args /* { 5323 struct vnode *a_vp; 5324 struct uio *a_uio; 5325 struct ucred *a_cred; 5326 } */ *ap; 5327{ 5328 5329 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); 5330} 5331 5332static int 5333zfs_freebsd_link(ap) 5334 struct vop_link_args /* { 5335 struct vnode *a_tdvp; 5336 struct vnode *a_vp; 5337 struct componentname *a_cnp; 5338 } */ *ap; 5339{ 5340 struct componentname *cnp = ap->a_cnp; 5341 vnode_t *vp = ap->a_vp; 5342 vnode_t *tdvp = ap->a_tdvp; 5343 5344 if (tdvp->v_mount != vp->v_mount) 5345 return (EXDEV); 5346 5347 ASSERT(cnp->cn_flags & SAVENAME); 5348 5349 return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); 5350} 5351 5352static int 5353zfs_freebsd_inactive(ap) 5354 struct vop_inactive_args /* { 5355 struct vnode *a_vp; 5356 struct thread *a_td; 5357 } */ *ap; 5358{ 5359 vnode_t *vp = ap->a_vp; 5360 5361 zfs_inactive(vp, ap->a_td->td_ucred, NULL); 5362 return (0); 5363} 5364 5365static int 5366zfs_freebsd_reclaim(ap) 5367 struct vop_reclaim_args /* { 5368 struct vnode *a_vp; 5369 struct thread *a_td; 5370 } */ *ap; 5371{ 5372 vnode_t *vp = ap->a_vp; 5373 znode_t *zp = VTOZ(vp); 5374 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5375 5376 ASSERT(zp != NULL); 5377 5378 /* Destroy the vm object and flush associated pages. 
*/ 5379 vnode_destroy_vobject(vp); 5380 5381 /* 5382 * z_teardown_inactive_lock protects from a race with 5383 * zfs_znode_dmu_fini in zfsvfs_teardown during 5384 * force unmount. 5385 */ 5386 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 5387 if (zp->z_sa_hdl == NULL) 5388 zfs_znode_free(zp); 5389 else 5390 zfs_zinactive(zp); 5391 rw_exit(&zfsvfs->z_teardown_inactive_lock); 5392 5393 vp->v_data = NULL; 5394 return (0); 5395} 5396 5397static int 5398zfs_freebsd_fid(ap) 5399 struct vop_fid_args /* { 5400 struct vnode *a_vp; 5401 struct fid *a_fid; 5402 } */ *ap; 5403{ 5404 5405 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); 5406} 5407 5408static int 5409zfs_freebsd_pathconf(ap) 5410 struct vop_pathconf_args /* { 5411 struct vnode *a_vp; 5412 int a_name; 5413 register_t *a_retval; 5414 } */ *ap; 5415{ 5416 ulong_t val; 5417 int error; 5418 5419 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); 5420 if (error == 0) 5421 *ap->a_retval = val; 5422 else if (error == EOPNOTSUPP) 5423 error = vop_stdpathconf(ap); 5424 return (error); 5425} 5426 5427static int 5428zfs_freebsd_fifo_pathconf(ap) 5429 struct vop_pathconf_args /* { 5430 struct vnode *a_vp; 5431 int a_name; 5432 register_t *a_retval; 5433 } */ *ap; 5434{ 5435 5436 switch (ap->a_name) { 5437 case _PC_ACL_EXTENDED: 5438 case _PC_ACL_NFS4: 5439 case _PC_ACL_PATH_MAX: 5440 case _PC_MAC_PRESENT: 5441 return (zfs_freebsd_pathconf(ap)); 5442 default: 5443 return (fifo_specops.vop_pathconf(ap)); 5444 } 5445} 5446 5447/* 5448 * FreeBSD's extended attributes namespace defines file name prefix for ZFS' 5449 * extended attribute name: 5450 * 5451 * NAMESPACE PREFIX 5452 * system freebsd:system: 5453 * user (none, can be used to access ZFS fsattr(5) attributes 5454 * created on Solaris) 5455 */ 5456static int 5457zfs_create_attrname(int attrnamespace, const char *name, char *attrname, 5458 size_t size) 5459{ 5460 const char *namespace, *prefix, *suffix; 5461 5462 /* We don't allow 
'/' character in attribute name. */ 5463 if (strchr(name, '/') != NULL) 5464 return (EINVAL); 5465 /* We don't allow attribute names that start with "freebsd:" string. */ 5466 if (strncmp(name, "freebsd:", 8) == 0) 5467 return (EINVAL); 5468 5469 bzero(attrname, size); 5470 5471 switch (attrnamespace) { 5472 case EXTATTR_NAMESPACE_USER: 5473#if 0 5474 prefix = "freebsd:"; 5475 namespace = EXTATTR_NAMESPACE_USER_STRING; 5476 suffix = ":"; 5477#else 5478 /* 5479 * This is the default namespace by which we can access all 5480 * attributes created on Solaris. 5481 */ 5482 prefix = namespace = suffix = ""; 5483#endif 5484 break; 5485 case EXTATTR_NAMESPACE_SYSTEM: 5486 prefix = "freebsd:"; 5487 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; 5488 suffix = ":"; 5489 break; 5490 case EXTATTR_NAMESPACE_EMPTY: 5491 default: 5492 return (EINVAL); 5493 } 5494 if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, 5495 name) >= size) { 5496 return (ENAMETOOLONG); 5497 } 5498 return (0); 5499} 5500 5501/* 5502 * Vnode operating to retrieve a named extended attribute. 
5503 */ 5504static int 5505zfs_getextattr(struct vop_getextattr_args *ap) 5506/* 5507vop_getextattr { 5508 IN struct vnode *a_vp; 5509 IN int a_attrnamespace; 5510 IN const char *a_name; 5511 INOUT struct uio *a_uio; 5512 OUT size_t *a_size; 5513 IN struct ucred *a_cred; 5514 IN struct thread *a_td; 5515}; 5516*/ 5517{ 5518 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5519 struct thread *td = ap->a_td; 5520 struct nameidata nd; 5521 char attrname[255]; 5522 struct vattr va; 5523 vnode_t *xvp = NULL, *vp; 5524 int error, flags; 5525 5526 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5527 ap->a_cred, ap->a_td, VREAD); 5528 if (error != 0) 5529 return (error); 5530 5531 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5532 sizeof(attrname)); 5533 if (error != 0) 5534 return (error); 5535 5536 ZFS_ENTER(zfsvfs); 5537 5538 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5539 LOOKUP_XATTR); 5540 if (error != 0) { 5541 ZFS_EXIT(zfsvfs); 5542 return (error); 5543 } 5544 5545 flags = FREAD; 5546 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, 5547 xvp, td); 5548 error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); 5549 vp = nd.ni_vp; 5550 NDFREE(&nd, NDF_ONLY_PNBUF); 5551 if (error != 0) { 5552 ZFS_EXIT(zfsvfs); 5553 if (error == ENOENT) 5554 error = ENOATTR; 5555 return (error); 5556 } 5557 5558 if (ap->a_size != NULL) { 5559 error = VOP_GETATTR(vp, &va, ap->a_cred); 5560 if (error == 0) 5561 *ap->a_size = (size_t)va.va_size; 5562 } else if (ap->a_uio != NULL) 5563 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); 5564 5565 VOP_UNLOCK(vp, 0); 5566 vn_close(vp, flags, ap->a_cred, td); 5567 ZFS_EXIT(zfsvfs); 5568 5569 return (error); 5570} 5571 5572/* 5573 * Vnode operation to remove a named attribute. 
5574 */ 5575int 5576zfs_deleteextattr(struct vop_deleteextattr_args *ap) 5577/* 5578vop_deleteextattr { 5579 IN struct vnode *a_vp; 5580 IN int a_attrnamespace; 5581 IN const char *a_name; 5582 IN struct ucred *a_cred; 5583 IN struct thread *a_td; 5584}; 5585*/ 5586{ 5587 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5588 struct thread *td = ap->a_td; 5589 struct nameidata nd; 5590 char attrname[255]; 5591 struct vattr va; 5592 vnode_t *xvp = NULL, *vp; 5593 int error, flags; 5594 5595 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5596 ap->a_cred, ap->a_td, VWRITE); 5597 if (error != 0) 5598 return (error); 5599 5600 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5601 sizeof(attrname)); 5602 if (error != 0) 5603 return (error); 5604 5605 ZFS_ENTER(zfsvfs); 5606 5607 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5608 LOOKUP_XATTR); 5609 if (error != 0) { 5610 ZFS_EXIT(zfsvfs); 5611 return (error); 5612 } 5613 5614 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, 5615 UIO_SYSSPACE, attrname, xvp, td); 5616 error = namei(&nd); 5617 vp = nd.ni_vp; 5618 if (error != 0) { 5619 ZFS_EXIT(zfsvfs); 5620 NDFREE(&nd, NDF_ONLY_PNBUF); 5621 if (error == ENOENT) 5622 error = ENOATTR; 5623 return (error); 5624 } 5625 5626 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); 5627 NDFREE(&nd, NDF_ONLY_PNBUF); 5628 5629 vput(nd.ni_dvp); 5630 if (vp == nd.ni_dvp) 5631 vrele(vp); 5632 else 5633 vput(vp); 5634 ZFS_EXIT(zfsvfs); 5635 5636 return (error); 5637} 5638 5639/* 5640 * Vnode operation to set a named attribute. 
5641 */ 5642static int 5643zfs_setextattr(struct vop_setextattr_args *ap) 5644/* 5645vop_setextattr { 5646 IN struct vnode *a_vp; 5647 IN int a_attrnamespace; 5648 IN const char *a_name; 5649 INOUT struct uio *a_uio; 5650 IN struct ucred *a_cred; 5651 IN struct thread *a_td; 5652}; 5653*/ 5654{ 5655 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5656 struct thread *td = ap->a_td; 5657 struct nameidata nd; 5658 char attrname[255]; 5659 struct vattr va; 5660 vnode_t *xvp = NULL, *vp; 5661 int error, flags; 5662 5663 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5664 ap->a_cred, ap->a_td, VWRITE); 5665 if (error != 0) 5666 return (error); 5667 5668 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5669 sizeof(attrname)); 5670 if (error != 0) 5671 return (error); 5672 5673 ZFS_ENTER(zfsvfs); 5674 5675 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5676 LOOKUP_XATTR | CREATE_XATTR_DIR); 5677 if (error != 0) { 5678 ZFS_EXIT(zfsvfs); 5679 return (error); 5680 } 5681 5682 flags = FFLAGS(O_WRONLY | O_CREAT); 5683 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, 5684 xvp, td); 5685 error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); 5686 vp = nd.ni_vp; 5687 NDFREE(&nd, NDF_ONLY_PNBUF); 5688 if (error != 0) { 5689 ZFS_EXIT(zfsvfs); 5690 return (error); 5691 } 5692 5693 VATTR_NULL(&va); 5694 va.va_size = 0; 5695 error = VOP_SETATTR(vp, &va, ap->a_cred); 5696 if (error == 0) 5697 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); 5698 5699 VOP_UNLOCK(vp, 0); 5700 vn_close(vp, flags, ap->a_cred, td); 5701 ZFS_EXIT(zfsvfs); 5702 5703 return (error); 5704} 5705 5706/* 5707 * Vnode operation to retrieve extended attributes on a vnode. 
5708 */ 5709static int 5710zfs_listextattr(struct vop_listextattr_args *ap) 5711/* 5712vop_listextattr { 5713 IN struct vnode *a_vp; 5714 IN int a_attrnamespace; 5715 INOUT struct uio *a_uio; 5716 OUT size_t *a_size; 5717 IN struct ucred *a_cred; 5718 IN struct thread *a_td; 5719}; 5720*/ 5721{ 5722 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5723 struct thread *td = ap->a_td; 5724 struct nameidata nd; 5725 char attrprefix[16]; 5726 u_char dirbuf[sizeof(struct dirent)]; 5727 struct dirent *dp; 5728 struct iovec aiov; 5729 struct uio auio, *uio = ap->a_uio; 5730 size_t *sizep = ap->a_size; 5731 size_t plen; 5732 vnode_t *xvp = NULL, *vp; 5733 int done, error, eof, pos; 5734 5735 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5736 ap->a_cred, ap->a_td, VREAD); 5737 if (error != 0) 5738 return (error); 5739 5740 error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, 5741 sizeof(attrprefix)); 5742 if (error != 0) 5743 return (error); 5744 plen = strlen(attrprefix); 5745 5746 ZFS_ENTER(zfsvfs); 5747 5748 if (sizep != NULL) 5749 *sizep = 0; 5750 5751 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5752 LOOKUP_XATTR); 5753 if (error != 0) { 5754 ZFS_EXIT(zfsvfs); 5755 /* 5756 * ENOATTR means that the EA directory does not yet exist, 5757 * i.e. there are no extended attributes there. 
5758 */ 5759 if (error == ENOATTR) 5760 error = 0; 5761 return (error); 5762 } 5763 5764 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, 5765 UIO_SYSSPACE, ".", xvp, td); 5766 error = namei(&nd); 5767 vp = nd.ni_vp; 5768 NDFREE(&nd, NDF_ONLY_PNBUF); 5769 if (error != 0) { 5770 ZFS_EXIT(zfsvfs); 5771 return (error); 5772 } 5773 5774 auio.uio_iov = &aiov; 5775 auio.uio_iovcnt = 1; 5776 auio.uio_segflg = UIO_SYSSPACE; 5777 auio.uio_td = td; 5778 auio.uio_rw = UIO_READ; 5779 auio.uio_offset = 0; 5780 5781 do { 5782 u_char nlen; 5783 5784 aiov.iov_base = (void *)dirbuf; 5785 aiov.iov_len = sizeof(dirbuf); 5786 auio.uio_resid = sizeof(dirbuf); 5787 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); 5788 done = sizeof(dirbuf) - auio.uio_resid; 5789 if (error != 0) 5790 break; 5791 for (pos = 0; pos < done;) { 5792 dp = (struct dirent *)(dirbuf + pos); 5793 pos += dp->d_reclen; 5794 /* 5795 * XXX: Temporarily we also accept DT_UNKNOWN, as this 5796 * is what we get when attribute was created on Solaris. 5797 */ 5798 if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) 5799 continue; 5800 if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) 5801 continue; 5802 else if (strncmp(dp->d_name, attrprefix, plen) != 0) 5803 continue; 5804 nlen = dp->d_namlen - plen; 5805 if (sizep != NULL) 5806 *sizep += 1 + nlen; 5807 else if (uio != NULL) { 5808 /* 5809 * Format of extattr name entry is one byte for 5810 * length and the rest for name. 
5811 */ 5812 error = uiomove(&nlen, 1, uio->uio_rw, uio); 5813 if (error == 0) { 5814 error = uiomove(dp->d_name + plen, nlen, 5815 uio->uio_rw, uio); 5816 } 5817 if (error != 0) 5818 break; 5819 } 5820 } 5821 } while (!eof && error == 0); 5822 5823 vput(vp); 5824 ZFS_EXIT(zfsvfs); 5825 5826 return (error); 5827} 5828 5829int 5830zfs_freebsd_getacl(ap) 5831 struct vop_getacl_args /* { 5832 struct vnode *vp; 5833 acl_type_t type; 5834 struct acl *aclp; 5835 struct ucred *cred; 5836 struct thread *td; 5837 } */ *ap; 5838{ 5839 int error; 5840 vsecattr_t vsecattr; 5841 5842 if (ap->a_type != ACL_TYPE_NFS4) 5843 return (EINVAL); 5844 5845 vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; 5846 if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) 5847 return (error); 5848 5849 error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); 5850 if (vsecattr.vsa_aclentp != NULL) 5851 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); 5852 5853 return (error); 5854} 5855 5856int 5857zfs_freebsd_setacl(ap) 5858 struct vop_setacl_args /* { 5859 struct vnode *vp; 5860 acl_type_t type; 5861 struct acl *aclp; 5862 struct ucred *cred; 5863 struct thread *td; 5864 } */ *ap; 5865{ 5866 int error; 5867 vsecattr_t vsecattr; 5868 int aclbsize; /* size of acl list in bytes */ 5869 aclent_t *aaclp; 5870 5871 if (ap->a_type != ACL_TYPE_NFS4) 5872 return (EINVAL); 5873 5874 if (ap->a_aclp == NULL) 5875 return (EINVAL); 5876 5877 if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) 5878 return (EINVAL); 5879 5880 /* 5881 * With NFSv4 ACLs, chmod(2) may need to add additional entries, 5882 * splitting every entry into two and appending "canonical six" 5883 * entries at the end. Don't allow for setting an ACL that would 5884 * cause chmod(2) to run out of ACL entries. 
5885 */ 5886 if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) 5887 return (ENOSPC); 5888 5889 error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); 5890 if (error != 0) 5891 return (error); 5892 5893 vsecattr.vsa_mask = VSA_ACE; 5894 aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); 5895 vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); 5896 aaclp = vsecattr.vsa_aclentp; 5897 vsecattr.vsa_aclentsz = aclbsize; 5898 5899 aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); 5900 error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); 5901 kmem_free(aaclp, aclbsize); 5902 5903 return (error); 5904} 5905 5906int 5907zfs_freebsd_aclcheck(ap) 5908 struct vop_aclcheck_args /* { 5909 struct vnode *vp; 5910 acl_type_t type; 5911 struct acl *aclp; 5912 struct ucred *cred; 5913 struct thread *td; 5914 } */ *ap; 5915{ 5916 5917 return (EOPNOTSUPP); 5918} 5919 5920static int 5921zfs_vptocnp(struct vop_vptocnp_args *ap) 5922{ 5923 vnode_t *covered_vp; 5924 vnode_t *vp = ap->a_vp;; 5925 zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; 5926 znode_t *zp = VTOZ(vp); 5927 uint64_t parent; 5928 int ltype; 5929 int error; 5930 5931 ZFS_ENTER(zfsvfs); 5932 ZFS_VERIFY_ZP(zp); 5933 5934 /* 5935 * If we are a snapshot mounted under .zfs, run the operation 5936 * on the covered vnode. 
5937 */ 5938 if ((error = sa_lookup(zp->z_sa_hdl, 5939 SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { 5940 ZFS_EXIT(zfsvfs); 5941 return (error); 5942 } 5943 5944 if (zp->z_id != parent || zfsvfs->z_parent == zfsvfs) { 5945 char name[MAXNAMLEN + 1]; 5946 znode_t *dzp; 5947 size_t len; 5948 5949 error = zfs_znode_parent_and_name(zp, &dzp, name); 5950 if (error == 0) { 5951 len = strlen(name); 5952 if (*ap->a_buflen < len) 5953 error = SET_ERROR(ENOMEM); 5954 } 5955 if (error == 0) { 5956 *ap->a_buflen -= len; 5957 bcopy(name, ap->a_buf + *ap->a_buflen, len); 5958 *ap->a_vpp = ZTOV(dzp); 5959 } 5960 ZFS_EXIT(zfsvfs); 5961 return (error); 5962 } 5963 ZFS_EXIT(zfsvfs); 5964 5965 covered_vp = vp->v_mount->mnt_vnodecovered; 5966 vhold(covered_vp); 5967 ltype = VOP_ISLOCKED(vp); 5968 VOP_UNLOCK(vp, 0); 5969 error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); 5970 if (error == 0) { 5971 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, 5972 ap->a_buf, ap->a_buflen); 5973 vput(covered_vp); 5974 } 5975 vn_lock(vp, ltype | LK_RETRY); 5976 if ((vp->v_iflag & VI_DOOMED) != 0) 5977 error = SET_ERROR(ENOENT); 5978 return (error); 5979} 5980 5981#ifdef DIAGNOSTIC 5982static int 5983zfs_lock(ap) 5984 struct vop_lock1_args /* { 5985 struct vnode *a_vp; 5986 int a_flags; 5987 char *file; 5988 int line; 5989 } */ *ap; 5990{ 5991 vnode_t *vp; 5992 znode_t *zp; 5993 int err; 5994 5995 err = vop_stdlock(ap); 5996 if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { 5997 vp = ap->a_vp; 5998 zp = vp->v_data; 5999 if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && 6000 zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) 6001 VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); 6002 } 6003 return (err); 6004} 6005#endif 6006 6007struct vop_vector zfs_vnodeops; 6008struct vop_vector zfs_fifoops; 6009struct vop_vector zfs_shareops; 6010 6011struct vop_vector zfs_vnodeops = { 6012 .vop_default = &default_vnodeops, 6013 .vop_inactive = zfs_freebsd_inactive, 6014 
.vop_reclaim = zfs_freebsd_reclaim, 6015 .vop_access = zfs_freebsd_access, 6016 .vop_lookup = zfs_cache_lookup, 6017 .vop_cachedlookup = zfs_freebsd_lookup, 6018 .vop_getattr = zfs_freebsd_getattr, 6019 .vop_setattr = zfs_freebsd_setattr, 6020 .vop_create = zfs_freebsd_create, 6021 .vop_mknod = zfs_freebsd_create, 6022 .vop_mkdir = zfs_freebsd_mkdir, 6023 .vop_readdir = zfs_freebsd_readdir, 6024 .vop_fsync = zfs_freebsd_fsync, 6025 .vop_open = zfs_freebsd_open, 6026 .vop_close = zfs_freebsd_close, 6027 .vop_rmdir = zfs_freebsd_rmdir, 6028 .vop_ioctl = zfs_freebsd_ioctl, 6029 .vop_link = zfs_freebsd_link, 6030 .vop_symlink = zfs_freebsd_symlink, 6031 .vop_readlink = zfs_freebsd_readlink, 6032 .vop_read = zfs_freebsd_read, 6033 .vop_write = zfs_freebsd_write, 6034 .vop_remove = zfs_freebsd_remove, 6035 .vop_rename = zfs_freebsd_rename, 6036 .vop_pathconf = zfs_freebsd_pathconf, 6037 .vop_bmap = zfs_freebsd_bmap, 6038 .vop_fid = zfs_freebsd_fid, 6039 .vop_getextattr = zfs_getextattr, 6040 .vop_deleteextattr = zfs_deleteextattr, 6041 .vop_setextattr = zfs_setextattr, 6042 .vop_listextattr = zfs_listextattr, 6043 .vop_getacl = zfs_freebsd_getacl, 6044 .vop_setacl = zfs_freebsd_setacl, 6045 .vop_aclcheck = zfs_freebsd_aclcheck, 6046 .vop_getpages = zfs_freebsd_getpages, 6047 .vop_putpages = zfs_freebsd_putpages, 6048 .vop_vptocnp = zfs_vptocnp, 6049#ifdef DIAGNOSTIC 6050 .vop_lock1 = zfs_lock, 6051#endif 6052}; 6053 6054struct vop_vector zfs_fifoops = { 6055 .vop_default = &fifo_specops, 6056 .vop_fsync = zfs_freebsd_fsync, 6057 .vop_access = zfs_freebsd_access, 6058 .vop_getattr = zfs_freebsd_getattr, 6059 .vop_inactive = zfs_freebsd_inactive, 6060 .vop_read = VOP_PANIC, 6061 .vop_reclaim = zfs_freebsd_reclaim, 6062 .vop_setattr = zfs_freebsd_setattr, 6063 .vop_write = VOP_PANIC, 6064 .vop_pathconf = zfs_freebsd_fifo_pathconf, 6065 .vop_fid = zfs_freebsd_fid, 6066 .vop_getacl = zfs_freebsd_getacl, 6067 .vop_setacl = zfs_freebsd_setacl, 6068 .vop_aclcheck = 
zfs_freebsd_aclcheck, 6069}; 6070 6071/* 6072 * special share hidden files vnode operations template 6073 */ 6074struct vop_vector zfs_shareops = { 6075 .vop_default = &default_vnodeops, 6076 .vop_access = zfs_freebsd_access, 6077 .vop_inactive = zfs_freebsd_inactive, 6078 .vop_reclaim = zfs_freebsd_reclaim, 6079 .vop_fid = zfs_freebsd_fid, 6080 .vop_pathconf = zfs_freebsd_pathconf, 6081}; 6082