zfs_vnops.c revision 197153
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26/* Portions Copyright 2007 Jeremy Teo */ 27 28#include <sys/types.h> 29#include <sys/param.h> 30#include <sys/time.h> 31#include <sys/systm.h> 32#include <sys/sysmacros.h> 33#include <sys/resource.h> 34#include <sys/vfs.h> 35#include <sys/vnode.h> 36#include <sys/file.h> 37#include <sys/stat.h> 38#include <sys/kmem.h> 39#include <sys/taskq.h> 40#include <sys/uio.h> 41#include <sys/atomic.h> 42#include <sys/namei.h> 43#include <sys/mman.h> 44#include <sys/cmn_err.h> 45#include <sys/errno.h> 46#include <sys/unistd.h> 47#include <sys/zfs_dir.h> 48#include <sys/zfs_ioctl.h> 49#include <sys/fs/zfs.h> 50#include <sys/dmu.h> 51#include <sys/spa.h> 52#include <sys/txg.h> 53#include <sys/dbuf.h> 54#include <sys/zap.h> 55#include <sys/dirent.h> 56#include <sys/policy.h> 57#include <sys/sunddi.h> 58#include <sys/filio.h> 59#include <sys/zfs_ctldir.h> 60#include <sys/zfs_fuid.h> 61#include <sys/dnlc.h> 62#include <sys/zfs_rlock.h> 63#include <sys/extdirent.h> 64#include <sys/kidmap.h> 65#include <sys/bio.h> 66#include <sys/buf.h> 67#include <sys/sf_buf.h> 68#include <sys/sched.h> 69#include <sys/acl.h> 70 71/* 72 * Programming rules. 73 * 74 * Each vnode op performs some logical unit of work. To do this, the ZPL must 75 * properly lock its in-core state, create a DMU transaction, do the work, 76 * record this work in the intent log (ZIL), commit the DMU transaction, 77 * and wait for the intent log to commit if it is a synchronous operation. 78 * Moreover, the vnode ops must work in both normal and log replay context. 79 * The ordering of events is important to avoid deadlocks and references 80 * to freed memory. The example below illustrates the following Big Rules: 81 * 82 * (1) A check must be made in each zfs thread for a mounted file system. 83 * This is done avoiding races using ZFS_ENTER(zfsvfs). 84 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 85 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 86 * can return EIO from the calling function. 87 * 88 * (2) VN_RELE() should always be the last thing except for zil_commit() 89 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 90 * First, if it's the last reference, the vnode/znode 91 * can be freed, so the zp may point to freed memory. Second, the last 92 * reference will call zfs_zinactive(), which may induce a lot of work -- 93 * pushing cached pages (which acquires range locks) and syncing out 94 * cached atime changes. Third, zfs_zinactive() may require a new tx, 95 * which could deadlock the system if you were already holding one. 96 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 97 * 98 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 99 * as they can span dmu_tx_assign() calls. 100 * 101 * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). 102 * In normal operation, this will be TXG_NOWAIT. During ZIL replay, 103 * it will be a specific txg. Either way, dmu_tx_assign() never blocks. 104 * This is critical because we don't want to block while holding locks. 105 * Note, in particular, that if a lock is sometimes acquired before 106 * the tx assigns, and sometimes after (e.g. z_lock), then failing to 107 * use a non-blocking assign can deadlock the system. The scenario: 108 * 109 * Thread A has grabbed a lock before calling dmu_tx_assign(). 110 * Thread B is in an already-assigned tx, and blocks for this lock. 111 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 112 * forever, because the previous txg can't quiesce until B's tx commits. 113 * 114 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 115 * then drop all locks, call dmu_tx_wait(), and try again. 116 * 117 * (5) If the operation succeeded, generate the intent log entry for it 118 * before dropping locks. This ensures that the ordering of events 119 * in the intent log matches the order in which they actually occurred. 120 * 121 * (6) At the end of each vnode op, the DMU tx must always commit, 122 * regardless of whether there were any errors. 123 * 124 * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) 125 * to ensure that synchronous semantics are provided when necessary. 126 * 127 * In general, this is how things should be ordered in each vnode op: 128 * 129 * ZFS_ENTER(zfsvfs); // exit if unmounted 130 * top: 131 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) 132 * rw_enter(...); // grab any other locks you need 133 * tx = dmu_tx_create(...); // get DMU tx 134 * dmu_tx_hold_*(); // hold each object you might modify 135 * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign 136 * if (error) { 137 * rw_exit(...); // drop locks 138 * zfs_dirent_unlock(dl); // unlock directory entry 139 * VN_RELE(...); // release held vnodes 140 * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 141 * dmu_tx_wait(tx); 142 * dmu_tx_abort(tx); 143 * goto top; 144 * } 145 * dmu_tx_abort(tx); // abort DMU tx 146 * ZFS_EXIT(zfsvfs); // finished in zfs 147 * return (error); // really out of space 148 * } 149 * error = do_real_work(); // do whatever this VOP does 150 * if (error == 0) 151 * zfs_log_*(...); // on success, make ZIL entry 152 * dmu_tx_commit(tx); // commit DMU tx -- error or not 153 * rw_exit(...); // drop locks 154 * zfs_dirent_unlock(dl); // unlock directory entry 155 * VN_RELE(...); // release held vnodes 156 * zil_commit(zilog, seq, foid); // synchronous when necessary 157 * ZFS_EXIT(zfsvfs); // finished in zfs 158 * return (error); // done, report error 159 */ 160 161/* ARGSUSED */ 162static int 163zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 164{ 165 znode_t *zp = VTOZ(*vpp); 166 167 if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && 168 ((flag & FAPPEND) == 0)) { 169 return (EPERM); 170 } 171 172 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 173 ZTOV(zp)->v_type == VREG && 174 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 175 zp->z_phys->zp_size > 0) 176 if (fs_vscan(*vpp, cr, 0) != 0) 177 return (EACCES); 178 179 /* Keep a count of the synchronous opens in the znode */ 180 if (flag & (FSYNC | FDSYNC)) 181 atomic_inc_32(&zp->z_sync_cnt); 182 183 return (0); 184} 185 186/* ARGSUSED */ 187static int 188zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 189 caller_context_t *ct) 190{ 191 znode_t *zp = VTOZ(vp); 192 193 /* Decrement the synchronous opens in the znode */ 194 if ((flag & (FSYNC | FDSYNC)) && (count == 1)) 195 atomic_dec_32(&zp->z_sync_cnt); 196 197 /* 198 * Clean up any locks held by this process on the vp. 199 */ 200 cleanlocks(vp, ddi_get_pid(), 0); 201 cleanshares(vp, ddi_get_pid()); 202 203 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 204 ZTOV(zp)->v_type == VREG && 205 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 206 zp->z_phys->zp_size > 0) 207 VERIFY(fs_vscan(vp, cr, 1) == 0); 208 209 return (0); 210} 211 212/* 213 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 214 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 215 */ 216static int 217zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) 218{ 219 znode_t *zp = VTOZ(vp); 220 uint64_t noff = (uint64_t)*off; /* new offset */ 221 uint64_t file_sz; 222 int error; 223 boolean_t hole; 224 225 file_sz = zp->z_phys->zp_size; 226 if (noff >= file_sz) { 227 return (ENXIO); 228 } 229 230 if (cmd == _FIO_SEEK_HOLE) 231 hole = B_TRUE; 232 else 233 hole = B_FALSE; 234 235 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); 236 237 /* end of file? */ 238 if ((error == ESRCH) || (noff > file_sz)) { 239 /* 240 * Handle the virtual hole at the end of file. 241 */ 242 if (hole) { 243 *off = file_sz; 244 return (0); 245 } 246 return (ENXIO); 247 } 248 249 if (noff < *off) 250 return (error); 251 *off = noff; 252 return (error); 253} 254 255/* ARGSUSED */ 256static int 257zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, 258 int *rvalp, caller_context_t *ct) 259{ 260 offset_t off; 261 int error; 262 zfsvfs_t *zfsvfs; 263 znode_t *zp; 264 265 switch (com) { 266 case _FIOFFS: 267 return (0); 268 269 /* 270 * The following two ioctls are used by bfu. Faking out, 271 * necessary to avoid bfu errors. 272 */ 273 case _FIOGDIO: 274 case _FIOSDIO: 275 return (0); 276 277 case _FIO_SEEK_DATA: 278 case _FIO_SEEK_HOLE: 279 if (ddi_copyin((void *)data, &off, sizeof (off), flag)) 280 return (EFAULT); 281 282 zp = VTOZ(vp); 283 zfsvfs = zp->z_zfsvfs; 284 ZFS_ENTER(zfsvfs); 285 ZFS_VERIFY_ZP(zp); 286 287 /* offset parameter is in/out */ 288 error = zfs_holey(vp, com, &off); 289 ZFS_EXIT(zfsvfs); 290 if (error) 291 return (error); 292 if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) 293 return (EFAULT); 294 return (0); 295 } 296 return (ENOTTY); 297} 298 299/* 300 * When a file is memory mapped, we must keep the IO data synchronized 301 * between the DMU cache and the memory mapped pages. What this means: 302 * 303 * On Write: If we find a memory mapped page, we write to *both* 304 * the page and the dmu buffer. 305 * 306 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 307 * the file is memory mapped. 308 */ 309static int 310mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) 311{ 312 znode_t *zp = VTOZ(vp); 313 objset_t *os = zp->z_zfsvfs->z_os; 314 vm_object_t obj; 315 vm_page_t m; 316 struct sf_buf *sf; 317 int64_t start, off; 318 int len = nbytes; 319 int error = 0; 320 uint64_t dirbytes; 321 322 ASSERT(vp->v_mount != NULL); 323 obj = vp->v_object; 324 ASSERT(obj != NULL); 325 326 start = uio->uio_loffset; 327 off = start & PAGEOFFSET; 328 dirbytes = 0; 329 VM_OBJECT_LOCK(obj); 330 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 331 uint64_t bytes = MIN(PAGESIZE - off, len); 332 uint64_t fsize; 333 334again: 335 if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && 336 vm_page_is_valid(m, (vm_offset_t)off, bytes)) { 337 uint64_t woff; 338 caddr_t va; 339 340 if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb")) 341 goto again; 342 fsize = obj->un_pager.vnp.vnp_size; 343 vm_page_busy(m); 344 vm_page_lock_queues(); 345 vm_page_undirty(m); 346 vm_page_unlock_queues(); 347 VM_OBJECT_UNLOCK(obj); 348 if (dirbytes > 0) { 349 error = dmu_write_uio(os, zp->z_id, uio, 350 dirbytes, tx); 351 dirbytes = 0; 352 } 353 if (error == 0) { 354 sched_pin(); 355 sf = sf_buf_alloc(m, SFB_CPUPRIVATE); 356 va = (caddr_t)sf_buf_kva(sf); 357 woff = uio->uio_loffset - off; 358 error = uiomove(va + off, bytes, UIO_WRITE, uio); 359 /* 360 * The uiomove() above could have been partially 361 * successful, that's why we call dmu_write() 362 * below unconditionally. The page was marked 363 * non-dirty above and we would lose the changes 364 * without doing so. If the uiomove() failed 365 * entirely, well, we just write what we got 366 * before one more time. 367 */ 368 dmu_write(os, zp->z_id, woff, 369 MIN(PAGESIZE, fsize - woff), va, tx); 370 sf_buf_free(sf); 371 sched_unpin(); 372 } 373 VM_OBJECT_LOCK(obj); 374 vm_page_wakeup(m); 375 } else { 376 if (__predict_false(obj->cache != NULL)) { 377 vm_page_cache_free(obj, OFF_TO_IDX(start), 378 OFF_TO_IDX(start) + 1); 379 } 380 dirbytes += bytes; 381 } 382 len -= bytes; 383 off = 0; 384 if (error) 385 break; 386 } 387 VM_OBJECT_UNLOCK(obj); 388 if (error == 0 && dirbytes > 0) 389 error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx); 390 return (error); 391} 392 393/* 394 * When a file is memory mapped, we must keep the IO data synchronized 395 * between the DMU cache and the memory mapped pages. What this means: 396 * 397 * On Read: We "read" preferentially from memory mapped pages, 398 * else we default from the dmu buffer. 399 * 400 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 401 * the file is memory mapped. 402 */ 403static int 404mappedread(vnode_t *vp, int nbytes, uio_t *uio) 405{ 406 znode_t *zp = VTOZ(vp); 407 objset_t *os = zp->z_zfsvfs->z_os; 408 vm_object_t obj; 409 vm_page_t m; 410 struct sf_buf *sf; 411 int64_t start, off; 412 caddr_t va; 413 int len = nbytes; 414 int error = 0; 415 uint64_t dirbytes; 416 417 ASSERT(vp->v_mount != NULL); 418 obj = vp->v_object; 419 ASSERT(obj != NULL); 420 421 start = uio->uio_loffset; 422 off = start & PAGEOFFSET; 423 dirbytes = 0; 424 VM_OBJECT_LOCK(obj); 425 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 426 uint64_t bytes = MIN(PAGESIZE - off, len); 427 428again: 429 if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && 430 vm_page_is_valid(m, (vm_offset_t)off, bytes)) { 431 if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) 432 goto again; 433 vm_page_busy(m); 434 VM_OBJECT_UNLOCK(obj); 435 if (dirbytes > 0) { 436 error = dmu_read_uio(os, zp->z_id, uio, 437 dirbytes); 438 dirbytes = 0; 439 } 440 if (error == 0) { 441 sched_pin(); 442 sf = sf_buf_alloc(m, SFB_CPUPRIVATE); 443 va = (caddr_t)sf_buf_kva(sf); 444 error = uiomove(va + off, bytes, UIO_READ, uio); 445 sf_buf_free(sf); 446 sched_unpin(); 447 } 448 VM_OBJECT_LOCK(obj); 449 vm_page_wakeup(m); 450 } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) { 451 /* 452 * The code below is here to make sendfile(2) work 453 * correctly with ZFS. As pointed out by ups@ 454 * sendfile(2) should be changed to use VOP_GETPAGES(), 455 * but it pessimize performance of sendfile/UFS, that's 456 * why I handle this special case in ZFS code. 457 */ 458 if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) 459 goto again; 460 vm_page_busy(m); 461 VM_OBJECT_UNLOCK(obj); 462 if (dirbytes > 0) { 463 error = dmu_read_uio(os, zp->z_id, uio, 464 dirbytes); 465 dirbytes = 0; 466 } 467 if (error == 0) { 468 sched_pin(); 469 sf = sf_buf_alloc(m, SFB_CPUPRIVATE); 470 va = (caddr_t)sf_buf_kva(sf); 471 error = dmu_read(os, zp->z_id, start + off, 472 bytes, (void *)(va + off)); 473 sf_buf_free(sf); 474 sched_unpin(); 475 } 476 VM_OBJECT_LOCK(obj); 477 vm_page_wakeup(m); 478 if (error == 0) 479 uio->uio_resid -= bytes; 480 } else { 481 dirbytes += bytes; 482 } 483 len -= bytes; 484 off = 0; 485 if (error) 486 break; 487 } 488 VM_OBJECT_UNLOCK(obj); 489 if (error == 0 && dirbytes > 0) 490 error = dmu_read_uio(os, zp->z_id, uio, dirbytes); 491 return (error); 492} 493 494offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 495 496/* 497 * Read bytes from specified file into supplied buffer. 498 * 499 * IN: vp - vnode of file to be read from. 500 * uio - structure supplying read location, range info, 501 * and return buffer. 502 * ioflag - SYNC flags; used to provide FRSYNC semantics. 503 * cr - credentials of caller. 504 * ct - caller context 505 * 506 * OUT: uio - updated offset and range, buffer filled. 507 * 508 * RETURN: 0 if success 509 * error code if failure 510 * 511 * Side Effects: 512 * vp - atime updated if byte count > 0 513 */ 514/* ARGSUSED */ 515static int 516zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 517{ 518 znode_t *zp = VTOZ(vp); 519 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 520 objset_t *os; 521 ssize_t n, nbytes; 522 int error; 523 rl_t *rl; 524 525 ZFS_ENTER(zfsvfs); 526 ZFS_VERIFY_ZP(zp); 527 os = zfsvfs->z_os; 528 529 if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { 530 ZFS_EXIT(zfsvfs); 531 return (EACCES); 532 } 533 534 /* 535 * Validate file offset 536 */ 537 if (uio->uio_loffset < (offset_t)0) { 538 ZFS_EXIT(zfsvfs); 539 return (EINVAL); 540 } 541 542 /* 543 * Fasttrack empty reads 544 */ 545 if (uio->uio_resid == 0) { 546 ZFS_EXIT(zfsvfs); 547 return (0); 548 } 549 550 /* 551 * Check for mandatory locks 552 */ 553 if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { 554 if (error = chklock(vp, FREAD, 555 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 556 ZFS_EXIT(zfsvfs); 557 return (error); 558 } 559 } 560 561 /* 562 * If we're in FRSYNC mode, sync out this znode before reading it. 563 */ 564 if (ioflag & FRSYNC) 565 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); 566 567 /* 568 * Lock the range against changes. 569 */ 570 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); 571 572 /* 573 * If we are reading past end-of-file we can skip 574 * to the end; but we might still need to set atime. 575 */ 576 if (uio->uio_loffset >= zp->z_phys->zp_size) { 577 error = 0; 578 goto out; 579 } 580 581 ASSERT(uio->uio_loffset < zp->z_phys->zp_size); 582 n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); 583 584 while (n > 0) { 585 nbytes = MIN(n, zfs_read_chunk_size - 586 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 587 588 if (vn_has_cached_data(vp)) 589 error = mappedread(vp, nbytes, uio); 590 else 591 error = dmu_read_uio(os, zp->z_id, uio, nbytes); 592 if (error) { 593 /* convert checksum errors into IO errors */ 594 if (error == ECKSUM) 595 error = EIO; 596 break; 597 } 598 599 n -= nbytes; 600 } 601 602out: 603 zfs_range_unlock(rl); 604 605 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 606 ZFS_EXIT(zfsvfs); 607 return (error); 608} 609 610/* 611 * Fault in the pages of the first n bytes specified by the uio structure. 612 * 1 byte in each page is touched and the uio struct is unmodified. 613 * Any error will exit this routine as this is only a best 614 * attempt to get the pages resident. This is a copy of ufs_trans_touch(). 615 */ 616static void 617zfs_prefault_write(ssize_t n, struct uio *uio) 618{ 619 struct iovec *iov; 620 ulong_t cnt, incr; 621 caddr_t p; 622 623 if (uio->uio_segflg != UIO_USERSPACE) 624 return; 625 626 iov = uio->uio_iov; 627 628 while (n) { 629 cnt = MIN(iov->iov_len, n); 630 if (cnt == 0) { 631 /* empty iov entry */ 632 iov++; 633 continue; 634 } 635 n -= cnt; 636 /* 637 * touch each page in this segment. 638 */ 639 p = iov->iov_base; 640 while (cnt) { 641 if (fubyte(p) == -1) 642 return; 643 incr = MIN(cnt, PAGESIZE); 644 p += incr; 645 cnt -= incr; 646 } 647 /* 648 * touch the last byte in case it straddles a page. 649 */ 650 p--; 651 if (fubyte(p) == -1) 652 return; 653 iov++; 654 } 655} 656 657/* 658 * Write the bytes to a file. 659 * 660 * IN: vp - vnode of file to be written to. 661 * uio - structure supplying write location, range info, 662 * and data buffer. 663 * ioflag - IO_APPEND flag set if in append mode. 664 * cr - credentials of caller. 665 * ct - caller context (NFS/CIFS fem monitor only) 666 * 667 * OUT: uio - updated offset and range. 668 * 669 * RETURN: 0 if success 670 * error code if failure 671 * 672 * Timestamps: 673 * vp - ctime|mtime updated if byte count > 0 674 */ 675/* ARGSUSED */ 676static int 677zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 678{ 679 znode_t *zp = VTOZ(vp); 680 rlim64_t limit = MAXOFFSET_T; 681 ssize_t start_resid = uio->uio_resid; 682 ssize_t tx_bytes; 683 uint64_t end_size; 684 dmu_tx_t *tx; 685 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 686 zilog_t *zilog; 687 offset_t woff; 688 ssize_t n, nbytes; 689 rl_t *rl; 690 int max_blksz = zfsvfs->z_max_blksz; 691 uint64_t pflags; 692 int error; 693 694 /* 695 * Fasttrack empty write 696 */ 697 n = start_resid; 698 if (n == 0) 699 return (0); 700 701 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 702 limit = MAXOFFSET_T; 703 704 ZFS_ENTER(zfsvfs); 705 ZFS_VERIFY_ZP(zp); 706 707 /* 708 * If immutable or not appending then return EPERM 709 */ 710 pflags = zp->z_phys->zp_flags; 711 if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || 712 ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 713 (uio->uio_loffset < zp->z_phys->zp_size))) { 714 ZFS_EXIT(zfsvfs); 715 return (EPERM); 716 } 717 718 zilog = zfsvfs->z_log; 719 720 /* 721 * Pre-fault the pages to ensure slow (eg NFS) pages 722 * don't hold up txg. 723 */ 724 zfs_prefault_write(n, uio); 725 726 /* 727 * If in append mode, set the io offset pointer to eof. 728 */ 729 if (ioflag & IO_APPEND) { 730 /* 731 * Range lock for a file append: 732 * The value for the start of range will be determined by 733 * zfs_range_lock() (to guarantee append semantics). 734 * If this write will cause the block size to increase, 735 * zfs_range_lock() will lock the entire file, so we must 736 * later reduce the range after we grow the block size. 737 */ 738 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 739 if (rl->r_len == UINT64_MAX) { 740 /* overlocked, zp_size can't change */ 741 woff = uio->uio_loffset = zp->z_phys->zp_size; 742 } else { 743 woff = uio->uio_loffset = rl->r_off; 744 } 745 } else { 746 woff = uio->uio_loffset; 747 /* 748 * Validate file offset 749 */ 750 if (woff < 0) { 751 ZFS_EXIT(zfsvfs); 752 return (EINVAL); 753 } 754 755 /* 756 * If we need to grow the block size then zfs_range_lock() 757 * will lock a wider range than we request here. 758 * Later after growing the block size we reduce the range. 759 */ 760 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 761 } 762 763 if (woff >= limit) { 764 zfs_range_unlock(rl); 765 ZFS_EXIT(zfsvfs); 766 return (EFBIG); 767 } 768 769 if ((woff + n) > limit || woff > (limit - n)) 770 n = limit - woff; 771 772 /* 773 * Check for mandatory locks 774 */ 775 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && 776 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 777 zfs_range_unlock(rl); 778 ZFS_EXIT(zfsvfs); 779 return (error); 780 } 781 end_size = MAX(zp->z_phys->zp_size, woff + n); 782 783 /* 784 * Write the file in reasonable size chunks. Each chunk is written 785 * in a separate transaction; this keeps the intent log records small 786 * and allows us to do more fine-grained space accounting. 787 */ 788 while (n > 0) { 789 /* 790 * Start a transaction. 791 */ 792 woff = uio->uio_loffset; 793 tx = dmu_tx_create(zfsvfs->z_os); 794 dmu_tx_hold_bonus(tx, zp->z_id); 795 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 796 error = dmu_tx_assign(tx, zfsvfs->z_assign); 797 if (error) { 798 if (error == ERESTART && 799 zfsvfs->z_assign == TXG_NOWAIT) { 800 dmu_tx_wait(tx); 801 dmu_tx_abort(tx); 802 continue; 803 } 804 dmu_tx_abort(tx); 805 break; 806 } 807 808 /* 809 * If zfs_range_lock() over-locked we grow the blocksize 810 * and then reduce the lock range. This will only happen 811 * on the first iteration since zfs_range_reduce() will 812 * shrink down r_len to the appropriate size. 813 */ 814 if (rl->r_len == UINT64_MAX) { 815 uint64_t new_blksz; 816 817 if (zp->z_blksz > max_blksz) { 818 ASSERT(!ISP2(zp->z_blksz)); 819 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); 820 } else { 821 new_blksz = MIN(end_size, max_blksz); 822 } 823 zfs_grow_blocksize(zp, new_blksz, tx); 824 zfs_range_reduce(rl, woff, n); 825 } 826 827 /* 828 * XXX - should we really limit each write to z_max_blksz? 829 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 830 */ 831 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 832 833 if (woff + nbytes > zp->z_phys->zp_size) 834 vnode_pager_setsize(vp, woff + nbytes); 835 836 rw_enter(&zp->z_map_lock, RW_READER); 837 838 tx_bytes = uio->uio_resid; 839 if (vn_has_cached_data(vp)) { 840 rw_exit(&zp->z_map_lock); 841 error = mappedwrite(vp, nbytes, uio, tx); 842 } else { 843 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, 844 uio, nbytes, tx); 845 rw_exit(&zp->z_map_lock); 846 } 847 tx_bytes -= uio->uio_resid; 848 849 /* 850 * If we made no progress, we're done. If we made even 851 * partial progress, update the znode and ZIL accordingly. 852 */ 853 if (tx_bytes == 0) { 854 dmu_tx_commit(tx); 855 ASSERT(error != 0); 856 break; 857 } 858 859 /* 860 * Clear Set-UID/Set-GID bits on successful write if not 861 * privileged and at least one of the excute bits is set. 862 * 863 * It would be nice to to this after all writes have 864 * been done, but that would still expose the ISUID/ISGID 865 * to another app after the partial write is committed. 866 * 867 * Note: we don't call zfs_fuid_map_id() here because 868 * user 0 is not an ephemeral uid. 869 */ 870 mutex_enter(&zp->z_acl_lock); 871 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | 872 (S_IXUSR >> 6))) != 0 && 873 (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && 874 secpolicy_vnode_setid_retain(vp, cr, 875 (zp->z_phys->zp_mode & S_ISUID) != 0 && 876 zp->z_phys->zp_uid == 0) != 0) { 877 zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); 878 } 879 mutex_exit(&zp->z_acl_lock); 880 881 /* 882 * Update time stamp. NOTE: This marks the bonus buffer as 883 * dirty, so we don't have to do it again for zp_size. 884 */ 885 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 886 887 /* 888 * Update the file size (zp_size) if it has changed; 889 * account for possible concurrent updates. 890 */ 891 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) 892 (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, 893 uio->uio_loffset); 894 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 895 dmu_tx_commit(tx); 896 897 if (error != 0) 898 break; 899 ASSERT(tx_bytes == nbytes); 900 n -= nbytes; 901 } 902 903 zfs_range_unlock(rl); 904 905 /* 906 * If we're in replay mode, or we made no progress, return error. 907 * Otherwise, it's at least a partial write, so it's successful. 908 */ 909 if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { 910 ZFS_EXIT(zfsvfs); 911 return (error); 912 } 913 914 if (ioflag & (FSYNC | FDSYNC)) 915 zil_commit(zilog, zp->z_last_itx, zp->z_id); 916 917 ZFS_EXIT(zfsvfs); 918 return (0); 919} 920 921void 922zfs_get_done(dmu_buf_t *db, void *vzgd) 923{ 924 zgd_t *zgd = (zgd_t *)vzgd; 925 rl_t *rl = zgd->zgd_rl; 926 vnode_t *vp = ZTOV(rl->r_zp); 927 objset_t *os = rl->r_zp->z_zfsvfs->z_os; 928 int vfslocked; 929 930 vfslocked = VFS_LOCK_GIANT(vp->v_vfsp); 931 dmu_buf_rele(db, vzgd); 932 zfs_range_unlock(rl); 933 /* 934 * Release the vnode asynchronously as we currently have the 935 * txg stopped from syncing. 936 */ 937 VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 938 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 939 kmem_free(zgd, sizeof (zgd_t)); 940 VFS_UNLOCK_GIANT(vfslocked); 941} 942 943/* 944 * Get data to generate a TX_WRITE intent log record. 945 */ 946int 947zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 948{ 949 zfsvfs_t *zfsvfs = arg; 950 objset_t *os = zfsvfs->z_os; 951 znode_t *zp; 952 uint64_t off = lr->lr_offset; 953 dmu_buf_t *db; 954 rl_t *rl; 955 zgd_t *zgd; 956 int dlen = lr->lr_length; /* length of user data */ 957 int error = 0; 958 959 ASSERT(zio); 960 ASSERT(dlen != 0); 961 962 /* 963 * Nothing to do if the file has been removed 964 */ 965 if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) 966 return (ENOENT); 967 if (zp->z_unlinked) { 968 /* 969 * Release the vnode asynchronously as we currently have the 970 * txg stopped from syncing. 971 */ 972 VN_RELE_ASYNC(ZTOV(zp), 973 dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 974 return (ENOENT); 975 } 976 977 /* 978 * Write records come in two flavors: immediate and indirect. 979 * For small writes it's cheaper to store the data with the 980 * log record (immediate); for large writes it's cheaper to 981 * sync the data and get a pointer to it (indirect) so that 982 * we don't have to write the data twice. 983 */ 984 if (buf != NULL) { /* immediate write */ 985 rl = zfs_range_lock(zp, off, dlen, RL_READER); 986 /* test for truncation needs to be done while range locked */ 987 if (off >= zp->z_phys->zp_size) { 988 error = ENOENT; 989 goto out; 990 } 991 VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); 992 } else { /* indirect write */ 993 uint64_t boff; /* block starting offset */ 994 995 /* 996 * Have to lock the whole block to ensure when it's 997 * written out and it's checksum is being calculated 998 * that no one can change the data. We need to re-check 999 * blocksize after we get the lock in case it's changed! 1000 */ 1001 for (;;) { 1002 if (ISP2(zp->z_blksz)) { 1003 boff = P2ALIGN_TYPED(off, zp->z_blksz, 1004 uint64_t); 1005 } else { 1006 boff = 0; 1007 } 1008 dlen = zp->z_blksz; 1009 rl = zfs_range_lock(zp, boff, dlen, RL_READER); 1010 if (zp->z_blksz == dlen) 1011 break; 1012 zfs_range_unlock(rl); 1013 } 1014 /* test for truncation needs to be done while range locked */ 1015 if (off >= zp->z_phys->zp_size) { 1016 error = ENOENT; 1017 goto out; 1018 } 1019 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); 1020 zgd->zgd_rl = rl; 1021 zgd->zgd_zilog = zfsvfs->z_log; 1022 zgd->zgd_bp = &lr->lr_blkptr; 1023 VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); 1024 ASSERT(boff == db->db_offset); 1025 lr->lr_blkoff = off - boff; 1026 error = dmu_sync(zio, db, &lr->lr_blkptr, 1027 lr->lr_common.lrc_txg, zfs_get_done, zgd); 1028 ASSERT((error && error != EINPROGRESS) || 1029 lr->lr_length <= zp->z_blksz); 1030 if (error == 0) 1031 zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); 1032 /* 1033 * If we get EINPROGRESS, then we need to wait for a 1034 * write IO initiated by dmu_sync() to complete before 1035 * we can release this dbuf. We will finish everything 1036 * up in the zfs_get_done() callback. 1037 */ 1038 if (error == EINPROGRESS) 1039 return (0); 1040 dmu_buf_rele(db, zgd); 1041 kmem_free(zgd, sizeof (zgd_t)); 1042 } 1043out: 1044 zfs_range_unlock(rl); 1045 /* 1046 * Release the vnode asynchronously as we currently have the 1047 * txg stopped from syncing. 1048 */ 1049 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 1050 return (error); 1051} 1052 1053/*ARGSUSED*/ 1054static int 1055zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, 1056 caller_context_t *ct) 1057{ 1058 znode_t *zp = VTOZ(vp); 1059 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1060 int error; 1061 1062 ZFS_ENTER(zfsvfs); 1063 ZFS_VERIFY_ZP(zp); 1064 1065 if (flag & V_ACE_MASK) 1066 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); 1067 else 1068 error = zfs_zaccess_rwx(zp, mode, flag, cr); 1069 1070 ZFS_EXIT(zfsvfs); 1071 return (error); 1072} 1073 1074/* 1075 * Lookup an entry in a directory, or an extended attribute directory. 1076 * If it exists, return a held vnode reference for it. 1077 * 1078 * IN: dvp - vnode of directory to search. 1079 * nm - name of entry to lookup. 1080 * pnp - full pathname to lookup [UNUSED]. 1081 * flags - LOOKUP_XATTR set if looking for an attribute. 1082 * rdir - root directory vnode [UNUSED]. 1083 * cr - credentials of caller. 1084 * ct - caller context 1085 * direntflags - directory lookup flags 1086 * realpnp - returned pathname. 1087 * 1088 * OUT: vpp - vnode of located entry, NULL if not found. 1089 * 1090 * RETURN: 0 if success 1091 * error code if failure 1092 * 1093 * Timestamps: 1094 * NA 1095 */ 1096/* ARGSUSED */ 1097static int 1098zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, 1099 int nameiop, cred_t *cr, kthread_t *td, int flags) 1100{ 1101 znode_t *zdp = VTOZ(dvp); 1102 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1103 int error; 1104 int *direntflags = NULL; 1105 void *realpnp = NULL; 1106 1107 ZFS_ENTER(zfsvfs); 1108 ZFS_VERIFY_ZP(zdp); 1109 1110 *vpp = NULL; 1111 1112 if (flags & LOOKUP_XATTR) { 1113#ifdef TODO 1114 /* 1115 * If the xattr property is off, refuse the lookup request. 1116 */ 1117 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1118 ZFS_EXIT(zfsvfs); 1119 return (EINVAL); 1120 } 1121#endif 1122 1123 /* 1124 * We don't allow recursive attributes.. 1125 * Maybe someday we will. 1126 */ 1127 if (zdp->z_phys->zp_flags & ZFS_XATTR) { 1128 ZFS_EXIT(zfsvfs); 1129 return (EINVAL); 1130 } 1131 1132 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1133 ZFS_EXIT(zfsvfs); 1134 return (error); 1135 } 1136 1137 /* 1138 * Do we have permission to get into attribute directory? 1139 */ 1140 1141 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1142 B_FALSE, cr)) { 1143 VN_RELE(*vpp); 1144 *vpp = NULL; 1145 } 1146 1147 ZFS_EXIT(zfsvfs); 1148 return (error); 1149 } 1150 1151 if (dvp->v_type != VDIR) { 1152 ZFS_EXIT(zfsvfs); 1153 return (ENOTDIR); 1154 } 1155 1156 /* 1157 * Check accessibility of directory. 1158 */ 1159 1160 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1161 ZFS_EXIT(zfsvfs); 1162 return (error); 1163 } 1164 1165 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1166 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1167 ZFS_EXIT(zfsvfs); 1168 return (EILSEQ); 1169 } 1170 1171 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); 1172 if (error == 0) { 1173 /* 1174 * Convert device special files 1175 */ 1176 if (IS_DEVVP(*vpp)) { 1177 vnode_t *svp; 1178 1179 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1180 VN_RELE(*vpp); 1181 if (svp == NULL) 1182 error = ENOSYS; 1183 else 1184 *vpp = svp; 1185 } 1186 } 1187 1188 /* Translate errors and add SAVENAME when needed. */ 1189 if (cnp->cn_flags & ISLASTCN) { 1190 switch (nameiop) { 1191 case CREATE: 1192 case RENAME: 1193 if (error == ENOENT) { 1194 error = EJUSTRETURN; 1195 cnp->cn_flags |= SAVENAME; 1196 break; 1197 } 1198 /* FALLTHROUGH */ 1199 case DELETE: 1200 if (error == 0) 1201 cnp->cn_flags |= SAVENAME; 1202 break; 1203 } 1204 } 1205 if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) { 1206 int ltype = 0; 1207 1208 if (cnp->cn_flags & ISDOTDOT) { 1209 ltype = VOP_ISLOCKED(dvp); 1210 VOP_UNLOCK(dvp, 0); 1211 } 1212 error = vn_lock(*vpp, cnp->cn_lkflags); 1213 if (cnp->cn_flags & ISDOTDOT) 1214 vn_lock(dvp, ltype | LK_RETRY); 1215 if (error != 0) { 1216 VN_RELE(*vpp); 1217 *vpp = NULL; 1218 ZFS_EXIT(zfsvfs); 1219 return (error); 1220 } 1221 } 1222 1223#ifdef FREEBSD_NAMECACHE 1224 /* 1225 * Insert name into cache (as non-existent) if appropriate. 1226 */ 1227 if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) 1228 cache_enter(dvp, *vpp, cnp); 1229 /* 1230 * Insert name into cache if appropriate. 1231 */ 1232 if (error == 0 && (cnp->cn_flags & MAKEENTRY)) { 1233 if (!(cnp->cn_flags & ISLASTCN) || 1234 (nameiop != DELETE && nameiop != RENAME)) { 1235 cache_enter(dvp, *vpp, cnp); 1236 } 1237 } 1238#endif 1239 1240 ZFS_EXIT(zfsvfs); 1241 1242 return (error); 1243} 1244 1245/* 1246 * Attempt to create a new entry in a directory. If the entry 1247 * already exists, truncate the file if permissible, else return 1248 * an error. Return the vp of the created or trunc'd file. 1249 * 1250 * IN: dvp - vnode of directory to put new file entry in. 1251 * name - name of new file entry. 1252 * vap - attributes of new file. 1253 * excl - flag indicating exclusive or non-exclusive mode. 1254 * mode - mode to open file with. 1255 * cr - credentials of caller. 1256 * flag - large file flag [UNUSED]. 1257 * ct - caller context 1258 * vsecp - ACL to be set 1259 * 1260 * OUT: vpp - vnode of created or trunc'd entry. 1261 * 1262 * RETURN: 0 if success 1263 * error code if failure 1264 * 1265 * Timestamps: 1266 * dvp - ctime|mtime updated if new entry created 1267 * vp - ctime|mtime always, atime if new 1268 */ 1269 1270/* ARGSUSED */ 1271static int 1272zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, 1273 vnode_t **vpp, cred_t *cr, kthread_t *td) 1274{ 1275 znode_t *zp, *dzp = VTOZ(dvp); 1276 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1277 zilog_t *zilog; 1278 objset_t *os; 1279 zfs_dirlock_t *dl; 1280 dmu_tx_t *tx; 1281 int error; 1282 zfs_acl_t *aclp = NULL; 1283 zfs_fuid_info_t *fuidp = NULL; 1284 void *vsecp = NULL; 1285 int flag = 0; 1286 1287 /* 1288 * If we have an ephemeral id, ACL, or XVATTR then 1289 * make sure file system is at proper version 1290 */ 1291 1292 if (zfsvfs->z_use_fuids == B_FALSE && 1293 (vsecp || (vap->va_mask & AT_XVATTR) || 1294 IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr)))) 1295 return (EINVAL); 1296 1297 ZFS_ENTER(zfsvfs); 1298 ZFS_VERIFY_ZP(dzp); 1299 os = zfsvfs->z_os; 1300 zilog = zfsvfs->z_log; 1301 1302 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 1303 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1304 ZFS_EXIT(zfsvfs); 1305 return (EILSEQ); 1306 } 1307 1308 if (vap->va_mask & AT_XVATTR) { 1309 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1310 crgetuid(cr), cr, vap->va_type)) != 0) { 1311 ZFS_EXIT(zfsvfs); 1312 return (error); 1313 } 1314 } 1315top: 1316 *vpp = NULL; 1317 1318 if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) 1319 vap->va_mode &= ~S_ISVTX; 1320 1321 if (*name == '\0') { 1322 /* 1323 * Null component name refers to the directory itself. 1324 */ 1325 VN_HOLD(dvp); 1326 zp = dzp; 1327 dl = NULL; 1328 error = 0; 1329 } else { 1330 /* possible VN_HOLD(zp) */ 1331 int zflg = 0; 1332 1333 if (flag & FIGNORECASE) 1334 zflg |= ZCILOOK; 1335 1336 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1337 NULL, NULL); 1338 if (error) { 1339 if (strcmp(name, "..") == 0) 1340 error = EISDIR; 1341 ZFS_EXIT(zfsvfs); 1342 if (aclp) 1343 zfs_acl_free(aclp); 1344 return (error); 1345 } 1346 } 1347 if (vsecp && aclp == NULL) { 1348 error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); 1349 if (error) { 1350 ZFS_EXIT(zfsvfs); 1351 if (dl) 1352 zfs_dirent_unlock(dl); 1353 return (error); 1354 } 1355 } 1356 1357 if (zp == NULL) { 1358 uint64_t txtype; 1359 1360 /* 1361 * Create a new file object and update the directory 1362 * to reference it. 1363 */ 1364 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 1365 goto out; 1366 } 1367 1368 /* 1369 * We only support the creation of regular files in 1370 * extended attribute directories. 1371 */ 1372 if ((dzp->z_phys->zp_flags & ZFS_XATTR) && 1373 (vap->va_type != VREG)) { 1374 error = EINVAL; 1375 goto out; 1376 } 1377 1378 tx = dmu_tx_create(os); 1379 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1380 if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || 1381 IS_EPHEMERAL(crgetgid(cr))) { 1382 if (zfsvfs->z_fuid_obj == 0) { 1383 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1384 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1385 FUID_SIZE_ESTIMATE(zfsvfs)); 1386 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 1387 FALSE, NULL); 1388 } else { 1389 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 1390 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 1391 FUID_SIZE_ESTIMATE(zfsvfs)); 1392 } 1393 } 1394 dmu_tx_hold_bonus(tx, dzp->z_id); 1395 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 1396 if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { 1397 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1398 0, SPA_MAXBLOCKSIZE); 1399 } 1400 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1401 if (error) { 1402 zfs_dirent_unlock(dl); 1403 if (error == ERESTART && 1404 zfsvfs->z_assign == TXG_NOWAIT) { 1405 dmu_tx_wait(tx); 1406 dmu_tx_abort(tx); 1407 goto top; 1408 } 1409 dmu_tx_abort(tx); 1410 ZFS_EXIT(zfsvfs); 1411 if (aclp) 1412 zfs_acl_free(aclp); 1413 return (error); 1414 } 1415 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); 1416 (void) zfs_link_create(dl, zp, tx, ZNEW); 1417 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 1418 if (flag & FIGNORECASE) 1419 txtype |= TX_CI; 1420 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 1421 vsecp, fuidp, vap); 1422 if (fuidp) 1423 zfs_fuid_info_free(fuidp); 1424 dmu_tx_commit(tx); 1425 } else { 1426 int aflags = (flag & FAPPEND) ? V_APPEND : 0; 1427 1428 /* 1429 * A directory entry already exists for this name. 1430 */ 1431 /* 1432 * Can't truncate an existing file if in exclusive mode. 1433 */ 1434 if (excl == EXCL) { 1435 error = EEXIST; 1436 goto out; 1437 } 1438 /* 1439 * Can't open a directory for writing. 1440 */ 1441 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { 1442 error = EISDIR; 1443 goto out; 1444 } 1445 /* 1446 * Verify requested access to file. 1447 */ 1448 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { 1449 goto out; 1450 } 1451 1452 mutex_enter(&dzp->z_lock); 1453 dzp->z_seq++; 1454 mutex_exit(&dzp->z_lock); 1455 1456 /* 1457 * Truncate regular files if requested. 1458 */ 1459 if ((ZTOV(zp)->v_type == VREG) && 1460 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { 1461 /* we can't hold any locks when calling zfs_freesp() */ 1462 zfs_dirent_unlock(dl); 1463 dl = NULL; 1464 error = zfs_freesp(zp, 0, 0, mode, TRUE); 1465 if (error == 0) { 1466 vnevent_create(ZTOV(zp), ct); 1467 } 1468 } 1469 } 1470out: 1471 if (dl) 1472 zfs_dirent_unlock(dl); 1473 1474 if (error) { 1475 if (zp) 1476 VN_RELE(ZTOV(zp)); 1477 } else { 1478 *vpp = ZTOV(zp); 1479 /* 1480 * If vnode is for a device return a specfs vnode instead. 1481 */ 1482 if (IS_DEVVP(*vpp)) { 1483 struct vnode *svp; 1484 1485 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1486 VN_RELE(*vpp); 1487 if (svp == NULL) { 1488 error = ENOSYS; 1489 } 1490 *vpp = svp; 1491 } 1492 } 1493 if (aclp) 1494 zfs_acl_free(aclp); 1495 1496 ZFS_EXIT(zfsvfs); 1497 return (error); 1498} 1499 1500/* 1501 * Remove an entry from a directory. 1502 * 1503 * IN: dvp - vnode of directory to remove entry from. 1504 * name - name of entry to remove. 1505 * cr - credentials of caller. 1506 * ct - caller context 1507 * flags - case flags 1508 * 1509 * RETURN: 0 if success 1510 * error code if failure 1511 * 1512 * Timestamps: 1513 * dvp - ctime|mtime 1514 * vp - ctime (if nlink > 0) 1515 */ 1516/*ARGSUSED*/ 1517static int 1518zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, 1519 int flags) 1520{ 1521 znode_t *zp, *dzp = VTOZ(dvp); 1522 znode_t *xzp = NULL; 1523 vnode_t *vp; 1524 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1525 zilog_t *zilog; 1526 uint64_t acl_obj, xattr_obj; 1527 zfs_dirlock_t *dl; 1528 dmu_tx_t *tx; 1529 boolean_t may_delete_now, delete_now = FALSE; 1530 boolean_t unlinked, toobig = FALSE; 1531 uint64_t txtype; 1532 pathname_t *realnmp = NULL; 1533 pathname_t realnm; 1534 int error; 1535 int zflg = ZEXISTS; 1536 1537 ZFS_ENTER(zfsvfs); 1538 ZFS_VERIFY_ZP(dzp); 1539 zilog = zfsvfs->z_log; 1540 1541 if (flags & FIGNORECASE) { 1542 zflg |= ZCILOOK; 1543 pn_alloc(&realnm); 1544 realnmp = &realnm; 1545 } 1546 1547top: 1548 /* 1549 * Attempt to lock directory; fail if entry doesn't exist. 1550 */ 1551 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1552 NULL, realnmp)) { 1553 if (realnmp) 1554 pn_free(realnmp); 1555 ZFS_EXIT(zfsvfs); 1556 return (error); 1557 } 1558 1559 vp = ZTOV(zp); 1560 1561 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1562 goto out; 1563 } 1564 1565 /* 1566 * Need to use rmdir for removing directories. 1567 */ 1568 if (vp->v_type == VDIR) { 1569 error = EPERM; 1570 goto out; 1571 } 1572 1573 vnevent_remove(vp, dvp, name, ct); 1574 1575 if (realnmp) 1576 dnlc_remove(dvp, realnmp->pn_buf); 1577 else 1578 dnlc_remove(dvp, name); 1579 1580 may_delete_now = FALSE; 1581 1582 /* 1583 * We may delete the znode now, or we may put it in the unlinked set; 1584 * it depends on whether we're the last link, and on whether there are 1585 * other holds on the vnode. So we dmu_tx_hold() the right things to 1586 * allow for either case. 1587 */ 1588 tx = dmu_tx_create(zfsvfs->z_os); 1589 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1590 dmu_tx_hold_bonus(tx, zp->z_id); 1591 if (may_delete_now) { 1592 toobig = 1593 zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; 1594 /* if the file is too big, only hold_free a token amount */ 1595 dmu_tx_hold_free(tx, zp->z_id, 0, 1596 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); 1597 } 1598 1599 /* are there any extended attributes? */ 1600 if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { 1601 /* XXX - do we need this if we are deleting? */ 1602 dmu_tx_hold_bonus(tx, xattr_obj); 1603 } 1604 1605 /* are there any additional acls */ 1606 if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && 1607 may_delete_now) 1608 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 1609 1610 /* charge as an update -- would be nice not to charge at all */ 1611 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1612 1613 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1614 if (error) { 1615 zfs_dirent_unlock(dl); 1616 VN_RELE(vp); 1617 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1618 dmu_tx_wait(tx); 1619 dmu_tx_abort(tx); 1620 goto top; 1621 } 1622 if (realnmp) 1623 pn_free(realnmp); 1624 dmu_tx_abort(tx); 1625 ZFS_EXIT(zfsvfs); 1626 return (error); 1627 } 1628 1629 /* 1630 * Remove the directory entry. 1631 */ 1632 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); 1633 1634 if (error) { 1635 dmu_tx_commit(tx); 1636 goto out; 1637 } 1638 1639 if (0 && unlinked) { 1640 VI_LOCK(vp); 1641 delete_now = may_delete_now && !toobig && 1642 vp->v_count == 1 && !vn_has_cached_data(vp) && 1643 zp->z_phys->zp_xattr == xattr_obj && 1644 zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; 1645 VI_UNLOCK(vp); 1646 } 1647 1648 if (delete_now) { 1649 if (zp->z_phys->zp_xattr) { 1650 error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); 1651 ASSERT3U(error, ==, 0); 1652 ASSERT3U(xzp->z_phys->zp_links, ==, 2); 1653 dmu_buf_will_dirty(xzp->z_dbuf, tx); 1654 mutex_enter(&xzp->z_lock); 1655 xzp->z_unlinked = 1; 1656 xzp->z_phys->zp_links = 0; 1657 mutex_exit(&xzp->z_lock); 1658 zfs_unlinked_add(xzp, tx); 1659 zp->z_phys->zp_xattr = 0; /* probably unnecessary */ 1660 } 1661 mutex_enter(&zp->z_lock); 1662 VI_LOCK(vp); 1663 vp->v_count--; 1664 ASSERT3U(vp->v_count, ==, 0); 1665 VI_UNLOCK(vp); 1666 mutex_exit(&zp->z_lock); 1667 zfs_znode_delete(zp, tx); 1668 } else if (unlinked) { 1669 zfs_unlinked_add(zp, tx); 1670 } 1671 1672 txtype = TX_REMOVE; 1673 if (flags & FIGNORECASE) 1674 txtype |= TX_CI; 1675 zfs_log_remove(zilog, tx, txtype, dzp, name); 1676 1677 dmu_tx_commit(tx); 1678out: 1679 if (realnmp) 1680 pn_free(realnmp); 1681 1682 zfs_dirent_unlock(dl); 1683 1684 if (!delete_now) { 1685 VN_RELE(vp); 1686 } else if (xzp) { 1687 /* this rele is delayed to prevent nesting transactions */ 1688 VN_RELE(ZTOV(xzp)); 1689 } 1690 1691 ZFS_EXIT(zfsvfs); 1692 return (error); 1693} 1694 1695/* 1696 * Create a new directory and insert it into dvp using the name 1697 * provided. Return a pointer to the inserted directory. 1698 * 1699 * IN: dvp - vnode of directory to add subdir to. 1700 * dirname - name of new directory. 1701 * vap - attributes of new directory. 1702 * cr - credentials of caller. 1703 * ct - caller context 1704 * vsecp - ACL to be set 1705 * 1706 * OUT: vpp - vnode of created directory. 1707 * 1708 * RETURN: 0 if success 1709 * error code if failure 1710 * 1711 * Timestamps: 1712 * dvp - ctime|mtime updated 1713 * vp - ctime|mtime|atime updated 1714 */ 1715/*ARGSUSED*/ 1716static int 1717zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, 1718 caller_context_t *ct, int flags, vsecattr_t *vsecp) 1719{ 1720 znode_t *zp, *dzp = VTOZ(dvp); 1721 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1722 zilog_t *zilog; 1723 zfs_dirlock_t *dl; 1724 uint64_t txtype; 1725 dmu_tx_t *tx; 1726 int error; 1727 zfs_acl_t *aclp = NULL; 1728 zfs_fuid_info_t *fuidp = NULL; 1729 int zf = ZNEW; 1730 1731 ASSERT(vap->va_type == VDIR); 1732 1733 /* 1734 * If we have an ephemeral id, ACL, or XVATTR then 1735 * make sure file system is at proper version 1736 */ 1737 1738 if (zfsvfs->z_use_fuids == B_FALSE && 1739 (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))|| 1740 IS_EPHEMERAL(crgetgid(cr)))) 1741 return (EINVAL); 1742 1743 ZFS_ENTER(zfsvfs); 1744 ZFS_VERIFY_ZP(dzp); 1745 zilog = zfsvfs->z_log; 1746 1747 if (dzp->z_phys->zp_flags & ZFS_XATTR) { 1748 ZFS_EXIT(zfsvfs); 1749 return (EINVAL); 1750 } 1751 1752 if (zfsvfs->z_utf8 && u8_validate(dirname, 1753 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1754 ZFS_EXIT(zfsvfs); 1755 return (EILSEQ); 1756 } 1757 if (flags & FIGNORECASE) 1758 zf |= ZCILOOK; 1759 1760 if (vap->va_mask & AT_XVATTR) 1761 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1762 crgetuid(cr), cr, vap->va_type)) != 0) { 1763 ZFS_EXIT(zfsvfs); 1764 return (error); 1765 } 1766 1767 /* 1768 * First make sure the new directory doesn't exist. 1769 */ 1770top: 1771 *vpp = NULL; 1772 1773 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 1774 NULL, NULL)) { 1775 ZFS_EXIT(zfsvfs); 1776 return (error); 1777 } 1778 1779 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { 1780 zfs_dirent_unlock(dl); 1781 ZFS_EXIT(zfsvfs); 1782 return (error); 1783 } 1784 1785 if (vsecp && aclp == NULL) { 1786 error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); 1787 if (error) { 1788 zfs_dirent_unlock(dl); 1789 ZFS_EXIT(zfsvfs); 1790 return (error); 1791 } 1792 } 1793 /* 1794 * Add a new entry to the directory. 1795 */ 1796 tx = dmu_tx_create(zfsvfs->z_os); 1797 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1798 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1799 if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || 1800 IS_EPHEMERAL(crgetgid(cr))) { 1801 if (zfsvfs->z_fuid_obj == 0) { 1802 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1803 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1804 FUID_SIZE_ESTIMATE(zfsvfs)); 1805 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); 1806 } else { 1807 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 1808 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 1809 FUID_SIZE_ESTIMATE(zfsvfs)); 1810 } 1811 } 1812 if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) 1813 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1814 0, SPA_MAXBLOCKSIZE); 1815 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1816 if (error) { 1817 zfs_dirent_unlock(dl); 1818 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1819 dmu_tx_wait(tx); 1820 dmu_tx_abort(tx); 1821 goto top; 1822 } 1823 dmu_tx_abort(tx); 1824 ZFS_EXIT(zfsvfs); 1825 if (aclp) 1826 zfs_acl_free(aclp); 1827 return (error); 1828 } 1829 1830 /* 1831 * Create new node. 1832 */ 1833 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); 1834 1835 if (aclp) 1836 zfs_acl_free(aclp); 1837 1838 /* 1839 * Now put new name in parent dir. 1840 */ 1841 (void) zfs_link_create(dl, zp, tx, ZNEW); 1842 1843 *vpp = ZTOV(zp); 1844 1845 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 1846 if (flags & FIGNORECASE) 1847 txtype |= TX_CI; 1848 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); 1849 1850 if (fuidp) 1851 zfs_fuid_info_free(fuidp); 1852 dmu_tx_commit(tx); 1853 1854 zfs_dirent_unlock(dl); 1855 1856 ZFS_EXIT(zfsvfs); 1857 return (0); 1858} 1859 1860/* 1861 * Remove a directory subdir entry. If the current working 1862 * directory is the same as the subdir to be removed, the 1863 * remove will fail. 1864 * 1865 * IN: dvp - vnode of directory to remove from. 1866 * name - name of directory to be removed. 1867 * cwd - vnode of current working directory. 1868 * cr - credentials of caller. 1869 * ct - caller context 1870 * flags - case flags 1871 * 1872 * RETURN: 0 if success 1873 * error code if failure 1874 * 1875 * Timestamps: 1876 * dvp - ctime|mtime updated 1877 */ 1878/*ARGSUSED*/ 1879static int 1880zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, 1881 caller_context_t *ct, int flags) 1882{ 1883 znode_t *dzp = VTOZ(dvp); 1884 znode_t *zp; 1885 vnode_t *vp; 1886 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1887 zilog_t *zilog; 1888 zfs_dirlock_t *dl; 1889 dmu_tx_t *tx; 1890 int error; 1891 int zflg = ZEXISTS; 1892 1893 ZFS_ENTER(zfsvfs); 1894 ZFS_VERIFY_ZP(dzp); 1895 zilog = zfsvfs->z_log; 1896 1897 if (flags & FIGNORECASE) 1898 zflg |= ZCILOOK; 1899top: 1900 zp = NULL; 1901 1902 /* 1903 * Attempt to lock directory; fail if entry doesn't exist. 1904 */ 1905 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1906 NULL, NULL)) { 1907 ZFS_EXIT(zfsvfs); 1908 return (error); 1909 } 1910 1911 vp = ZTOV(zp); 1912 1913 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1914 goto out; 1915 } 1916 1917 if (vp->v_type != VDIR) { 1918 error = ENOTDIR; 1919 goto out; 1920 } 1921 1922 if (vp == cwd) { 1923 error = EINVAL; 1924 goto out; 1925 } 1926 1927 vnevent_rmdir(vp, dvp, name, ct); 1928 1929 /* 1930 * Grab a lock on the directory to make sure that noone is 1931 * trying to add (or lookup) entries while we are removing it. 1932 */ 1933 rw_enter(&zp->z_name_lock, RW_WRITER); 1934 1935 /* 1936 * Grab a lock on the parent pointer to make sure we play well 1937 * with the treewalk and directory rename code. 1938 */ 1939 rw_enter(&zp->z_parent_lock, RW_WRITER); 1940 1941 tx = dmu_tx_create(zfsvfs->z_os); 1942 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1943 dmu_tx_hold_bonus(tx, zp->z_id); 1944 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1945 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1946 if (error) { 1947 rw_exit(&zp->z_parent_lock); 1948 rw_exit(&zp->z_name_lock); 1949 zfs_dirent_unlock(dl); 1950 VN_RELE(vp); 1951 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1952 dmu_tx_wait(tx); 1953 dmu_tx_abort(tx); 1954 goto top; 1955 } 1956 dmu_tx_abort(tx); 1957 ZFS_EXIT(zfsvfs); 1958 return (error); 1959 } 1960 1961#ifdef FREEBSD_NAMECACHE 1962 cache_purge(dvp); 1963#endif 1964 1965 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 1966 1967 if (error == 0) { 1968 uint64_t txtype = TX_RMDIR; 1969 if (flags & FIGNORECASE) 1970 txtype |= TX_CI; 1971 zfs_log_remove(zilog, tx, txtype, dzp, name); 1972 } 1973 1974 dmu_tx_commit(tx); 1975 1976 rw_exit(&zp->z_parent_lock); 1977 rw_exit(&zp->z_name_lock); 1978#ifdef FREEBSD_NAMECACHE 1979 cache_purge(vp); 1980#endif 1981out: 1982 zfs_dirent_unlock(dl); 1983 1984 VN_RELE(vp); 1985 1986 ZFS_EXIT(zfsvfs); 1987 return (error); 1988} 1989 1990/* 1991 * Read as many directory entries as will fit into the provided 1992 * buffer from the given directory cursor position (specified in 1993 * the uio structure. 1994 * 1995 * IN: vp - vnode of directory to read. 1996 * uio - structure supplying read location, range info, 1997 * and return buffer. 1998 * cr - credentials of caller. 1999 * ct - caller context 2000 * flags - case flags 2001 * 2002 * OUT: uio - updated offset and range, buffer filled. 2003 * eofp - set to true if end-of-file detected. 2004 * 2005 * RETURN: 0 if success 2006 * error code if failure 2007 * 2008 * Timestamps: 2009 * vp - atime updated 2010 * 2011 * Note that the low 4 bits of the cookie returned by zap is always zero. 2012 * This allows us to use the low range for "special" directory entries: 2013 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 2014 * we use the offset 2 for the '.zfs' directory. 2015 */ 2016/* ARGSUSED */ 2017static int 2018zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) 2019{ 2020 znode_t *zp = VTOZ(vp); 2021 iovec_t *iovp; 2022 edirent_t *eodp; 2023 dirent64_t *odp; 2024 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2025 objset_t *os; 2026 caddr_t outbuf; 2027 size_t bufsize; 2028 zap_cursor_t zc; 2029 zap_attribute_t zap; 2030 uint_t bytes_wanted; 2031 uint64_t offset; /* must be unsigned; checks for < 1 */ 2032 int local_eof; 2033 int outcount; 2034 int error; 2035 uint8_t prefetch; 2036 boolean_t check_sysattrs; 2037 uint8_t type; 2038 int ncooks; 2039 u_long *cooks = NULL; 2040 int flags = 0; 2041 2042 ZFS_ENTER(zfsvfs); 2043 ZFS_VERIFY_ZP(zp); 2044 2045 /* 2046 * If we are not given an eof variable, 2047 * use a local one. 2048 */ 2049 if (eofp == NULL) 2050 eofp = &local_eof; 2051 2052 /* 2053 * Check for valid iov_len. 2054 */ 2055 if (uio->uio_iov->iov_len <= 0) { 2056 ZFS_EXIT(zfsvfs); 2057 return (EINVAL); 2058 } 2059 2060 /* 2061 * Quit if directory has been removed (posix) 2062 */ 2063 if ((*eofp = zp->z_unlinked) != 0) { 2064 ZFS_EXIT(zfsvfs); 2065 return (0); 2066 } 2067 2068 error = 0; 2069 os = zfsvfs->z_os; 2070 offset = uio->uio_loffset; 2071 prefetch = zp->z_zn_prefetch; 2072 2073 /* 2074 * Initialize the iterator cursor. 2075 */ 2076 if (offset <= 3) { 2077 /* 2078 * Start iteration from the beginning of the directory. 2079 */ 2080 zap_cursor_init(&zc, os, zp->z_id); 2081 } else { 2082 /* 2083 * The offset is a serialized cursor. 2084 */ 2085 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 2086 } 2087 2088 /* 2089 * Get space to change directory entries into fs independent format. 2090 */ 2091 iovp = uio->uio_iov; 2092 bytes_wanted = iovp->iov_len; 2093 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 2094 bufsize = bytes_wanted; 2095 outbuf = kmem_alloc(bufsize, KM_SLEEP); 2096 odp = (struct dirent64 *)outbuf; 2097 } else { 2098 bufsize = bytes_wanted; 2099 odp = (struct dirent64 *)iovp->iov_base; 2100 } 2101 eodp = (struct edirent *)odp; 2102 2103 if (ncookies != NULL) { 2104 /* 2105 * Minimum entry size is dirent size and 1 byte for a file name. 2106 */ 2107 ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); 2108 cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); 2109 *cookies = cooks; 2110 *ncookies = ncooks; 2111 } 2112 /* 2113 * If this VFS supports the system attribute view interface; and 2114 * we're looking at an extended attribute directory; and we care 2115 * about normalization conflicts on this vfs; then we must check 2116 * for normalization conflicts with the sysattr name space. 2117 */ 2118#ifdef TODO 2119 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 2120 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 2121 (flags & V_RDDIR_ENTFLAGS); 2122#else 2123 check_sysattrs = 0; 2124#endif 2125 2126 /* 2127 * Transform to file-system independent format 2128 */ 2129 outcount = 0; 2130 while (outcount < bytes_wanted) { 2131 ino64_t objnum; 2132 ushort_t reclen; 2133 off64_t *next; 2134 2135 /* 2136 * Special case `.', `..', and `.zfs'. 2137 */ 2138 if (offset == 0) { 2139 (void) strcpy(zap.za_name, "."); 2140 zap.za_normalization_conflict = 0; 2141 objnum = zp->z_id; 2142 type = DT_DIR; 2143 } else if (offset == 1) { 2144 (void) strcpy(zap.za_name, ".."); 2145 zap.za_normalization_conflict = 0; 2146 objnum = zp->z_phys->zp_parent; 2147 type = DT_DIR; 2148 } else if (offset == 2 && zfs_show_ctldir(zp)) { 2149 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 2150 zap.za_normalization_conflict = 0; 2151 objnum = ZFSCTL_INO_ROOT; 2152 type = DT_DIR; 2153 } else { 2154 /* 2155 * Grab next entry. 2156 */ 2157 if (error = zap_cursor_retrieve(&zc, &zap)) { 2158 if ((*eofp = (error == ENOENT)) != 0) 2159 break; 2160 else 2161 goto update; 2162 } 2163 2164 if (zap.za_integer_length != 8 || 2165 zap.za_num_integers != 1) { 2166 cmn_err(CE_WARN, "zap_readdir: bad directory " 2167 "entry, obj = %lld, offset = %lld\n", 2168 (u_longlong_t)zp->z_id, 2169 (u_longlong_t)offset); 2170 error = ENXIO; 2171 goto update; 2172 } 2173 2174 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 2175 /* 2176 * MacOS X can extract the object type here such as: 2177 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2178 */ 2179 type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2180 2181 if (check_sysattrs && !zap.za_normalization_conflict) { 2182#ifdef TODO 2183 zap.za_normalization_conflict = 2184 xattr_sysattr_casechk(zap.za_name); 2185#else 2186 panic("%s:%u: TODO", __func__, __LINE__); 2187#endif 2188 } 2189 } 2190 2191 if (flags & V_RDDIR_ENTFLAGS) 2192 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 2193 else 2194 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 2195 2196 /* 2197 * Will this entry fit in the buffer? 2198 */ 2199 if (outcount + reclen > bufsize) { 2200 /* 2201 * Did we manage to fit anything in the buffer? 2202 */ 2203 if (!outcount) { 2204 error = EINVAL; 2205 goto update; 2206 } 2207 break; 2208 } 2209 if (flags & V_RDDIR_ENTFLAGS) { 2210 /* 2211 * Add extended flag entry: 2212 */ 2213 eodp->ed_ino = objnum; 2214 eodp->ed_reclen = reclen; 2215 /* NOTE: ed_off is the offset for the *next* entry */ 2216 next = &(eodp->ed_off); 2217 eodp->ed_eflags = zap.za_normalization_conflict ? 2218 ED_CASE_CONFLICT : 0; 2219 (void) strncpy(eodp->ed_name, zap.za_name, 2220 EDIRENT_NAMELEN(reclen)); 2221 eodp = (edirent_t *)((intptr_t)eodp + reclen); 2222 } else { 2223 /* 2224 * Add normal entry: 2225 */ 2226 odp->d_ino = objnum; 2227 odp->d_reclen = reclen; 2228 odp->d_namlen = strlen(zap.za_name); 2229 (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); 2230 odp->d_type = type; 2231 odp = (dirent64_t *)((intptr_t)odp + reclen); 2232 } 2233 outcount += reclen; 2234 2235 ASSERT(outcount <= bufsize); 2236 2237 /* Prefetch znode */ 2238 if (prefetch) 2239 dmu_prefetch(os, objnum, 0, 0); 2240 2241 /* 2242 * Move to the next entry, fill in the previous offset. 2243 */ 2244 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2245 zap_cursor_advance(&zc); 2246 offset = zap_cursor_serialize(&zc); 2247 } else { 2248 offset += 1; 2249 } 2250 2251 if (cooks != NULL) { 2252 *cooks++ = offset; 2253 ncooks--; 2254 KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); 2255 } 2256 } 2257 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2258 2259 /* Subtract unused cookies */ 2260 if (ncookies != NULL) 2261 *ncookies -= ncooks; 2262 2263 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 2264 iovp->iov_base += outcount; 2265 iovp->iov_len -= outcount; 2266 uio->uio_resid -= outcount; 2267 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 2268 /* 2269 * Reset the pointer. 2270 */ 2271 offset = uio->uio_loffset; 2272 } 2273 2274update: 2275 zap_cursor_fini(&zc); 2276 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) 2277 kmem_free(outbuf, bufsize); 2278 2279 if (error == ENOENT) 2280 error = 0; 2281 2282 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2283 2284 uio->uio_loffset = offset; 2285 ZFS_EXIT(zfsvfs); 2286 if (error != 0 && cookies != NULL) { 2287 free(*cookies, M_TEMP); 2288 *cookies = NULL; 2289 *ncookies = 0; 2290 } 2291 return (error); 2292} 2293 2294ulong_t zfs_fsync_sync_cnt = 4; 2295 2296static int 2297zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 2298{ 2299 znode_t *zp = VTOZ(vp); 2300 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2301 2302 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); 2303 2304 ZFS_ENTER(zfsvfs); 2305 ZFS_VERIFY_ZP(zp); 2306 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); 2307 ZFS_EXIT(zfsvfs); 2308 return (0); 2309} 2310 2311 2312/* 2313 * Get the requested file attributes and place them in the provided 2314 * vattr structure. 2315 * 2316 * IN: vp - vnode of file. 2317 * vap - va_mask identifies requested attributes. 2318 * If AT_XVATTR set, then optional attrs are requested 2319 * flags - ATTR_NOACLCHECK (CIFS server context) 2320 * cr - credentials of caller. 2321 * ct - caller context 2322 * 2323 * OUT: vap - attribute values. 2324 * 2325 * RETURN: 0 (always succeeds) 2326 */ 2327/* ARGSUSED */ 2328static int 2329zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2330 caller_context_t *ct) 2331{ 2332 znode_t *zp = VTOZ(vp); 2333 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2334 znode_phys_t *pzp; 2335 int error = 0; 2336 uint32_t blksize; 2337 u_longlong_t nblocks; 2338 uint64_t links; 2339 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2340 xoptattr_t *xoap = NULL; 2341 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2342 2343 ZFS_ENTER(zfsvfs); 2344 ZFS_VERIFY_ZP(zp); 2345 pzp = zp->z_phys; 2346 2347 mutex_enter(&zp->z_lock); 2348 2349 /* 2350 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 2351 * Also, if we are the owner don't bother, since owner should 2352 * always be allowed to read basic attributes of file. 2353 */ 2354 if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) && 2355 (pzp->zp_uid != crgetuid(cr))) { 2356 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, 2357 skipaclchk, cr)) { 2358 mutex_exit(&zp->z_lock); 2359 ZFS_EXIT(zfsvfs); 2360 return (error); 2361 } 2362 } 2363 2364 /* 2365 * Return all attributes. It's cheaper to provide the answer 2366 * than to determine whether we were asked the question. 2367 */ 2368 2369 vap->va_type = IFTOVT(pzp->zp_mode); 2370 vap->va_mode = pzp->zp_mode & ~S_IFMT; 2371 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); 2372// vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 2373 vap->va_nodeid = zp->z_id; 2374 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) 2375 links = pzp->zp_links + 1; 2376 else 2377 links = pzp->zp_links; 2378 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ 2379 vap->va_size = pzp->zp_size; 2380 vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; 2381 vap->va_rdev = zfs_cmpldev(pzp->zp_rdev); 2382 vap->va_seq = zp->z_seq; 2383 vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ 2384 2385 /* 2386 * Add in any requested optional attributes and the create time. 2387 * Also set the corresponding bits in the returned attribute bitmap. 2388 */ 2389 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { 2390 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 2391 xoap->xoa_archive = 2392 ((pzp->zp_flags & ZFS_ARCHIVE) != 0); 2393 XVA_SET_RTN(xvap, XAT_ARCHIVE); 2394 } 2395 2396 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 2397 xoap->xoa_readonly = 2398 ((pzp->zp_flags & ZFS_READONLY) != 0); 2399 XVA_SET_RTN(xvap, XAT_READONLY); 2400 } 2401 2402 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 2403 xoap->xoa_system = 2404 ((pzp->zp_flags & ZFS_SYSTEM) != 0); 2405 XVA_SET_RTN(xvap, XAT_SYSTEM); 2406 } 2407 2408 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 2409 xoap->xoa_hidden = 2410 ((pzp->zp_flags & ZFS_HIDDEN) != 0); 2411 XVA_SET_RTN(xvap, XAT_HIDDEN); 2412 } 2413 2414 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2415 xoap->xoa_nounlink = 2416 ((pzp->zp_flags & ZFS_NOUNLINK) != 0); 2417 XVA_SET_RTN(xvap, XAT_NOUNLINK); 2418 } 2419 2420 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2421 xoap->xoa_immutable = 2422 ((pzp->zp_flags & ZFS_IMMUTABLE) != 0); 2423 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 2424 } 2425 2426 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2427 xoap->xoa_appendonly = 2428 ((pzp->zp_flags & ZFS_APPENDONLY) != 0); 2429 XVA_SET_RTN(xvap, XAT_APPENDONLY); 2430 } 2431 2432 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2433 xoap->xoa_nodump = 2434 ((pzp->zp_flags & ZFS_NODUMP) != 0); 2435 XVA_SET_RTN(xvap, XAT_NODUMP); 2436 } 2437 2438 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 2439 xoap->xoa_opaque = 2440 ((pzp->zp_flags & ZFS_OPAQUE) != 0); 2441 XVA_SET_RTN(xvap, XAT_OPAQUE); 2442 } 2443 2444 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2445 xoap->xoa_av_quarantined = 2446 ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0); 2447 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 2448 } 2449 2450 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2451 xoap->xoa_av_modified = 2452 ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0); 2453 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 2454 } 2455 2456 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && 2457 vp->v_type == VREG && 2458 (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) { 2459 size_t len; 2460 dmu_object_info_t doi; 2461 2462 /* 2463 * Only VREG files have anti-virus scanstamps, so we 2464 * won't conflict with symlinks in the bonus buffer. 2465 */ 2466 dmu_object_info_from_db(zp->z_dbuf, &doi); 2467 len = sizeof (xoap->xoa_av_scanstamp) + 2468 sizeof (znode_phys_t); 2469 if (len <= doi.doi_bonus_size) { 2470 /* 2471 * pzp points to the start of the 2472 * znode_phys_t. pzp + 1 points to the 2473 * first byte after the znode_phys_t. 2474 */ 2475 (void) memcpy(xoap->xoa_av_scanstamp, 2476 pzp + 1, 2477 sizeof (xoap->xoa_av_scanstamp)); 2478 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); 2479 } 2480 } 2481 2482 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 2483 ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime); 2484 XVA_SET_RTN(xvap, XAT_CREATETIME); 2485 } 2486 } 2487 2488 ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); 2489 ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); 2490 ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); 2491 ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime); 2492 2493 mutex_exit(&zp->z_lock); 2494 2495 dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks); 2496 vap->va_blksize = blksize; 2497 vap->va_bytes = nblocks << 9; /* nblocks * 512 */ 2498 2499 if (zp->z_blksz == 0) { 2500 /* 2501 * Block size hasn't been set; suggest maximal I/O transfers. 2502 */ 2503 vap->va_blksize = zfsvfs->z_max_blksz; 2504 } 2505 2506 ZFS_EXIT(zfsvfs); 2507 return (0); 2508} 2509 2510/* 2511 * Set the file attributes to the values contained in the 2512 * vattr structure. 2513 * 2514 * IN: vp - vnode of file to be modified. 2515 * vap - new attribute values. 2516 * If AT_XVATTR set, then optional attrs are being set 2517 * flags - ATTR_UTIME set if non-default time values provided. 2518 * - ATTR_NOACLCHECK (CIFS context only). 2519 * cr - credentials of caller. 2520 * ct - caller context 2521 * 2522 * RETURN: 0 if success 2523 * error code if failure 2524 * 2525 * Timestamps: 2526 * vp - ctime updated, mtime updated if size changed. 2527 */ 2528/* ARGSUSED */ 2529static int 2530zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2531 caller_context_t *ct) 2532{ 2533 znode_t *zp = VTOZ(vp); 2534 znode_phys_t *pzp; 2535 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2536 zilog_t *zilog; 2537 dmu_tx_t *tx; 2538 vattr_t oldva; 2539 uint_t mask = vap->va_mask; 2540 uint_t saved_mask; 2541 int trim_mask = 0; 2542 uint64_t new_mode; 2543 znode_t *attrzp; 2544 int need_policy = FALSE; 2545 int err; 2546 zfs_fuid_info_t *fuidp = NULL; 2547 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2548 xoptattr_t *xoap; 2549 zfs_acl_t *aclp = NULL; 2550 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2551 2552 if (mask == 0) 2553 return (0); 2554 2555 if (mask & AT_NOSET) 2556 return (EINVAL); 2557 2558 ZFS_ENTER(zfsvfs); 2559 ZFS_VERIFY_ZP(zp); 2560 2561 pzp = zp->z_phys; 2562 zilog = zfsvfs->z_log; 2563 2564 /* 2565 * Make sure that if we have ephemeral uid/gid or xvattr specified 2566 * that file system is at proper version level 2567 */ 2568 2569 if (zfsvfs->z_use_fuids == B_FALSE && 2570 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 2571 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || 2572 (mask & AT_XVATTR))) { 2573 ZFS_EXIT(zfsvfs); 2574 return (EINVAL); 2575 } 2576 2577 if (mask & AT_SIZE && vp->v_type == VDIR) { 2578 ZFS_EXIT(zfsvfs); 2579 return (EISDIR); 2580 } 2581 2582 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { 2583 ZFS_EXIT(zfsvfs); 2584 return (EINVAL); 2585 } 2586 2587 /* 2588 * If this is an xvattr_t, then get a pointer to the structure of 2589 * optional attributes. If this is NULL, then we have a vattr_t. 2590 */ 2591 xoap = xva_getxoptattr(xvap); 2592 2593 /* 2594 * Immutable files can only alter immutable bit and atime 2595 */ 2596 if ((pzp->zp_flags & ZFS_IMMUTABLE) && 2597 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || 2598 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 2599 ZFS_EXIT(zfsvfs); 2600 return (EPERM); 2601 } 2602 2603 if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) { 2604 ZFS_EXIT(zfsvfs); 2605 return (EPERM); 2606 } 2607 2608 /* 2609 * Verify timestamps doesn't overflow 32 bits. 2610 * ZFS can handle large timestamps, but 32bit syscalls can't 2611 * handle times greater than 2039. This check should be removed 2612 * once large timestamps are fully supported. 2613 */ 2614 if (mask & (AT_ATIME | AT_MTIME)) { 2615 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2616 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2617 ZFS_EXIT(zfsvfs); 2618 return (EOVERFLOW); 2619 } 2620 } 2621 2622top: 2623 attrzp = NULL; 2624 2625 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 2626 ZFS_EXIT(zfsvfs); 2627 return (EROFS); 2628 } 2629 2630 /* 2631 * First validate permissions 2632 */ 2633 2634 if (mask & AT_SIZE) { 2635 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); 2636 if (err) { 2637 ZFS_EXIT(zfsvfs); 2638 return (err); 2639 } 2640 /* 2641 * XXX - Note, we are not providing any open 2642 * mode flags here (like FNDELAY), so we may 2643 * block if there are locks present... this 2644 * should be addressed in openat(). 2645 */ 2646 /* XXX - would it be OK to generate a log record here? */ 2647 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2648 if (err) { 2649 ZFS_EXIT(zfsvfs); 2650 return (err); 2651 } 2652 } 2653 2654 if (mask & (AT_ATIME|AT_MTIME) || 2655 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2656 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2657 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2658 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2659 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) 2660 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2661 skipaclchk, cr); 2662 2663 if (mask & (AT_UID|AT_GID)) { 2664 int idmask = (mask & (AT_UID|AT_GID)); 2665 int take_owner; 2666 int take_group; 2667 2668 /* 2669 * NOTE: even if a new mode is being set, 2670 * we may clear S_ISUID/S_ISGID bits. 2671 */ 2672 2673 if (!(mask & AT_MODE)) 2674 vap->va_mode = pzp->zp_mode; 2675 2676 /* 2677 * Take ownership or chgrp to group we are a member of 2678 */ 2679 2680 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 2681 take_group = (mask & AT_GID) && 2682 zfs_groupmember(zfsvfs, vap->va_gid, cr); 2683 2684 /* 2685 * If both AT_UID and AT_GID are set then take_owner and 2686 * take_group must both be set in order to allow taking 2687 * ownership. 2688 * 2689 * Otherwise, send the check through secpolicy_vnode_setattr() 2690 * 2691 */ 2692 2693 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 2694 ((idmask == AT_UID) && take_owner) || 2695 ((idmask == AT_GID) && take_group)) { 2696 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2697 skipaclchk, cr) == 0) { 2698 /* 2699 * Remove setuid/setgid for non-privileged users 2700 */ 2701 secpolicy_setid_clear(vap, vp, cr); 2702 trim_mask = (mask & (AT_UID|AT_GID)); 2703 } else { 2704 need_policy = TRUE; 2705 } 2706 } else { 2707 need_policy = TRUE; 2708 } 2709 } 2710 2711 mutex_enter(&zp->z_lock); 2712 oldva.va_mode = pzp->zp_mode; 2713 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2714 if (mask & AT_XVATTR) { 2715 if ((need_policy == FALSE) && 2716 (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && 2717 xoap->xoa_appendonly != 2718 ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) || 2719 (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && 2720 xoap->xoa_nounlink != 2721 ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) || 2722 (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && 2723 xoap->xoa_immutable != 2724 ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) || 2725 (XVA_ISSET_REQ(xvap, XAT_NODUMP) && 2726 xoap->xoa_nodump != 2727 ((pzp->zp_flags & ZFS_NODUMP) != 0)) || 2728 (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && 2729 xoap->xoa_av_modified != 2730 ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) || 2731 ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) && 2732 ((vp->v_type != VREG && xoap->xoa_av_quarantined) || 2733 xoap->xoa_av_quarantined != 2734 ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) || 2735 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || 2736 (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2737 need_policy = TRUE; 2738 } 2739 } 2740 2741 mutex_exit(&zp->z_lock); 2742 2743 if (mask & AT_MODE) { 2744 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 2745 err = secpolicy_setid_setsticky_clear(vp, vap, 2746 &oldva, cr); 2747 if (err) { 2748 ZFS_EXIT(zfsvfs); 2749 return (err); 2750 } 2751 trim_mask |= AT_MODE; 2752 } else { 2753 need_policy = TRUE; 2754 } 2755 } 2756 2757 if (need_policy) { 2758 /* 2759 * If trim_mask is set then take ownership 2760 * has been granted or write_acl is present and user 2761 * has the ability to modify mode. In that case remove 2762 * UID|GID and or MODE from mask so that 2763 * secpolicy_vnode_setattr() doesn't revoke it. 2764 */ 2765 2766 if (trim_mask) { 2767 saved_mask = vap->va_mask; 2768 vap->va_mask &= ~trim_mask; 2769 } 2770 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 2771 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 2772 if (err) { 2773 ZFS_EXIT(zfsvfs); 2774 return (err); 2775 } 2776 2777 if (trim_mask) 2778 vap->va_mask |= saved_mask; 2779 } 2780 2781 /* 2782 * secpolicy_vnode_setattr, or take ownership may have 2783 * changed va_mask 2784 */ 2785 mask = vap->va_mask; 2786 2787 tx = dmu_tx_create(zfsvfs->z_os); 2788 dmu_tx_hold_bonus(tx, zp->z_id); 2789 if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 2790 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) { 2791 if (zfsvfs->z_fuid_obj == 0) { 2792 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 2793 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 2794 FUID_SIZE_ESTIMATE(zfsvfs)); 2795 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); 2796 } else { 2797 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 2798 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 2799 FUID_SIZE_ESTIMATE(zfsvfs)); 2800 } 2801 } 2802 2803 if (mask & AT_MODE) { 2804 uint64_t pmode = pzp->zp_mode; 2805 2806 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 2807 2808 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { 2809 dmu_tx_abort(tx); 2810 ZFS_EXIT(zfsvfs); 2811 return (err); 2812 } 2813 if (pzp->zp_acl.z_acl_extern_obj) { 2814 /* Are we upgrading ACL from old V0 format to new V1 */ 2815 if (zfsvfs->z_version <= ZPL_VERSION_FUID && 2816 pzp->zp_acl.z_acl_version == 2817 ZFS_ACL_VERSION_INITIAL) { 2818 dmu_tx_hold_free(tx, 2819 pzp->zp_acl.z_acl_extern_obj, 0, 2820 DMU_OBJECT_END); 2821 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2822 0, aclp->z_acl_bytes); 2823 } else { 2824 dmu_tx_hold_write(tx, 2825 pzp->zp_acl.z_acl_extern_obj, 0, 2826 aclp->z_acl_bytes); 2827 } 2828 } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2829 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2830 0, aclp->z_acl_bytes); 2831 } 2832 } 2833 2834 if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) { 2835 err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); 2836 if (err) { 2837 dmu_tx_abort(tx); 2838 ZFS_EXIT(zfsvfs); 2839 if (aclp) 2840 zfs_acl_free(aclp); 2841 return (err); 2842 } 2843 dmu_tx_hold_bonus(tx, attrzp->z_id); 2844 } 2845 2846 err = dmu_tx_assign(tx, zfsvfs->z_assign); 2847 if (err) { 2848 if (attrzp) 2849 VN_RELE(ZTOV(attrzp)); 2850 2851 if (aclp) { 2852 zfs_acl_free(aclp); 2853 aclp = NULL; 2854 } 2855 2856 if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 2857 dmu_tx_wait(tx); 2858 dmu_tx_abort(tx); 2859 goto top; 2860 } 2861 dmu_tx_abort(tx); 2862 ZFS_EXIT(zfsvfs); 2863 return (err); 2864 } 2865 2866 dmu_buf_will_dirty(zp->z_dbuf, tx); 2867 2868 /* 2869 * Set each attribute requested. 2870 * We group settings according to the locks they need to acquire. 2871 * 2872 * Note: you cannot set ctime directly, although it will be 2873 * updated as a side-effect of calling this function. 2874 */ 2875 2876 mutex_enter(&zp->z_lock); 2877 2878 if (mask & AT_MODE) { 2879 mutex_enter(&zp->z_acl_lock); 2880 zp->z_phys->zp_mode = new_mode; 2881 err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); 2882 ASSERT3U(err, ==, 0); 2883 mutex_exit(&zp->z_acl_lock); 2884 } 2885 2886 if (attrzp) 2887 mutex_enter(&attrzp->z_lock); 2888 2889 if (mask & AT_UID) { 2890 pzp->zp_uid = zfs_fuid_create(zfsvfs, 2891 vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); 2892 if (attrzp) { 2893 attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs, 2894 vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); 2895 } 2896 } 2897 2898 if (mask & AT_GID) { 2899 pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid, 2900 cr, ZFS_GROUP, tx, &fuidp); 2901 if (attrzp) 2902 attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs, 2903 vap->va_gid, cr, ZFS_GROUP, tx, &fuidp); 2904 } 2905 2906 if (aclp) 2907 zfs_acl_free(aclp); 2908 2909 if (attrzp) 2910 mutex_exit(&attrzp->z_lock); 2911 2912 if (mask & AT_ATIME) 2913 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); 2914 2915 if (mask & AT_MTIME) 2916 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); 2917 2918 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ 2919 if (mask & AT_SIZE) 2920 zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); 2921 else if (mask != 0) 2922 zfs_time_stamper_locked(zp, STATE_CHANGED, tx); 2923 /* 2924 * Do this after setting timestamps to prevent timestamp 2925 * update from toggling bit 2926 */ 2927 2928 if (xoap && (mask & AT_XVATTR)) { 2929 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 2930 size_t len; 2931 dmu_object_info_t doi; 2932 2933 ASSERT(vp->v_type == VREG); 2934 2935 /* Grow the bonus buffer if necessary. */ 2936 dmu_object_info_from_db(zp->z_dbuf, &doi); 2937 len = sizeof (xoap->xoa_av_scanstamp) + 2938 sizeof (znode_phys_t); 2939 if (len > doi.doi_bonus_size) 2940 VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0); 2941 } 2942 zfs_xvattr_set(zp, xvap); 2943 } 2944 2945 if (mask != 0) 2946 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 2947 2948 if (fuidp) 2949 zfs_fuid_info_free(fuidp); 2950 mutex_exit(&zp->z_lock); 2951 2952 if (attrzp) 2953 VN_RELE(ZTOV(attrzp)); 2954 2955 dmu_tx_commit(tx); 2956 2957 ZFS_EXIT(zfsvfs); 2958 return (err); 2959} 2960 2961typedef struct zfs_zlock { 2962 krwlock_t *zl_rwlock; /* lock we acquired */ 2963 znode_t *zl_znode; /* znode we held */ 2964 struct zfs_zlock *zl_next; /* next in list */ 2965} zfs_zlock_t; 2966 2967/* 2968 * Drop locks and release vnodes that were held by zfs_rename_lock(). 2969 */ 2970static void 2971zfs_rename_unlock(zfs_zlock_t **zlpp) 2972{ 2973 zfs_zlock_t *zl; 2974 2975 while ((zl = *zlpp) != NULL) { 2976 if (zl->zl_znode != NULL) 2977 VN_RELE(ZTOV(zl->zl_znode)); 2978 rw_exit(zl->zl_rwlock); 2979 *zlpp = zl->zl_next; 2980 kmem_free(zl, sizeof (*zl)); 2981 } 2982} 2983 2984/* 2985 * Search back through the directory tree, using the ".." entries. 2986 * Lock each directory in the chain to prevent concurrent renames. 2987 * Fail any attempt to move a directory into one of its own descendants. 2988 * XXX - z_parent_lock can overlap with map or grow locks 2989 */ 2990static int 2991zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 2992{ 2993 zfs_zlock_t *zl; 2994 znode_t *zp = tdzp; 2995 uint64_t rootid = zp->z_zfsvfs->z_root; 2996 uint64_t *oidp = &zp->z_id; 2997 krwlock_t *rwlp = &szp->z_parent_lock; 2998 krw_t rw = RW_WRITER; 2999 3000 /* 3001 * First pass write-locks szp and compares to zp->z_id. 3002 * Later passes read-lock zp and compare to zp->z_parent. 3003 */ 3004 do { 3005 if (!rw_tryenter(rwlp, rw)) { 3006 /* 3007 * Another thread is renaming in this path. 3008 * Note that if we are a WRITER, we don't have any 3009 * parent_locks held yet. 3010 */ 3011 if (rw == RW_READER && zp->z_id > szp->z_id) { 3012 /* 3013 * Drop our locks and restart 3014 */ 3015 zfs_rename_unlock(&zl); 3016 *zlpp = NULL; 3017 zp = tdzp; 3018 oidp = &zp->z_id; 3019 rwlp = &szp->z_parent_lock; 3020 rw = RW_WRITER; 3021 continue; 3022 } else { 3023 /* 3024 * Wait for other thread to drop its locks 3025 */ 3026 rw_enter(rwlp, rw); 3027 } 3028 } 3029 3030 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 3031 zl->zl_rwlock = rwlp; 3032 zl->zl_znode = NULL; 3033 zl->zl_next = *zlpp; 3034 *zlpp = zl; 3035 3036 if (*oidp == szp->z_id) /* We're a descendant of szp */ 3037 return (EINVAL); 3038 3039 if (*oidp == rootid) /* We've hit the top */ 3040 return (0); 3041 3042 if (rw == RW_READER) { /* i.e. not the first pass */ 3043 int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); 3044 if (error) 3045 return (error); 3046 zl->zl_znode = zp; 3047 } 3048 oidp = &zp->z_phys->zp_parent; 3049 rwlp = &zp->z_parent_lock; 3050 rw = RW_READER; 3051 3052 } while (zp->z_id != sdzp->z_id); 3053 3054 return (0); 3055} 3056 3057/* 3058 * Move an entry from the provided source directory to the target 3059 * directory. Change the entry name as indicated. 3060 * 3061 * IN: sdvp - Source directory containing the "old entry". 3062 * snm - Old entry name. 3063 * tdvp - Target directory to contain the "new entry". 3064 * tnm - New entry name. 3065 * cr - credentials of caller. 3066 * ct - caller context 3067 * flags - case flags 3068 * 3069 * RETURN: 0 if success 3070 * error code if failure 3071 * 3072 * Timestamps: 3073 * sdvp,tdvp - ctime|mtime updated 3074 */ 3075/*ARGSUSED*/ 3076static int 3077zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, 3078 caller_context_t *ct, int flags) 3079{ 3080 znode_t *tdzp, *szp, *tzp; 3081 znode_t *sdzp = VTOZ(sdvp); 3082 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; 3083 zilog_t *zilog; 3084 vnode_t *realvp; 3085 zfs_dirlock_t *sdl, *tdl; 3086 dmu_tx_t *tx; 3087 zfs_zlock_t *zl; 3088 int cmp, serr, terr; 3089 int error = 0; 3090 int zflg = 0; 3091 3092 ZFS_ENTER(zfsvfs); 3093 ZFS_VERIFY_ZP(sdzp); 3094 zilog = zfsvfs->z_log; 3095 3096 /* 3097 * Make sure we have the real vp for the target directory. 3098 */ 3099 if (VOP_REALVP(tdvp, &realvp, ct) == 0) 3100 tdvp = realvp; 3101 3102 if (tdvp->v_vfsp != sdvp->v_vfsp) { 3103 ZFS_EXIT(zfsvfs); 3104 return (EXDEV); 3105 } 3106 3107 tdzp = VTOZ(tdvp); 3108 ZFS_VERIFY_ZP(tdzp); 3109 if (zfsvfs->z_utf8 && u8_validate(tnm, 3110 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3111 ZFS_EXIT(zfsvfs); 3112 return (EILSEQ); 3113 } 3114 3115 if (flags & FIGNORECASE) 3116 zflg |= ZCILOOK; 3117 3118top: 3119 szp = NULL; 3120 tzp = NULL; 3121 zl = NULL; 3122 3123 /* 3124 * This is to prevent the creation of links into attribute space 3125 * by renaming a linked file into/outof an attribute directory. 3126 * See the comment in zfs_link() for why this is considered bad. 3127 */ 3128 if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != 3129 (sdzp->z_phys->zp_flags & ZFS_XATTR)) { 3130 ZFS_EXIT(zfsvfs); 3131 return (EINVAL); 3132 } 3133 3134 /* 3135 * Lock source and target directory entries. To prevent deadlock, 3136 * a lock ordering must be defined. We lock the directory with 3137 * the smallest object id first, or if it's a tie, the one with 3138 * the lexically first name. 3139 */ 3140 if (sdzp->z_id < tdzp->z_id) { 3141 cmp = -1; 3142 } else if (sdzp->z_id > tdzp->z_id) { 3143 cmp = 1; 3144 } else { 3145 /* 3146 * First compare the two name arguments without 3147 * considering any case folding. 3148 */ 3149 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 3150 3151 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 3152 ASSERT(error == 0 || !zfsvfs->z_utf8); 3153 if (cmp == 0) { 3154 /* 3155 * POSIX: "If the old argument and the new argument 3156 * both refer to links to the same existing file, 3157 * the rename() function shall return successfully 3158 * and perform no other action." 3159 */ 3160 ZFS_EXIT(zfsvfs); 3161 return (0); 3162 } 3163 /* 3164 * If the file system is case-folding, then we may 3165 * have some more checking to do. A case-folding file 3166 * system is either supporting mixed case sensitivity 3167 * access or is completely case-insensitive. Note 3168 * that the file system is always case preserving. 3169 * 3170 * In mixed sensitivity mode case sensitive behavior 3171 * is the default. FIGNORECASE must be used to 3172 * explicitly request case insensitive behavior. 3173 * 3174 * If the source and target names provided differ only 3175 * by case (e.g., a request to rename 'tim' to 'Tim'), 3176 * we will treat this as a special case in the 3177 * case-insensitive mode: as long as the source name 3178 * is an exact match, we will allow this to proceed as 3179 * a name-change request. 3180 */ 3181 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 3182 (zfsvfs->z_case == ZFS_CASE_MIXED && 3183 flags & FIGNORECASE)) && 3184 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 3185 &error) == 0) { 3186 /* 3187 * case preserving rename request, require exact 3188 * name matches 3189 */ 3190 zflg |= ZCIEXACT; 3191 zflg &= ~ZCILOOK; 3192 } 3193 } 3194 3195 if (cmp < 0) { 3196 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 3197 ZEXISTS | zflg, NULL, NULL); 3198 terr = zfs_dirent_lock(&tdl, 3199 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 3200 } else { 3201 terr = zfs_dirent_lock(&tdl, 3202 tdzp, tnm, &tzp, zflg, NULL, NULL); 3203 serr = zfs_dirent_lock(&sdl, 3204 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 3205 NULL, NULL); 3206 } 3207 3208 if (serr) { 3209 /* 3210 * Source entry invalid or not there. 3211 */ 3212 if (!terr) { 3213 zfs_dirent_unlock(tdl); 3214 if (tzp) 3215 VN_RELE(ZTOV(tzp)); 3216 } 3217 if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) 3218 serr = EINVAL; 3219 ZFS_EXIT(zfsvfs); 3220 return (serr); 3221 } 3222 if (terr) { 3223 zfs_dirent_unlock(sdl); 3224 VN_RELE(ZTOV(szp)); 3225 if (strcmp(tnm, "..") == 0) 3226 terr = EINVAL; 3227 ZFS_EXIT(zfsvfs); 3228 return (terr); 3229 } 3230 3231 /* 3232 * Must have write access at the source to remove the old entry 3233 * and write access at the target to create the new entry. 3234 * Note that if target and source are the same, this can be 3235 * done in a single check. 3236 */ 3237 3238 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 3239 goto out; 3240 3241 if (ZTOV(szp)->v_type == VDIR) { 3242 /* 3243 * Check to make sure rename is valid. 3244 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 3245 */ 3246 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) 3247 goto out; 3248 } 3249 3250 /* 3251 * Does target exist? 3252 */ 3253 if (tzp) { 3254 /* 3255 * Source and target must be the same type. 3256 */ 3257 if (ZTOV(szp)->v_type == VDIR) { 3258 if (ZTOV(tzp)->v_type != VDIR) { 3259 error = ENOTDIR; 3260 goto out; 3261 } 3262 } else { 3263 if (ZTOV(tzp)->v_type == VDIR) { 3264 error = EISDIR; 3265 goto out; 3266 } 3267 } 3268 /* 3269 * POSIX dictates that when the source and target 3270 * entries refer to the same file object, rename 3271 * must do nothing and exit without error. 3272 */ 3273 if (szp->z_id == tzp->z_id) { 3274 error = 0; 3275 goto out; 3276 } 3277 } 3278 3279 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); 3280 if (tzp) 3281 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); 3282 3283 /* 3284 * notify the target directory if it is not the same 3285 * as source directory. 3286 */ 3287 if (tdvp != sdvp) { 3288 vnevent_rename_dest_dir(tdvp, ct); 3289 } 3290 3291 tx = dmu_tx_create(zfsvfs->z_os); 3292 dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ 3293 dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ 3294 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 3295 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 3296 if (sdzp != tdzp) 3297 dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ 3298 if (tzp) 3299 dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ 3300 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3301 error = dmu_tx_assign(tx, zfsvfs->z_assign); 3302 if (error) { 3303 if (zl != NULL) 3304 zfs_rename_unlock(&zl); 3305 zfs_dirent_unlock(sdl); 3306 zfs_dirent_unlock(tdl); 3307 VN_RELE(ZTOV(szp)); 3308 if (tzp) 3309 VN_RELE(ZTOV(tzp)); 3310 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 3311 dmu_tx_wait(tx); 3312 dmu_tx_abort(tx); 3313 goto top; 3314 } 3315 dmu_tx_abort(tx); 3316 ZFS_EXIT(zfsvfs); 3317 return (error); 3318 } 3319 3320 if (tzp) /* Attempt to remove the existing target */ 3321 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); 3322 3323 if (error == 0) { 3324 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3325 if (error == 0) { 3326 szp->z_phys->zp_flags |= ZFS_AV_MODIFIED; 3327 3328 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3329 ASSERT(error == 0); 3330 3331 zfs_log_rename(zilog, tx, 3332 TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), 3333 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); 3334 3335 /* Update path information for the target vnode */ 3336 vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm)); 3337 } 3338#ifdef FREEBSD_NAMECACHE 3339 if (error == 0) { 3340 cache_purge(sdvp); 3341 cache_purge(tdvp); 3342 } 3343#endif 3344 } 3345 3346 dmu_tx_commit(tx); 3347out: 3348 if (zl != NULL) 3349 zfs_rename_unlock(&zl); 3350 3351 zfs_dirent_unlock(sdl); 3352 zfs_dirent_unlock(tdl); 3353 3354 VN_RELE(ZTOV(szp)); 3355 if (tzp) 3356 VN_RELE(ZTOV(tzp)); 3357 3358 ZFS_EXIT(zfsvfs); 3359 3360 return (error); 3361} 3362 3363/* 3364 * Insert the indicated symbolic reference entry into the directory. 3365 * 3366 * IN: dvp - Directory to contain new symbolic link. 3367 * link - Name for new symlink entry. 3368 * vap - Attributes of new entry. 3369 * target - Target path of new symlink. 3370 * cr - credentials of caller. 3371 * ct - caller context 3372 * flags - case flags 3373 * 3374 * RETURN: 0 if success 3375 * error code if failure 3376 * 3377 * Timestamps: 3378 * dvp - ctime|mtime updated 3379 */ 3380/*ARGSUSED*/ 3381static int 3382zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, 3383 cred_t *cr, kthread_t *td) 3384{ 3385 znode_t *zp, *dzp = VTOZ(dvp); 3386 zfs_dirlock_t *dl; 3387 dmu_tx_t *tx; 3388 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3389 zilog_t *zilog; 3390 int len = strlen(link); 3391 int error; 3392 int zflg = ZNEW; 3393 zfs_fuid_info_t *fuidp = NULL; 3394 int flags = 0; 3395 3396 ASSERT(vap->va_type == VLNK); 3397 3398 ZFS_ENTER(zfsvfs); 3399 ZFS_VERIFY_ZP(dzp); 3400 zilog = zfsvfs->z_log; 3401 3402 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 3403 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3404 ZFS_EXIT(zfsvfs); 3405 return (EILSEQ); 3406 } 3407 if (flags & FIGNORECASE) 3408 zflg |= ZCILOOK; 3409top: 3410 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 3411 ZFS_EXIT(zfsvfs); 3412 return (error); 3413 } 3414 3415 if (len > MAXPATHLEN) { 3416 ZFS_EXIT(zfsvfs); 3417 return (ENAMETOOLONG); 3418 } 3419 3420 /* 3421 * Attempt to lock directory; fail if entry already exists. 3422 */ 3423 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 3424 if (error) { 3425 ZFS_EXIT(zfsvfs); 3426 return (error); 3427 } 3428 3429 tx = dmu_tx_create(zfsvfs->z_os); 3430 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 3431 dmu_tx_hold_bonus(tx, dzp->z_id); 3432 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3433 if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) 3434 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); 3435 if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { 3436 if (zfsvfs->z_fuid_obj == 0) { 3437 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 3438 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 3439 FUID_SIZE_ESTIMATE(zfsvfs)); 3440 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); 3441 } else { 3442 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 3443 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 3444 FUID_SIZE_ESTIMATE(zfsvfs)); 3445 } 3446 } 3447 error = dmu_tx_assign(tx, zfsvfs->z_assign); 3448 if (error) { 3449 zfs_dirent_unlock(dl); 3450 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 3451 dmu_tx_wait(tx); 3452 dmu_tx_abort(tx); 3453 goto top; 3454 } 3455 dmu_tx_abort(tx); 3456 ZFS_EXIT(zfsvfs); 3457 return (error); 3458 } 3459 3460 dmu_buf_will_dirty(dzp->z_dbuf, tx); 3461 3462 /* 3463 * Create a new object for the symlink. 3464 * Put the link content into bonus buffer if it will fit; 3465 * otherwise, store it just like any other file data. 3466 */ 3467 if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { 3468 zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp); 3469 if (len != 0) 3470 bcopy(link, zp->z_phys + 1, len); 3471 } else { 3472 dmu_buf_t *dbp; 3473 3474 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp); 3475 /* 3476 * Nothing can access the znode yet so no locking needed 3477 * for growing the znode's blocksize. 3478 */ 3479 zfs_grow_blocksize(zp, len, tx); 3480 3481 VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, 3482 zp->z_id, 0, FTAG, &dbp)); 3483 dmu_buf_will_dirty(dbp, tx); 3484 3485 ASSERT3U(len, <=, dbp->db_size); 3486 bcopy(link, dbp->db_data, len); 3487 dmu_buf_rele(dbp, FTAG); 3488 } 3489 zp->z_phys->zp_size = len; 3490 3491 /* 3492 * Insert the new object into the directory. 3493 */ 3494 (void) zfs_link_create(dl, zp, tx, ZNEW); 3495out: 3496 if (error == 0) { 3497 uint64_t txtype = TX_SYMLINK; 3498 if (flags & FIGNORECASE) 3499 txtype |= TX_CI; 3500 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 3501 *vpp = ZTOV(zp); 3502 } 3503 if (fuidp) 3504 zfs_fuid_info_free(fuidp); 3505 3506 dmu_tx_commit(tx); 3507 3508 zfs_dirent_unlock(dl); 3509 3510 ZFS_EXIT(zfsvfs); 3511 return (error); 3512} 3513 3514/* 3515 * Return, in the buffer contained in the provided uio structure, 3516 * the symbolic path referred to by vp. 3517 * 3518 * IN: vp - vnode of symbolic link. 3519 * uoip - structure to contain the link path. 3520 * cr - credentials of caller. 3521 * ct - caller context 3522 * 3523 * OUT: uio - structure to contain the link path. 3524 * 3525 * RETURN: 0 if success 3526 * error code if failure 3527 * 3528 * Timestamps: 3529 * vp - atime updated 3530 */ 3531/* ARGSUSED */ 3532static int 3533zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 3534{ 3535 znode_t *zp = VTOZ(vp); 3536 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3537 size_t bufsz; 3538 int error; 3539 3540 ZFS_ENTER(zfsvfs); 3541 ZFS_VERIFY_ZP(zp); 3542 3543 bufsz = (size_t)zp->z_phys->zp_size; 3544 if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { 3545 error = uiomove(zp->z_phys + 1, 3546 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 3547 } else { 3548 dmu_buf_t *dbp; 3549 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); 3550 if (error) { 3551 ZFS_EXIT(zfsvfs); 3552 return (error); 3553 } 3554 error = uiomove(dbp->db_data, 3555 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 3556 dmu_buf_rele(dbp, FTAG); 3557 } 3558 3559 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 3560 ZFS_EXIT(zfsvfs); 3561 return (error); 3562} 3563 3564/* 3565 * Insert a new entry into directory tdvp referencing svp. 3566 * 3567 * IN: tdvp - Directory to contain new entry. 3568 * svp - vnode of new entry. 3569 * name - name of new entry. 3570 * cr - credentials of caller. 3571 * ct - caller context 3572 * 3573 * RETURN: 0 if success 3574 * error code if failure 3575 * 3576 * Timestamps: 3577 * tdvp - ctime|mtime updated 3578 * svp - ctime updated 3579 */ 3580/* ARGSUSED */ 3581static int 3582zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 3583 caller_context_t *ct, int flags) 3584{ 3585 znode_t *dzp = VTOZ(tdvp); 3586 znode_t *tzp, *szp; 3587 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3588 zilog_t *zilog; 3589 zfs_dirlock_t *dl; 3590 dmu_tx_t *tx; 3591 vnode_t *realvp; 3592 int error; 3593 int zf = ZNEW; 3594 uid_t owner; 3595 3596 ASSERT(tdvp->v_type == VDIR); 3597 3598 ZFS_ENTER(zfsvfs); 3599 ZFS_VERIFY_ZP(dzp); 3600 zilog = zfsvfs->z_log; 3601 3602 if (VOP_REALVP(svp, &realvp, ct) == 0) 3603 svp = realvp; 3604 3605 if (svp->v_vfsp != tdvp->v_vfsp) { 3606 ZFS_EXIT(zfsvfs); 3607 return (EXDEV); 3608 } 3609 szp = VTOZ(svp); 3610 ZFS_VERIFY_ZP(szp); 3611 3612 if (zfsvfs->z_utf8 && u8_validate(name, 3613 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3614 ZFS_EXIT(zfsvfs); 3615 return (EILSEQ); 3616 } 3617 if (flags & FIGNORECASE) 3618 zf |= ZCILOOK; 3619 3620top: 3621 /* 3622 * We do not support links between attributes and non-attributes 3623 * because of the potential security risk of creating links 3624 * into "normal" file space in order to circumvent restrictions 3625 * imposed in attribute space. 3626 */ 3627 if ((szp->z_phys->zp_flags & ZFS_XATTR) != 3628 (dzp->z_phys->zp_flags & ZFS_XATTR)) { 3629 ZFS_EXIT(zfsvfs); 3630 return (EINVAL); 3631 } 3632 3633 /* 3634 * POSIX dictates that we return EPERM here. 3635 * Better choices include ENOTSUP or EISDIR. 3636 */ 3637 if (svp->v_type == VDIR) { 3638 ZFS_EXIT(zfsvfs); 3639 return (EPERM); 3640 } 3641 3642 owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); 3643 if (owner != crgetuid(cr) && 3644 secpolicy_basic_link(svp, cr) != 0) { 3645 ZFS_EXIT(zfsvfs); 3646 return (EPERM); 3647 } 3648 3649 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 3650 ZFS_EXIT(zfsvfs); 3651 return (error); 3652 } 3653 3654 /* 3655 * Attempt to lock directory; fail if entry already exists. 3656 */ 3657 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); 3658 if (error) { 3659 ZFS_EXIT(zfsvfs); 3660 return (error); 3661 } 3662 3663 tx = dmu_tx_create(zfsvfs->z_os); 3664 dmu_tx_hold_bonus(tx, szp->z_id); 3665 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3666 error = dmu_tx_assign(tx, zfsvfs->z_assign); 3667 if (error) { 3668 zfs_dirent_unlock(dl); 3669 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 3670 dmu_tx_wait(tx); 3671 dmu_tx_abort(tx); 3672 goto top; 3673 } 3674 dmu_tx_abort(tx); 3675 ZFS_EXIT(zfsvfs); 3676 return (error); 3677 } 3678 3679 error = zfs_link_create(dl, szp, tx, 0); 3680 3681 if (error == 0) { 3682 uint64_t txtype = TX_LINK; 3683 if (flags & FIGNORECASE) 3684 txtype |= TX_CI; 3685 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 3686 } 3687 3688 dmu_tx_commit(tx); 3689 3690 zfs_dirent_unlock(dl); 3691 3692 if (error == 0) { 3693 vnevent_link(svp, ct); 3694 } 3695 3696 ZFS_EXIT(zfsvfs); 3697 return (error); 3698} 3699 3700/*ARGSUSED*/ 3701void 3702zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 3703{ 3704 znode_t *zp = VTOZ(vp); 3705 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3706 int error; 3707 3708 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 3709 if (zp->z_dbuf == NULL) { 3710 /* 3711 * The fs has been unmounted, or we did a 3712 * suspend/resume and this file no longer exists. 3713 */ 3714 VI_LOCK(vp); 3715 vp->v_count = 0; /* count arrives as 1 */ 3716 VI_UNLOCK(vp); 3717 vrecycle(vp, curthread); 3718 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3719 return; 3720 } 3721 3722 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 3723 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 3724 3725 dmu_tx_hold_bonus(tx, zp->z_id); 3726 error = dmu_tx_assign(tx, TXG_WAIT); 3727 if (error) { 3728 dmu_tx_abort(tx); 3729 } else { 3730 dmu_buf_will_dirty(zp->z_dbuf, tx); 3731 mutex_enter(&zp->z_lock); 3732 zp->z_atime_dirty = 0; 3733 mutex_exit(&zp->z_lock); 3734 dmu_tx_commit(tx); 3735 } 3736 } 3737 3738 zfs_zinactive(zp); 3739 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3740} 3741 3742CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); 3743CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); 3744 3745/*ARGSUSED*/ 3746static int 3747zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 3748{ 3749 znode_t *zp = VTOZ(vp); 3750 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3751 uint32_t gen; 3752 uint64_t object = zp->z_id; 3753 zfid_short_t *zfid; 3754 int size, i; 3755 3756 ZFS_ENTER(zfsvfs); 3757 ZFS_VERIFY_ZP(zp); 3758 gen = (uint32_t)zp->z_gen; 3759 3760 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; 3761 fidp->fid_len = size; 3762 3763 zfid = (zfid_short_t *)fidp; 3764 3765 zfid->zf_len = size; 3766 3767 for (i = 0; i < sizeof (zfid->zf_object); i++) 3768 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 3769 3770 /* Must have a non-zero generation number to distinguish from .zfs */ 3771 if (gen == 0) 3772 gen = 1; 3773 for (i = 0; i < sizeof (zfid->zf_gen); i++) 3774 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 3775 3776 if (size == LONG_FID_LEN) { 3777 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 3778 zfid_long_t *zlfid; 3779 3780 zlfid = (zfid_long_t *)fidp; 3781 3782 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 3783 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 3784 3785 /* XXX - this should be the generation number for the objset */ 3786 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 3787 zlfid->zf_setgen[i] = 0; 3788 } 3789 3790 ZFS_EXIT(zfsvfs); 3791 return (0); 3792} 3793 3794static int 3795zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 3796 caller_context_t *ct) 3797{ 3798 znode_t *zp, *xzp; 3799 zfsvfs_t *zfsvfs; 3800 zfs_dirlock_t *dl; 3801 int error; 3802 3803 switch (cmd) { 3804 case _PC_LINK_MAX: 3805 *valp = INT_MAX; 3806 return (0); 3807 3808 case _PC_FILESIZEBITS: 3809 *valp = 64; 3810 return (0); 3811 3812#if 0 3813 case _PC_XATTR_EXISTS: 3814 zp = VTOZ(vp); 3815 zfsvfs = zp->z_zfsvfs; 3816 ZFS_ENTER(zfsvfs); 3817 ZFS_VERIFY_ZP(zp); 3818 *valp = 0; 3819 error = zfs_dirent_lock(&dl, zp, "", &xzp, 3820 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); 3821 if (error == 0) { 3822 zfs_dirent_unlock(dl); 3823 if (!zfs_dirempty(xzp)) 3824 *valp = 1; 3825 VN_RELE(ZTOV(xzp)); 3826 } else if (error == ENOENT) { 3827 /* 3828 * If there aren't extended attributes, it's the 3829 * same as having zero of them. 3830 */ 3831 error = 0; 3832 } 3833 ZFS_EXIT(zfsvfs); 3834 return (error); 3835#endif 3836 3837 case _PC_ACL_EXTENDED: 3838 *valp = 0; 3839 return (0); 3840 3841 case _PC_ACL_NFS4: 3842 *valp = 1; 3843 return (0); 3844 3845 case _PC_ACL_PATH_MAX: 3846 *valp = ACL_MAX_ENTRIES; 3847 return (0); 3848 3849 case _PC_MIN_HOLE_SIZE: 3850 *valp = (int)SPA_MINBLOCKSIZE; 3851 return (0); 3852 3853 default: 3854 return (EOPNOTSUPP); 3855 } 3856} 3857 3858/*ARGSUSED*/ 3859static int 3860zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 3861 caller_context_t *ct) 3862{ 3863 znode_t *zp = VTOZ(vp); 3864 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3865 int error; 3866 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 3867 3868 ZFS_ENTER(zfsvfs); 3869 ZFS_VERIFY_ZP(zp); 3870 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 3871 ZFS_EXIT(zfsvfs); 3872 3873 return (error); 3874} 3875 3876/*ARGSUSED*/ 3877static int 3878zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 3879 caller_context_t *ct) 3880{ 3881 znode_t *zp = VTOZ(vp); 3882 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3883 int error; 3884 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 3885 3886 ZFS_ENTER(zfsvfs); 3887 ZFS_VERIFY_ZP(zp); 3888 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 3889 ZFS_EXIT(zfsvfs); 3890 return (error); 3891} 3892 3893static int 3894zfs_freebsd_open(ap) 3895 struct vop_open_args /* { 3896 struct vnode *a_vp; 3897 int a_mode; 3898 struct ucred *a_cred; 3899 struct thread *a_td; 3900 } */ *ap; 3901{ 3902 vnode_t *vp = ap->a_vp; 3903 znode_t *zp = VTOZ(vp); 3904 int error; 3905 3906 error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); 3907 if (error == 0) 3908 vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td); 3909 return (error); 3910} 3911 3912static int 3913zfs_freebsd_close(ap) 3914 struct vop_close_args /* { 3915 struct vnode *a_vp; 3916 int a_fflag; 3917 struct ucred *a_cred; 3918 struct thread *a_td; 3919 } */ *ap; 3920{ 3921 3922 return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL)); 3923} 3924 3925static int 3926zfs_freebsd_ioctl(ap) 3927 struct vop_ioctl_args /* { 3928 struct vnode *a_vp; 3929 u_long a_command; 3930 caddr_t a_data; 3931 int a_fflag; 3932 struct ucred *cred; 3933 struct thread *td; 3934 } */ *ap; 3935{ 3936 3937 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, 3938 ap->a_fflag, ap->a_cred, NULL, NULL)); 3939} 3940 3941static int 3942zfs_freebsd_read(ap) 3943 struct vop_read_args /* { 3944 struct vnode *a_vp; 3945 struct uio *a_uio; 3946 int a_ioflag; 3947 struct ucred *a_cred; 3948 } */ *ap; 3949{ 3950 3951 return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); 3952} 3953 3954static int 3955zfs_freebsd_write(ap) 3956 struct vop_write_args /* { 3957 struct vnode *a_vp; 3958 struct uio *a_uio; 3959 int a_ioflag; 3960 struct ucred *a_cred; 3961 } */ *ap; 3962{ 3963 3964 return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); 3965} 3966 3967static int 3968zfs_freebsd_access(ap) 3969 struct vop_access_args /* { 3970 struct vnode *a_vp; 3971 accmode_t a_accmode; 3972 struct ucred *a_cred; 3973 struct thread *a_td; 3974 } */ *ap; 3975{ 3976 3977 /* 3978 * ZFS itself only knowns about VREAD, VWRITE and VEXEC, the rest 3979 * we have to handle by calling vaccess(). 3980 */ 3981 if ((ap->a_accmode & ~(VREAD|VWRITE|VEXEC)) != 0) { 3982 vnode_t *vp = ap->a_vp; 3983 znode_t *zp = VTOZ(vp); 3984 znode_phys_t *zphys = zp->z_phys; 3985 3986 return (vaccess(vp->v_type, zphys->zp_mode, zphys->zp_uid, 3987 zphys->zp_gid, ap->a_accmode, ap->a_cred, NULL)); 3988 } 3989 3990 return (zfs_access(ap->a_vp, ap->a_accmode, 0, ap->a_cred, NULL)); 3991} 3992 3993static int 3994zfs_freebsd_lookup(ap) 3995 struct vop_lookup_args /* { 3996 struct vnode *a_dvp; 3997 struct vnode **a_vpp; 3998 struct componentname *a_cnp; 3999 } */ *ap; 4000{ 4001 struct componentname *cnp = ap->a_cnp; 4002 char nm[NAME_MAX + 1]; 4003 4004 ASSERT(cnp->cn_namelen < sizeof(nm)); 4005 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); 4006 4007 return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, 4008 cnp->cn_cred, cnp->cn_thread, 0)); 4009} 4010 4011static int 4012zfs_freebsd_create(ap) 4013 struct vop_create_args /* { 4014 struct vnode *a_dvp; 4015 struct vnode **a_vpp; 4016 struct componentname *a_cnp; 4017 struct vattr *a_vap; 4018 } */ *ap; 4019{ 4020 struct componentname *cnp = ap->a_cnp; 4021 vattr_t *vap = ap->a_vap; 4022 int mode; 4023 4024 ASSERT(cnp->cn_flags & SAVENAME); 4025 4026 vattr_init_mask(vap); 4027 mode = vap->va_mode & ALLPERMS; 4028 4029 return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, 4030 ap->a_vpp, cnp->cn_cred, cnp->cn_thread)); 4031} 4032 4033static int 4034zfs_freebsd_remove(ap) 4035 struct vop_remove_args /* { 4036 struct vnode *a_dvp; 4037 struct vnode *a_vp; 4038 struct componentname *a_cnp; 4039 } */ *ap; 4040{ 4041 4042 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 4043 4044 return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr, 4045 ap->a_cnp->cn_cred, NULL, 0)); 4046} 4047 4048static int 4049zfs_freebsd_mkdir(ap) 4050 struct vop_mkdir_args /* { 4051 struct vnode *a_dvp; 4052 struct vnode **a_vpp; 4053 struct componentname *a_cnp; 4054 struct vattr *a_vap; 4055 } */ *ap; 4056{ 4057 vattr_t *vap = ap->a_vap; 4058 4059 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 4060 4061 vattr_init_mask(vap); 4062 4063 return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, 4064 ap->a_cnp->cn_cred, NULL, 0, NULL)); 4065} 4066 4067static int 4068zfs_freebsd_rmdir(ap) 4069 struct vop_rmdir_args /* { 4070 struct vnode *a_dvp; 4071 struct vnode *a_vp; 4072 struct componentname *a_cnp; 4073 } */ *ap; 4074{ 4075 struct componentname *cnp = ap->a_cnp; 4076 4077 ASSERT(cnp->cn_flags & SAVENAME); 4078 4079 return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0)); 4080} 4081 4082static int 4083zfs_freebsd_readdir(ap) 4084 struct vop_readdir_args /* { 4085 struct vnode *a_vp; 4086 struct uio *a_uio; 4087 struct ucred *a_cred; 4088 int *a_eofflag; 4089 int *a_ncookies; 4090 u_long **a_cookies; 4091 } */ *ap; 4092{ 4093 4094 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, 4095 ap->a_ncookies, ap->a_cookies)); 4096} 4097 4098static int 4099zfs_freebsd_fsync(ap) 4100 struct vop_fsync_args /* { 4101 struct vnode *a_vp; 4102 int a_waitfor; 4103 struct thread *a_td; 4104 } */ *ap; 4105{ 4106 4107 vop_stdfsync(ap); 4108 return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); 4109} 4110 4111static int 4112zfs_freebsd_getattr(ap) 4113 struct vop_getattr_args /* { 4114 struct vnode *a_vp; 4115 struct vattr *a_vap; 4116 struct ucred *a_cred; 4117 struct thread *a_td; 4118 } */ *ap; 4119{ 4120 vattr_t *vap = ap->a_vap; 4121 xvattr_t xvap; 4122 u_long fflags = 0; 4123 int error; 4124 4125 xva_init(&xvap); 4126 xvap.xva_vattr = *vap; 4127 xvap.xva_vattr.va_mask |= AT_XVATTR; 4128 4129 /* Convert chflags into ZFS-type flags. */ 4130 /* XXX: what about SF_SETTABLE?. */ 4131 XVA_SET_REQ(&xvap, XAT_IMMUTABLE); 4132 XVA_SET_REQ(&xvap, XAT_APPENDONLY); 4133 XVA_SET_REQ(&xvap, XAT_NOUNLINK); 4134 XVA_SET_REQ(&xvap, XAT_NODUMP); 4135 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); 4136 if (error != 0) 4137 return (error); 4138 4139 /* Convert ZFS xattr into chflags. */ 4140#define FLAG_CHECK(fflag, xflag, xfield) do { \ 4141 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ 4142 fflags |= (fflag); \ 4143} while (0) 4144 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, 4145 xvap.xva_xoptattrs.xoa_immutable); 4146 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, 4147 xvap.xva_xoptattrs.xoa_appendonly); 4148 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, 4149 xvap.xva_xoptattrs.xoa_nounlink); 4150 FLAG_CHECK(UF_NODUMP, XAT_NODUMP, 4151 xvap.xva_xoptattrs.xoa_nodump); 4152#undef FLAG_CHECK 4153 *vap = xvap.xva_vattr; 4154 vap->va_flags = fflags; 4155 return (0); 4156} 4157 4158static int 4159zfs_freebsd_setattr(ap) 4160 struct vop_setattr_args /* { 4161 struct vnode *a_vp; 4162 struct vattr *a_vap; 4163 struct ucred *a_cred; 4164 struct thread *a_td; 4165 } */ *ap; 4166{ 4167 vnode_t *vp = ap->a_vp; 4168 vattr_t *vap = ap->a_vap; 4169 cred_t *cred = ap->a_cred; 4170 xvattr_t xvap; 4171 u_long fflags; 4172 uint64_t zflags; 4173 4174 vattr_init_mask(vap); 4175 vap->va_mask &= ~AT_NOSET; 4176 4177 xva_init(&xvap); 4178 xvap.xva_vattr = *vap; 4179 4180 zflags = VTOZ(vp)->z_phys->zp_flags; 4181 4182 if (vap->va_flags != VNOVAL) { 4183 int error; 4184 4185 fflags = vap->va_flags; 4186 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0) 4187 return (EOPNOTSUPP); 4188 /* 4189 * Callers may only modify the file flags on objects they 4190 * have VADMIN rights for. 4191 */ 4192 if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) 4193 return (error); 4194 /* 4195 * Unprivileged processes are not permitted to unset system 4196 * flags, or modify flags if any system flags are set. 4197 * Privileged non-jail processes may not modify system flags 4198 * if securelevel > 0 and any existing system flags are set. 4199 * Privileged jail processes behave like privileged non-jail 4200 * processes if the security.jail.chflags_allowed sysctl is 4201 * is non-zero; otherwise, they behave like unprivileged 4202 * processes. 4203 */ 4204 if (priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { 4205 if (zflags & 4206 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 4207 error = securelevel_gt(cred, 0); 4208 if (error) 4209 return (error); 4210 } 4211 } else { 4212 if (zflags & 4213 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 4214 return (EPERM); 4215 } 4216 if (fflags & 4217 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { 4218 return (EPERM); 4219 } 4220 } 4221 4222#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ 4223 if (((fflags & (fflag)) && !(zflags & (zflag))) || \ 4224 ((zflags & (zflag)) && !(fflags & (fflag)))) { \ 4225 XVA_SET_REQ(&xvap, (xflag)); \ 4226 (xfield) = ((fflags & (fflag)) != 0); \ 4227 } \ 4228} while (0) 4229 /* Convert chflags into ZFS-type flags. */ 4230 /* XXX: what about SF_SETTABLE?. */ 4231 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, 4232 xvap.xva_xoptattrs.xoa_immutable); 4233 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, 4234 xvap.xva_xoptattrs.xoa_appendonly); 4235 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, 4236 xvap.xva_xoptattrs.xoa_nounlink); 4237 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, 4238 xvap.xva_xoptattrs.xoa_nodump); 4239#undef FLAG_CHANGE 4240 } 4241 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); 4242} 4243 4244static int 4245zfs_freebsd_rename(ap) 4246 struct vop_rename_args /* { 4247 struct vnode *a_fdvp; 4248 struct vnode *a_fvp; 4249 struct componentname *a_fcnp; 4250 struct vnode *a_tdvp; 4251 struct vnode *a_tvp; 4252 struct componentname *a_tcnp; 4253 } */ *ap; 4254{ 4255 vnode_t *fdvp = ap->a_fdvp; 4256 vnode_t *fvp = ap->a_fvp; 4257 vnode_t *tdvp = ap->a_tdvp; 4258 vnode_t *tvp = ap->a_tvp; 4259 int error; 4260 4261 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); 4262 ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); 4263 4264 error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp, 4265 ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0); 4266 4267 if (tdvp == tvp) 4268 VN_RELE(tdvp); 4269 else 4270 VN_URELE(tdvp); 4271 if (tvp) 4272 VN_URELE(tvp); 4273 VN_RELE(fdvp); 4274 VN_RELE(fvp); 4275 4276 return (error); 4277} 4278 4279static int 4280zfs_freebsd_symlink(ap) 4281 struct vop_symlink_args /* { 4282 struct vnode *a_dvp; 4283 struct vnode **a_vpp; 4284 struct componentname *a_cnp; 4285 struct vattr *a_vap; 4286 char *a_target; 4287 } */ *ap; 4288{ 4289 struct componentname *cnp = ap->a_cnp; 4290 vattr_t *vap = ap->a_vap; 4291 4292 ASSERT(cnp->cn_flags & SAVENAME); 4293 4294 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ 4295 vattr_init_mask(vap); 4296 4297 return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, 4298 ap->a_target, cnp->cn_cred, cnp->cn_thread)); 4299} 4300 4301static int 4302zfs_freebsd_readlink(ap) 4303 struct vop_readlink_args /* { 4304 struct vnode *a_vp; 4305 struct uio *a_uio; 4306 struct ucred *a_cred; 4307 } */ *ap; 4308{ 4309 4310 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); 4311} 4312 4313static int 4314zfs_freebsd_link(ap) 4315 struct vop_link_args /* { 4316 struct vnode *a_tdvp; 4317 struct vnode *a_vp; 4318 struct componentname *a_cnp; 4319 } */ *ap; 4320{ 4321 struct componentname *cnp = ap->a_cnp; 4322 4323 ASSERT(cnp->cn_flags & SAVENAME); 4324 4325 return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); 4326} 4327 4328static int 4329zfs_freebsd_inactive(ap) 4330 struct vop_inactive_args /* { 4331 struct vnode *a_vp; 4332 struct thread *a_td; 4333 } */ *ap; 4334{ 4335 vnode_t *vp = ap->a_vp; 4336 4337 zfs_inactive(vp, ap->a_td->td_ucred, NULL); 4338 return (0); 4339} 4340 4341static void 4342zfs_reclaim_complete(void *arg, int pending) 4343{ 4344 znode_t *zp = arg; 4345 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4346 4347 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4348 if (zp->z_dbuf != NULL) { 4349 ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); 4350 zfs_znode_dmu_fini(zp); 4351 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); 4352 } 4353 zfs_znode_free(zp); 4354 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4355 /* 4356 * If the file system is being unmounted, there is a process waiting 4357 * for us, wake it up. 4358 */ 4359 if (zfsvfs->z_unmounted) 4360 wakeup_one(zfsvfs); 4361} 4362 4363static int 4364zfs_freebsd_reclaim(ap) 4365 struct vop_reclaim_args /* { 4366 struct vnode *a_vp; 4367 struct thread *a_td; 4368 } */ *ap; 4369{ 4370 vnode_t *vp = ap->a_vp; 4371 znode_t *zp = VTOZ(vp); 4372 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4373 4374 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4375 4376 ASSERT(zp != NULL); 4377 4378 /* 4379 * Destroy the vm object and flush associated pages. 4380 */ 4381 vnode_destroy_vobject(vp); 4382 4383 mutex_enter(&zp->z_lock); 4384 ASSERT(zp->z_phys != NULL); 4385 zp->z_vnode = NULL; 4386 mutex_exit(&zp->z_lock); 4387 4388 if (zp->z_unlinked) 4389 ; /* Do nothing. */ 4390 else if (zp->z_dbuf == NULL) 4391 zfs_znode_free(zp); 4392 else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ { 4393 int locked; 4394 4395 locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 : 4396 ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id); 4397 if (locked == 0) { 4398 /* 4399 * Lock can't be obtained due to deadlock possibility, 4400 * so defer znode destruction. 4401 */ 4402 TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp); 4403 taskqueue_enqueue(taskqueue_thread, &zp->z_task); 4404 } else { 4405 zfs_znode_dmu_fini(zp); 4406 if (locked == 1) 4407 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); 4408 zfs_znode_free(zp); 4409 } 4410 } 4411 VI_LOCK(vp); 4412 vp->v_data = NULL; 4413 ASSERT(vp->v_holdcnt >= 1); 4414 VI_UNLOCK(vp); 4415 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4416 return (0); 4417} 4418 4419static int 4420zfs_freebsd_fid(ap) 4421 struct vop_fid_args /* { 4422 struct vnode *a_vp; 4423 struct fid *a_fid; 4424 } */ *ap; 4425{ 4426 4427 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); 4428} 4429 4430static int 4431zfs_freebsd_pathconf(ap) 4432 struct vop_pathconf_args /* { 4433 struct vnode *a_vp; 4434 int a_name; 4435 register_t *a_retval; 4436 } */ *ap; 4437{ 4438 ulong_t val; 4439 int error; 4440 4441 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); 4442 if (error == 0) 4443 *ap->a_retval = val; 4444 else if (error == EOPNOTSUPP) 4445 error = vop_stdpathconf(ap); 4446 return (error); 4447} 4448 4449static int 4450zfs_freebsd_fifo_pathconf(ap) 4451 struct vop_pathconf_args /* { 4452 struct vnode *a_vp; 4453 int a_name; 4454 register_t *a_retval; 4455 } */ *ap; 4456{ 4457 4458 switch (ap->a_name) { 4459 case _PC_ACL_EXTENDED: 4460 case _PC_ACL_NFS4: 4461 case _PC_ACL_PATH_MAX: 4462 case _PC_MAC_PRESENT: 4463 return (zfs_freebsd_pathconf(ap)); 4464 default: 4465 return (fifo_specops.vop_pathconf(ap)); 4466 } 4467} 4468 4469/* 4470 * FreeBSD's extended attributes namespace defines file name prefix for ZFS' 4471 * extended attribute name: 4472 * 4473 * NAMESPACE PREFIX 4474 * system freebsd:system: 4475 * user (none, can be used to access ZFS fsattr(5) attributes 4476 * created on Solaris) 4477 */ 4478static int 4479zfs_create_attrname(int attrnamespace, const char *name, char *attrname, 4480 size_t size) 4481{ 4482 const char *namespace, *prefix, *suffix; 4483 4484 /* We don't allow '/' character in attribute name. */ 4485 if (strchr(name, '/') != NULL) 4486 return (EINVAL); 4487 /* We don't allow attribute names that start with "freebsd:" string. */ 4488 if (strncmp(name, "freebsd:", 8) == 0) 4489 return (EINVAL); 4490 4491 bzero(attrname, size); 4492 4493 switch (attrnamespace) { 4494 case EXTATTR_NAMESPACE_USER: 4495#if 0 4496 prefix = "freebsd:"; 4497 namespace = EXTATTR_NAMESPACE_USER_STRING; 4498 suffix = ":"; 4499#else 4500 /* 4501 * This is the default namespace by which we can access all 4502 * attributes created on Solaris. 4503 */ 4504 prefix = namespace = suffix = ""; 4505#endif 4506 break; 4507 case EXTATTR_NAMESPACE_SYSTEM: 4508 prefix = "freebsd:"; 4509 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; 4510 suffix = ":"; 4511 break; 4512 case EXTATTR_NAMESPACE_EMPTY: 4513 default: 4514 return (EINVAL); 4515 } 4516 if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, 4517 name) >= size) { 4518 return (ENAMETOOLONG); 4519 } 4520 return (0); 4521} 4522 4523/* 4524 * Vnode operating to retrieve a named extended attribute. 4525 */ 4526static int 4527zfs_getextattr(struct vop_getextattr_args *ap) 4528/* 4529vop_getextattr { 4530 IN struct vnode *a_vp; 4531 IN int a_attrnamespace; 4532 IN const char *a_name; 4533 INOUT struct uio *a_uio; 4534 OUT size_t *a_size; 4535 IN struct ucred *a_cred; 4536 IN struct thread *a_td; 4537}; 4538*/ 4539{ 4540 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 4541 struct thread *td = ap->a_td; 4542 struct nameidata nd; 4543 char attrname[255]; 4544 struct vattr va; 4545 vnode_t *xvp = NULL, *vp; 4546 int error, flags; 4547 4548 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 4549 ap->a_cred, ap->a_td, VREAD); 4550 if (error != 0) 4551 return (error); 4552 4553 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 4554 sizeof(attrname)); 4555 if (error != 0) 4556 return (error); 4557 4558 ZFS_ENTER(zfsvfs); 4559 4560 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 4561 LOOKUP_XATTR); 4562 if (error != 0) { 4563 ZFS_EXIT(zfsvfs); 4564 return (error); 4565 } 4566 4567 flags = FREAD; 4568 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname, 4569 xvp, td); 4570 error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); 4571 vp = nd.ni_vp; 4572 NDFREE(&nd, NDF_ONLY_PNBUF); 4573 if (error != 0) { 4574 ZFS_EXIT(zfsvfs); 4575 if (error == ENOENT) 4576 error = ENOATTR; 4577 return (error); 4578 } 4579 4580 if (ap->a_size != NULL) { 4581 error = VOP_GETATTR(vp, &va, ap->a_cred); 4582 if (error == 0) 4583 *ap->a_size = (size_t)va.va_size; 4584 } else if (ap->a_uio != NULL) 4585 error = VOP_READ(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred); 4586 4587 VOP_UNLOCK(vp, 0); 4588 vn_close(vp, flags, ap->a_cred, td); 4589 ZFS_EXIT(zfsvfs); 4590 4591 return (error); 4592} 4593 4594/* 4595 * Vnode operation to remove a named attribute. 4596 */ 4597int 4598zfs_deleteextattr(struct vop_deleteextattr_args *ap) 4599/* 4600vop_deleteextattr { 4601 IN struct vnode *a_vp; 4602 IN int a_attrnamespace; 4603 IN const char *a_name; 4604 IN struct ucred *a_cred; 4605 IN struct thread *a_td; 4606}; 4607*/ 4608{ 4609 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 4610 struct thread *td = ap->a_td; 4611 struct nameidata nd; 4612 char attrname[255]; 4613 struct vattr va; 4614 vnode_t *xvp = NULL, *vp; 4615 int error, flags; 4616 4617 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 4618 ap->a_cred, ap->a_td, VWRITE); 4619 if (error != 0) 4620 return (error); 4621 4622 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 4623 sizeof(attrname)); 4624 if (error != 0) 4625 return (error); 4626 4627 ZFS_ENTER(zfsvfs); 4628 4629 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 4630 LOOKUP_XATTR); 4631 if (error != 0) { 4632 ZFS_EXIT(zfsvfs); 4633 return (error); 4634 } 4635 4636 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE, 4637 UIO_SYSSPACE, attrname, xvp, td); 4638 error = namei(&nd); 4639 vp = nd.ni_vp; 4640 NDFREE(&nd, NDF_ONLY_PNBUF); 4641 if (error != 0) { 4642 ZFS_EXIT(zfsvfs); 4643 if (error == ENOENT) 4644 error = ENOATTR; 4645 return (error); 4646 } 4647 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); 4648 4649 vput(nd.ni_dvp); 4650 if (vp == nd.ni_dvp) 4651 vrele(vp); 4652 else 4653 vput(vp); 4654 ZFS_EXIT(zfsvfs); 4655 4656 return (error); 4657} 4658 4659/* 4660 * Vnode operation to set a named attribute. 4661 */ 4662static int 4663zfs_setextattr(struct vop_setextattr_args *ap) 4664/* 4665vop_setextattr { 4666 IN struct vnode *a_vp; 4667 IN int a_attrnamespace; 4668 IN const char *a_name; 4669 INOUT struct uio *a_uio; 4670 IN struct ucred *a_cred; 4671 IN struct thread *a_td; 4672}; 4673*/ 4674{ 4675 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 4676 struct thread *td = ap->a_td; 4677 struct nameidata nd; 4678 char attrname[255]; 4679 struct vattr va; 4680 vnode_t *xvp = NULL, *vp; 4681 int error, flags; 4682 4683 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 4684 ap->a_cred, ap->a_td, VWRITE); 4685 if (error != 0) 4686 return (error); 4687 4688 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 4689 sizeof(attrname)); 4690 if (error != 0) 4691 return (error); 4692 4693 ZFS_ENTER(zfsvfs); 4694 4695 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 4696 LOOKUP_XATTR | CREATE_XATTR_DIR); 4697 if (error != 0) { 4698 ZFS_EXIT(zfsvfs); 4699 return (error); 4700 } 4701 4702 flags = FFLAGS(O_WRONLY | O_CREAT); 4703 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname, 4704 xvp, td); 4705 error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); 4706 vp = nd.ni_vp; 4707 NDFREE(&nd, NDF_ONLY_PNBUF); 4708 if (error != 0) { 4709 ZFS_EXIT(zfsvfs); 4710 return (error); 4711 } 4712 4713 VATTR_NULL(&va); 4714 va.va_size = 0; 4715 error = VOP_SETATTR(vp, &va, ap->a_cred); 4716 if (error == 0) 4717 VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred); 4718 4719 VOP_UNLOCK(vp, 0); 4720 vn_close(vp, flags, ap->a_cred, td); 4721 ZFS_EXIT(zfsvfs); 4722 4723 return (error); 4724} 4725 4726/* 4727 * Vnode operation to retrieve extended attributes on a vnode. 4728 */ 4729static int 4730zfs_listextattr(struct vop_listextattr_args *ap) 4731/* 4732vop_listextattr { 4733 IN struct vnode *a_vp; 4734 IN int a_attrnamespace; 4735 INOUT struct uio *a_uio; 4736 OUT size_t *a_size; 4737 IN struct ucred *a_cred; 4738 IN struct thread *a_td; 4739}; 4740*/ 4741{ 4742 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 4743 struct thread *td = ap->a_td; 4744 struct nameidata nd; 4745 char attrprefix[16]; 4746 u_char dirbuf[sizeof(struct dirent)]; 4747 struct dirent *dp; 4748 struct iovec aiov; 4749 struct uio auio, *uio = ap->a_uio; 4750 size_t *sizep = ap->a_size; 4751 size_t plen; 4752 vnode_t *xvp = NULL, *vp; 4753 int done, error, eof, pos; 4754 4755 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 4756 ap->a_cred, ap->a_td, VREAD); 4757 if (error != 0) 4758 return (error); 4759 4760 error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, 4761 sizeof(attrprefix)); 4762 if (error != 0) 4763 return (error); 4764 plen = strlen(attrprefix); 4765 4766 ZFS_ENTER(zfsvfs); 4767 4768 if (sizep != NULL) 4769 *sizep = 0; 4770 4771 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 4772 LOOKUP_XATTR); 4773 if (error != 0) { 4774 ZFS_EXIT(zfsvfs); 4775 /* 4776 * ENOATTR means that the EA directory does not yet exist, 4777 * i.e. there are no extended attributes there. 4778 */ 4779 if (error == ENOATTR) 4780 error = 0; 4781 return (error); 4782 } 4783 4784 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE, 4785 UIO_SYSSPACE, ".", xvp, td); 4786 error = namei(&nd); 4787 vp = nd.ni_vp; 4788 NDFREE(&nd, NDF_ONLY_PNBUF); 4789 if (error != 0) { 4790 ZFS_EXIT(zfsvfs); 4791 return (error); 4792 } 4793 4794 auio.uio_iov = &aiov; 4795 auio.uio_iovcnt = 1; 4796 auio.uio_segflg = UIO_SYSSPACE; 4797 auio.uio_td = td; 4798 auio.uio_rw = UIO_READ; 4799 auio.uio_offset = 0; 4800 4801 do { 4802 u_char nlen; 4803 4804 aiov.iov_base = (void *)dirbuf; 4805 aiov.iov_len = sizeof(dirbuf); 4806 auio.uio_resid = sizeof(dirbuf); 4807 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); 4808 done = sizeof(dirbuf) - auio.uio_resid; 4809 if (error != 0) 4810 break; 4811 for (pos = 0; pos < done;) { 4812 dp = (struct dirent *)(dirbuf + pos); 4813 pos += dp->d_reclen; 4814 /* 4815 * XXX: Temporarily we also accept DT_UNKNOWN, as this 4816 * is what we get when attribute was created on Solaris. 4817 */ 4818 if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) 4819 continue; 4820 if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) 4821 continue; 4822 else if (strncmp(dp->d_name, attrprefix, plen) != 0) 4823 continue; 4824 nlen = dp->d_namlen - plen; 4825 if (sizep != NULL) 4826 *sizep += 1 + nlen; 4827 else if (uio != NULL) { 4828 /* 4829 * Format of extattr name entry is one byte for 4830 * length and the rest for name. 4831 */ 4832 error = uiomove(&nlen, 1, uio->uio_rw, uio); 4833 if (error == 0) { 4834 error = uiomove(dp->d_name + plen, nlen, 4835 uio->uio_rw, uio); 4836 } 4837 if (error != 0) 4838 break; 4839 } 4840 } 4841 } while (!eof && error == 0); 4842 4843 vput(vp); 4844 ZFS_EXIT(zfsvfs); 4845 4846 return (error); 4847} 4848 4849int 4850zfs_freebsd_getacl(ap) 4851 struct vop_getacl_args /* { 4852 struct vnode *vp; 4853 acl_type_t type; 4854 struct acl *aclp; 4855 struct ucred *cred; 4856 struct thread *td; 4857 } */ *ap; 4858{ 4859 int error; 4860 vsecattr_t vsecattr; 4861 4862 if (ap->a_type != ACL_TYPE_NFS4) 4863 return (EOPNOTSUPP); 4864 4865 vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; 4866 if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) 4867 return (error); 4868 4869 error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); 4870 if (vsecattr.vsa_aclentp != NULL) 4871 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); 4872 4873 return (error); 4874} 4875 4876int 4877zfs_freebsd_setacl(ap) 4878 struct vop_setacl_args /* { 4879 struct vnode *vp; 4880 acl_type_t type; 4881 struct acl *aclp; 4882 struct ucred *cred; 4883 struct thread *td; 4884 } */ *ap; 4885{ 4886 int error; 4887 vsecattr_t vsecattr; 4888 int aclbsize; /* size of acl list in bytes */ 4889 aclent_t *aaclp; 4890 4891 if (ap->a_type != ACL_TYPE_NFS4) 4892 return (EOPNOTSUPP); 4893 4894 if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) 4895 return (EINVAL); 4896 4897 /* 4898 * With NFSv4 ACLs, chmod(2) may need to add additional entries, 4899 * splitting every entry into two and appending "canonical six" 4900 * entries at the end. Don't allow for setting an ACL that would 4901 * cause chmod(2) to run out of ACL entries. 4902 */ 4903 if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) 4904 return (ENOSPC); 4905 4906 vsecattr.vsa_mask = VSA_ACE; 4907 aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); 4908 vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); 4909 aaclp = vsecattr.vsa_aclentp; 4910 vsecattr.vsa_aclentsz = aclbsize; 4911 4912 aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); 4913 error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); 4914 kmem_free(aaclp, aclbsize); 4915 4916 return (error); 4917} 4918 4919int 4920zfs_freebsd_aclcheck(ap) 4921 struct vop_aclcheck_args /* { 4922 struct vnode *vp; 4923 acl_type_t type; 4924 struct acl *aclp; 4925 struct ucred *cred; 4926 struct thread *td; 4927 } */ *ap; 4928{ 4929 4930 return (EOPNOTSUPP); 4931} 4932 4933struct vop_vector zfs_vnodeops; 4934struct vop_vector zfs_fifoops; 4935 4936struct vop_vector zfs_vnodeops = { 4937 .vop_default = &default_vnodeops, 4938 .vop_inactive = zfs_freebsd_inactive, 4939 .vop_reclaim = zfs_freebsd_reclaim, 4940 .vop_access = zfs_freebsd_access, 4941#ifdef FREEBSD_NAMECACHE 4942 .vop_lookup = vfs_cache_lookup, 4943 .vop_cachedlookup = zfs_freebsd_lookup, 4944#else 4945 .vop_lookup = zfs_freebsd_lookup, 4946#endif 4947 .vop_getattr = zfs_freebsd_getattr, 4948 .vop_setattr = zfs_freebsd_setattr, 4949 .vop_create = zfs_freebsd_create, 4950 .vop_mknod = zfs_freebsd_create, 4951 .vop_mkdir = zfs_freebsd_mkdir, 4952 .vop_readdir = zfs_freebsd_readdir, 4953 .vop_fsync = zfs_freebsd_fsync, 4954 .vop_open = zfs_freebsd_open, 4955 .vop_close = zfs_freebsd_close, 4956 .vop_rmdir = zfs_freebsd_rmdir, 4957 .vop_ioctl = zfs_freebsd_ioctl, 4958 .vop_link = zfs_freebsd_link, 4959 .vop_symlink = zfs_freebsd_symlink, 4960 .vop_readlink = zfs_freebsd_readlink, 4961 .vop_read = zfs_freebsd_read, 4962 .vop_write = zfs_freebsd_write, 4963 .vop_remove = zfs_freebsd_remove, 4964 .vop_rename = zfs_freebsd_rename, 4965 .vop_pathconf = zfs_freebsd_pathconf, 4966 .vop_bmap = VOP_EOPNOTSUPP, 4967 .vop_fid = zfs_freebsd_fid, 4968 .vop_getextattr = zfs_getextattr, 4969 .vop_deleteextattr = zfs_deleteextattr, 4970 .vop_setextattr = zfs_setextattr, 4971 .vop_listextattr = zfs_listextattr, 4972 .vop_getacl = zfs_freebsd_getacl, 4973 .vop_setacl = zfs_freebsd_setacl, 4974 .vop_aclcheck = zfs_freebsd_aclcheck, 4975}; 4976 4977struct vop_vector zfs_fifoops = { 4978 .vop_default = &fifo_specops, 4979 .vop_fsync = VOP_PANIC, 4980 .vop_access = zfs_freebsd_access, 4981 .vop_getattr = zfs_freebsd_getattr, 4982 .vop_inactive = zfs_freebsd_inactive, 4983 .vop_read = VOP_PANIC, 4984 .vop_reclaim = zfs_freebsd_reclaim, 4985 .vop_setattr = zfs_freebsd_setattr, 4986 .vop_write = VOP_PANIC, 4987 .vop_pathconf = zfs_freebsd_fifo_pathconf, 4988 .vop_fid = zfs_freebsd_fid, 4989 .vop_getacl = zfs_freebsd_getacl, 4990 .vop_setacl = zfs_freebsd_setacl, 4991 .vop_aclcheck = zfs_freebsd_aclcheck, 4992}; 4993