/* zfs_vnops.c revision 209097 */
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26/* Portions Copyright 2007 Jeremy Teo */ 27 28#include <sys/types.h> 29#include <sys/param.h> 30#include <sys/time.h> 31#include <sys/systm.h> 32#include <sys/sysmacros.h> 33#include <sys/resource.h> 34#include <sys/vfs.h> 35#include <sys/vnode.h> 36#include <sys/file.h> 37#include <sys/stat.h> 38#include <sys/kmem.h> 39#include <sys/taskq.h> 40#include <sys/uio.h> 41#include <sys/atomic.h> 42#include <sys/namei.h> 43#include <sys/mman.h> 44#include <sys/cmn_err.h> 45#include <sys/errno.h> 46#include <sys/unistd.h> 47#include <sys/zfs_dir.h> 48#include <sys/zfs_ioctl.h> 49#include <sys/fs/zfs.h> 50#include <sys/dmu.h> 51#include <sys/spa.h> 52#include <sys/txg.h> 53#include <sys/dbuf.h> 54#include <sys/zap.h> 55#include <sys/dirent.h> 56#include <sys/policy.h> 57#include <sys/sunddi.h> 58#include <sys/filio.h> 59#include <sys/zfs_ctldir.h> 60#include <sys/zfs_fuid.h> 61#include <sys/dnlc.h> 62#include <sys/zfs_rlock.h> 63#include <sys/extdirent.h> 64#include <sys/kidmap.h> 65#include <sys/bio.h> 66#include 
<sys/buf.h> 67#include <sys/sf_buf.h> 68#include <sys/sched.h> 69#include <sys/acl.h> 70 71/* 72 * Programming rules. 73 * 74 * Each vnode op performs some logical unit of work. To do this, the ZPL must 75 * properly lock its in-core state, create a DMU transaction, do the work, 76 * record this work in the intent log (ZIL), commit the DMU transaction, 77 * and wait for the intent log to commit if it is a synchronous operation. 78 * Moreover, the vnode ops must work in both normal and log replay context. 79 * The ordering of events is important to avoid deadlocks and references 80 * to freed memory. The example below illustrates the following Big Rules: 81 * 82 * (1) A check must be made in each zfs thread for a mounted file system. 83 * This is done avoiding races using ZFS_ENTER(zfsvfs). 84 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 85 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 86 * can return EIO from the calling function. 87 * 88 * (2) VN_RELE() should always be the last thing except for zil_commit() 89 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 90 * First, if it's the last reference, the vnode/znode 91 * can be freed, so the zp may point to freed memory. Second, the last 92 * reference will call zfs_zinactive(), which may induce a lot of work -- 93 * pushing cached pages (which acquires range locks) and syncing out 94 * cached atime changes. Third, zfs_zinactive() may require a new tx, 95 * which could deadlock the system if you were already holding one. 96 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 97 * 98 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 99 * as they can span dmu_tx_assign() calls. 100 * 101 * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). 102 * In normal operation, this will be TXG_NOWAIT. During ZIL replay, 103 * it will be a specific txg. Either way, dmu_tx_assign() never blocks. 
104 * This is critical because we don't want to block while holding locks. 105 * Note, in particular, that if a lock is sometimes acquired before 106 * the tx assigns, and sometimes after (e.g. z_lock), then failing to 107 * use a non-blocking assign can deadlock the system. The scenario: 108 * 109 * Thread A has grabbed a lock before calling dmu_tx_assign(). 110 * Thread B is in an already-assigned tx, and blocks for this lock. 111 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 112 * forever, because the previous txg can't quiesce until B's tx commits. 113 * 114 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 115 * then drop all locks, call dmu_tx_wait(), and try again. 116 * 117 * (5) If the operation succeeded, generate the intent log entry for it 118 * before dropping locks. This ensures that the ordering of events 119 * in the intent log matches the order in which they actually occurred. 120 * 121 * (6) At the end of each vnode op, the DMU tx must always commit, 122 * regardless of whether there were any errors. 123 * 124 * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) 125 * to ensure that synchronous semantics are provided when necessary. 126 * 127 * In general, this is how things should be ordered in each vnode op: 128 * 129 * ZFS_ENTER(zfsvfs); // exit if unmounted 130 * top: 131 * zfs_dirent_lock(&dl, ...) 
// lock directory entry (may VN_HOLD()) 132 * rw_enter(...); // grab any other locks you need 133 * tx = dmu_tx_create(...); // get DMU tx 134 * dmu_tx_hold_*(); // hold each object you might modify 135 * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign 136 * if (error) { 137 * rw_exit(...); // drop locks 138 * zfs_dirent_unlock(dl); // unlock directory entry 139 * VN_RELE(...); // release held vnodes 140 * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 141 * dmu_tx_wait(tx); 142 * dmu_tx_abort(tx); 143 * goto top; 144 * } 145 * dmu_tx_abort(tx); // abort DMU tx 146 * ZFS_EXIT(zfsvfs); // finished in zfs 147 * return (error); // really out of space 148 * } 149 * error = do_real_work(); // do whatever this VOP does 150 * if (error == 0) 151 * zfs_log_*(...); // on success, make ZIL entry 152 * dmu_tx_commit(tx); // commit DMU tx -- error or not 153 * rw_exit(...); // drop locks 154 * zfs_dirent_unlock(dl); // unlock directory entry 155 * VN_RELE(...); // release held vnodes 156 * zil_commit(zilog, seq, foid); // synchronous when necessary 157 * ZFS_EXIT(zfsvfs); // finished in zfs 158 * return (error); // done, report error 159 */ 160 161/* ARGSUSED */ 162static int 163zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 164{ 165 znode_t *zp = VTOZ(*vpp); 166 167 if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && 168 ((flag & FAPPEND) == 0)) { 169 return (EPERM); 170 } 171 172 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 173 ZTOV(zp)->v_type == VREG && 174 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 175 zp->z_phys->zp_size > 0) 176 if (fs_vscan(*vpp, cr, 0) != 0) 177 return (EACCES); 178 179 /* Keep a count of the synchronous opens in the znode */ 180 if (flag & (FSYNC | FDSYNC)) 181 atomic_inc_32(&zp->z_sync_cnt); 182 183 return (0); 184} 185 186/* ARGSUSED */ 187static int 188zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 189 caller_context_t *ct) 190{ 191 znode_t *zp = 
VTOZ(vp); 192 193 /* Decrement the synchronous opens in the znode */ 194 if ((flag & (FSYNC | FDSYNC)) && (count == 1)) 195 atomic_dec_32(&zp->z_sync_cnt); 196 197 /* 198 * Clean up any locks held by this process on the vp. 199 */ 200 cleanlocks(vp, ddi_get_pid(), 0); 201 cleanshares(vp, ddi_get_pid()); 202 203 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 204 ZTOV(zp)->v_type == VREG && 205 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 206 zp->z_phys->zp_size > 0) 207 VERIFY(fs_vscan(vp, cr, 1) == 0); 208 209 return (0); 210} 211 212/* 213 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 214 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 215 */ 216static int 217zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) 218{ 219 znode_t *zp = VTOZ(vp); 220 uint64_t noff = (uint64_t)*off; /* new offset */ 221 uint64_t file_sz; 222 int error; 223 boolean_t hole; 224 225 file_sz = zp->z_phys->zp_size; 226 if (noff >= file_sz) { 227 return (ENXIO); 228 } 229 230 if (cmd == _FIO_SEEK_HOLE) 231 hole = B_TRUE; 232 else 233 hole = B_FALSE; 234 235 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); 236 237 /* end of file? */ 238 if ((error == ESRCH) || (noff > file_sz)) { 239 /* 240 * Handle the virtual hole at the end of file. 241 */ 242 if (hole) { 243 *off = file_sz; 244 return (0); 245 } 246 return (ENXIO); 247 } 248 249 if (noff < *off) 250 return (error); 251 *off = noff; 252 return (error); 253} 254 255/* ARGSUSED */ 256static int 257zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, 258 int *rvalp, caller_context_t *ct) 259{ 260 offset_t off; 261 int error; 262 zfsvfs_t *zfsvfs; 263 znode_t *zp; 264 265 switch (com) { 266 case _FIOFFS: 267 return (0); 268 269 /* 270 * The following two ioctls are used by bfu. Faking out, 271 * necessary to avoid bfu errors. 
272 */ 273 case _FIOGDIO: 274 case _FIOSDIO: 275 return (0); 276 277 case _FIO_SEEK_DATA: 278 case _FIO_SEEK_HOLE: 279 if (ddi_copyin((void *)data, &off, sizeof (off), flag)) 280 return (EFAULT); 281 282 zp = VTOZ(vp); 283 zfsvfs = zp->z_zfsvfs; 284 ZFS_ENTER(zfsvfs); 285 ZFS_VERIFY_ZP(zp); 286 287 /* offset parameter is in/out */ 288 error = zfs_holey(vp, com, &off); 289 ZFS_EXIT(zfsvfs); 290 if (error) 291 return (error); 292 if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) 293 return (EFAULT); 294 return (0); 295 } 296 return (ENOTTY); 297} 298 299/* 300 * When a file is memory mapped, we must keep the IO data synchronized 301 * between the DMU cache and the memory mapped pages. What this means: 302 * 303 * On Write: If we find a memory mapped page, we write to *both* 304 * the page and the dmu buffer. 305 * 306 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 307 * the file is memory mapped. 308 */ 309static int 310mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) 311{ 312 znode_t *zp = VTOZ(vp); 313 objset_t *os = zp->z_zfsvfs->z_os; 314 vm_object_t obj; 315 vm_page_t m; 316 struct sf_buf *sf; 317 int64_t start, off; 318 int len = nbytes; 319 int error = 0; 320 uint64_t dirbytes; 321 322 ASSERT(vp->v_mount != NULL); 323 obj = vp->v_object; 324 ASSERT(obj != NULL); 325 326 start = uio->uio_loffset; 327 off = start & PAGEOFFSET; 328 dirbytes = 0; 329 VM_OBJECT_LOCK(obj); 330 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 331 uint64_t bytes = MIN(PAGESIZE - off, len); 332 uint64_t fsize; 333 334again: 335 if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && 336 vm_page_is_valid(m, (vm_offset_t)off, bytes)) { 337 uint64_t woff; 338 caddr_t va; 339 340 if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb")) 341 goto again; 342 fsize = obj->un_pager.vnp.vnp_size; 343 vm_page_busy(m); 344 vm_page_lock_queues(); 345 vm_page_undirty(m); 346 vm_page_unlock_queues(); 347 VM_OBJECT_UNLOCK(obj); 348 if (dirbytes > 0) { 349 
error = dmu_write_uio(os, zp->z_id, uio, 350 dirbytes, tx); 351 dirbytes = 0; 352 } 353 if (error == 0) { 354 sched_pin(); 355 sf = sf_buf_alloc(m, SFB_CPUPRIVATE); 356 va = (caddr_t)sf_buf_kva(sf); 357 woff = uio->uio_loffset - off; 358 error = uiomove(va + off, bytes, UIO_WRITE, uio); 359 /* 360 * The uiomove() above could have been partially 361 * successful, that's why we call dmu_write() 362 * below unconditionally. The page was marked 363 * non-dirty above and we would lose the changes 364 * without doing so. If the uiomove() failed 365 * entirely, well, we just write what we got 366 * before one more time. 367 */ 368 dmu_write(os, zp->z_id, woff, 369 MIN(PAGESIZE, fsize - woff), va, tx); 370 sf_buf_free(sf); 371 sched_unpin(); 372 } 373 VM_OBJECT_LOCK(obj); 374 vm_page_wakeup(m); 375 } else { 376 if (__predict_false(obj->cache != NULL)) { 377 vm_page_cache_free(obj, OFF_TO_IDX(start), 378 OFF_TO_IDX(start) + 1); 379 } 380 dirbytes += bytes; 381 } 382 len -= bytes; 383 off = 0; 384 if (error) 385 break; 386 } 387 VM_OBJECT_UNLOCK(obj); 388 if (error == 0 && dirbytes > 0) 389 error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx); 390 return (error); 391} 392 393/* 394 * When a file is memory mapped, we must keep the IO data synchronized 395 * between the DMU cache and the memory mapped pages. What this means: 396 * 397 * On Read: We "read" preferentially from memory mapped pages, 398 * else we default from the dmu buffer. 399 * 400 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 401 * the file is memory mapped. 
402 */ 403static int 404mappedread(vnode_t *vp, int nbytes, uio_t *uio) 405{ 406 znode_t *zp = VTOZ(vp); 407 objset_t *os = zp->z_zfsvfs->z_os; 408 vm_object_t obj; 409 vm_page_t m; 410 struct sf_buf *sf; 411 int64_t start, off; 412 caddr_t va; 413 int len = nbytes; 414 int error = 0; 415 uint64_t dirbytes; 416 417 ASSERT(vp->v_mount != NULL); 418 obj = vp->v_object; 419 ASSERT(obj != NULL); 420 421 start = uio->uio_loffset; 422 off = start & PAGEOFFSET; 423 dirbytes = 0; 424 VM_OBJECT_LOCK(obj); 425 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 426 uint64_t bytes = MIN(PAGESIZE - off, len); 427 428again: 429 if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && 430 vm_page_is_valid(m, (vm_offset_t)off, bytes)) { 431 if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) 432 goto again; 433 vm_page_busy(m); 434 VM_OBJECT_UNLOCK(obj); 435 if (dirbytes > 0) { 436 error = dmu_read_uio(os, zp->z_id, uio, 437 dirbytes); 438 dirbytes = 0; 439 } 440 if (error == 0) { 441 sched_pin(); 442 sf = sf_buf_alloc(m, SFB_CPUPRIVATE); 443 va = (caddr_t)sf_buf_kva(sf); 444 error = uiomove(va + off, bytes, UIO_READ, uio); 445 sf_buf_free(sf); 446 sched_unpin(); 447 } 448 VM_OBJECT_LOCK(obj); 449 vm_page_wakeup(m); 450 } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) { 451 /* 452 * The code below is here to make sendfile(2) work 453 * correctly with ZFS. As pointed out by ups@ 454 * sendfile(2) should be changed to use VOP_GETPAGES(), 455 * but it pessimize performance of sendfile/UFS, that's 456 * why I handle this special case in ZFS code. 
457 */ 458 if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) 459 goto again; 460 vm_page_busy(m); 461 VM_OBJECT_UNLOCK(obj); 462 if (dirbytes > 0) { 463 error = dmu_read_uio(os, zp->z_id, uio, 464 dirbytes); 465 dirbytes = 0; 466 } 467 if (error == 0) { 468 sched_pin(); 469 sf = sf_buf_alloc(m, SFB_CPUPRIVATE); 470 va = (caddr_t)sf_buf_kva(sf); 471 error = dmu_read(os, zp->z_id, start + off, 472 bytes, (void *)(va + off)); 473 sf_buf_free(sf); 474 sched_unpin(); 475 } 476 VM_OBJECT_LOCK(obj); 477 vm_page_wakeup(m); 478 if (error == 0) 479 uio->uio_resid -= bytes; 480 } else { 481 dirbytes += bytes; 482 } 483 len -= bytes; 484 off = 0; 485 if (error) 486 break; 487 } 488 VM_OBJECT_UNLOCK(obj); 489 if (error == 0 && dirbytes > 0) 490 error = dmu_read_uio(os, zp->z_id, uio, dirbytes); 491 return (error); 492} 493 494offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 495 496/* 497 * Read bytes from specified file into supplied buffer. 498 * 499 * IN: vp - vnode of file to be read from. 500 * uio - structure supplying read location, range info, 501 * and return buffer. 502 * ioflag - SYNC flags; used to provide FRSYNC semantics. 503 * cr - credentials of caller. 504 * ct - caller context 505 * 506 * OUT: uio - updated offset and range, buffer filled. 
507 * 508 * RETURN: 0 if success 509 * error code if failure 510 * 511 * Side Effects: 512 * vp - atime updated if byte count > 0 513 */ 514/* ARGSUSED */ 515static int 516zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 517{ 518 znode_t *zp = VTOZ(vp); 519 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 520 objset_t *os; 521 ssize_t n, nbytes; 522 int error; 523 rl_t *rl; 524 525 ZFS_ENTER(zfsvfs); 526 ZFS_VERIFY_ZP(zp); 527 os = zfsvfs->z_os; 528 529 if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { 530 ZFS_EXIT(zfsvfs); 531 return (EACCES); 532 } 533 534 /* 535 * Validate file offset 536 */ 537 if (uio->uio_loffset < (offset_t)0) { 538 ZFS_EXIT(zfsvfs); 539 return (EINVAL); 540 } 541 542 /* 543 * Fasttrack empty reads 544 */ 545 if (uio->uio_resid == 0) { 546 ZFS_EXIT(zfsvfs); 547 return (0); 548 } 549 550 /* 551 * Check for mandatory locks 552 */ 553 if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { 554 if (error = chklock(vp, FREAD, 555 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 556 ZFS_EXIT(zfsvfs); 557 return (error); 558 } 559 } 560 561 /* 562 * If we're in FRSYNC mode, sync out this znode before reading it. 563 */ 564 if (ioflag & FRSYNC) 565 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); 566 567 /* 568 * Lock the range against changes. 569 */ 570 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); 571 572 /* 573 * If we are reading past end-of-file we can skip 574 * to the end; but we might still need to set atime. 
575 */ 576 if (uio->uio_loffset >= zp->z_phys->zp_size) { 577 error = 0; 578 goto out; 579 } 580 581 ASSERT(uio->uio_loffset < zp->z_phys->zp_size); 582 n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); 583 584 while (n > 0) { 585 nbytes = MIN(n, zfs_read_chunk_size - 586 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 587 588 if (vn_has_cached_data(vp)) 589 error = mappedread(vp, nbytes, uio); 590 else 591 error = dmu_read_uio(os, zp->z_id, uio, nbytes); 592 if (error) { 593 /* convert checksum errors into IO errors */ 594 if (error == ECKSUM) 595 error = EIO; 596 break; 597 } 598 599 n -= nbytes; 600 } 601 602out: 603 zfs_range_unlock(rl); 604 605 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 606 ZFS_EXIT(zfsvfs); 607 return (error); 608} 609 610/* 611 * Fault in the pages of the first n bytes specified by the uio structure. 612 * 1 byte in each page is touched and the uio struct is unmodified. 613 * Any error will exit this routine as this is only a best 614 * attempt to get the pages resident. This is a copy of ufs_trans_touch(). 615 */ 616static void 617zfs_prefault_write(ssize_t n, struct uio *uio) 618{ 619 struct iovec *iov; 620 ulong_t cnt, incr; 621 caddr_t p; 622 623 if (uio->uio_segflg != UIO_USERSPACE) 624 return; 625 626 iov = uio->uio_iov; 627 628 while (n) { 629 cnt = MIN(iov->iov_len, n); 630 if (cnt == 0) { 631 /* empty iov entry */ 632 iov++; 633 continue; 634 } 635 n -= cnt; 636 /* 637 * touch each page in this segment. 638 */ 639 p = iov->iov_base; 640 while (cnt) { 641 if (fubyte(p) == -1) 642 return; 643 incr = MIN(cnt, PAGESIZE); 644 p += incr; 645 cnt -= incr; 646 } 647 /* 648 * touch the last byte in case it straddles a page. 649 */ 650 p--; 651 if (fubyte(p) == -1) 652 return; 653 iov++; 654 } 655} 656 657/* 658 * Write the bytes to a file. 659 * 660 * IN: vp - vnode of file to be written to. 661 * uio - structure supplying write location, range info, 662 * and data buffer. 663 * ioflag - IO_APPEND flag set if in append mode. 
664 * cr - credentials of caller. 665 * ct - caller context (NFS/CIFS fem monitor only) 666 * 667 * OUT: uio - updated offset and range. 668 * 669 * RETURN: 0 if success 670 * error code if failure 671 * 672 * Timestamps: 673 * vp - ctime|mtime updated if byte count > 0 674 */ 675/* ARGSUSED */ 676static int 677zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 678{ 679 znode_t *zp = VTOZ(vp); 680 rlim64_t limit = MAXOFFSET_T; 681 ssize_t start_resid = uio->uio_resid; 682 ssize_t tx_bytes; 683 uint64_t end_size; 684 dmu_tx_t *tx; 685 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 686 zilog_t *zilog; 687 offset_t woff; 688 ssize_t n, nbytes; 689 rl_t *rl; 690 int max_blksz = zfsvfs->z_max_blksz; 691 uint64_t pflags; 692 int error; 693 694 /* 695 * Fasttrack empty write 696 */ 697 n = start_resid; 698 if (n == 0) 699 return (0); 700 701 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 702 limit = MAXOFFSET_T; 703 704 ZFS_ENTER(zfsvfs); 705 ZFS_VERIFY_ZP(zp); 706 707 /* 708 * If immutable or not appending then return EPERM 709 */ 710 pflags = zp->z_phys->zp_flags; 711 if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || 712 ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 713 (uio->uio_loffset < zp->z_phys->zp_size))) { 714 ZFS_EXIT(zfsvfs); 715 return (EPERM); 716 } 717 718 zilog = zfsvfs->z_log; 719 720 /* 721 * Pre-fault the pages to ensure slow (eg NFS) pages 722 * don't hold up txg. 723 */ 724 zfs_prefault_write(n, uio); 725 726 /* 727 * If in append mode, set the io offset pointer to eof. 728 */ 729 if (ioflag & IO_APPEND) { 730 /* 731 * Range lock for a file append: 732 * The value for the start of range will be determined by 733 * zfs_range_lock() (to guarantee append semantics). 734 * If this write will cause the block size to increase, 735 * zfs_range_lock() will lock the entire file, so we must 736 * later reduce the range after we grow the block size. 
737 */ 738 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 739 if (rl->r_len == UINT64_MAX) { 740 /* overlocked, zp_size can't change */ 741 woff = uio->uio_loffset = zp->z_phys->zp_size; 742 } else { 743 woff = uio->uio_loffset = rl->r_off; 744 } 745 } else { 746 woff = uio->uio_loffset; 747 /* 748 * Validate file offset 749 */ 750 if (woff < 0) { 751 ZFS_EXIT(zfsvfs); 752 return (EINVAL); 753 } 754 755 /* 756 * If we need to grow the block size then zfs_range_lock() 757 * will lock a wider range than we request here. 758 * Later after growing the block size we reduce the range. 759 */ 760 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 761 } 762 763 if (woff >= limit) { 764 zfs_range_unlock(rl); 765 ZFS_EXIT(zfsvfs); 766 return (EFBIG); 767 } 768 769 if ((woff + n) > limit || woff > (limit - n)) 770 n = limit - woff; 771 772 /* 773 * Check for mandatory locks 774 */ 775 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && 776 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 777 zfs_range_unlock(rl); 778 ZFS_EXIT(zfsvfs); 779 return (error); 780 } 781 end_size = MAX(zp->z_phys->zp_size, woff + n); 782 783 /* 784 * Write the file in reasonable size chunks. Each chunk is written 785 * in a separate transaction; this keeps the intent log records small 786 * and allows us to do more fine-grained space accounting. 787 */ 788 while (n > 0) { 789 /* 790 * Start a transaction. 791 */ 792 woff = uio->uio_loffset; 793 tx = dmu_tx_create(zfsvfs->z_os); 794 dmu_tx_hold_bonus(tx, zp->z_id); 795 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 796 error = dmu_tx_assign(tx, zfsvfs->z_assign); 797 if (error) { 798 if (error == ERESTART && 799 zfsvfs->z_assign == TXG_NOWAIT) { 800 dmu_tx_wait(tx); 801 dmu_tx_abort(tx); 802 continue; 803 } 804 dmu_tx_abort(tx); 805 break; 806 } 807 808 /* 809 * If zfs_range_lock() over-locked we grow the blocksize 810 * and then reduce the lock range. 
This will only happen 811 * on the first iteration since zfs_range_reduce() will 812 * shrink down r_len to the appropriate size. 813 */ 814 if (rl->r_len == UINT64_MAX) { 815 uint64_t new_blksz; 816 817 if (zp->z_blksz > max_blksz) { 818 ASSERT(!ISP2(zp->z_blksz)); 819 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); 820 } else { 821 new_blksz = MIN(end_size, max_blksz); 822 } 823 zfs_grow_blocksize(zp, new_blksz, tx); 824 zfs_range_reduce(rl, woff, n); 825 } 826 827 /* 828 * XXX - should we really limit each write to z_max_blksz? 829 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 830 */ 831 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 832 833 if (woff + nbytes > zp->z_phys->zp_size) 834 vnode_pager_setsize(vp, woff + nbytes); 835 836 rw_enter(&zp->z_map_lock, RW_READER); 837 838 tx_bytes = uio->uio_resid; 839 if (vn_has_cached_data(vp)) { 840 rw_exit(&zp->z_map_lock); 841 error = mappedwrite(vp, nbytes, uio, tx); 842 } else { 843 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, 844 uio, nbytes, tx); 845 rw_exit(&zp->z_map_lock); 846 } 847 tx_bytes -= uio->uio_resid; 848 849 /* 850 * If we made no progress, we're done. If we made even 851 * partial progress, update the znode and ZIL accordingly. 852 */ 853 if (tx_bytes == 0) { 854 dmu_tx_commit(tx); 855 ASSERT(error != 0); 856 break; 857 } 858 859 /* 860 * Clear Set-UID/Set-GID bits on successful write if not 861 * privileged and at least one of the excute bits is set. 862 * 863 * It would be nice to to this after all writes have 864 * been done, but that would still expose the ISUID/ISGID 865 * to another app after the partial write is committed. 866 * 867 * Note: we don't call zfs_fuid_map_id() here because 868 * user 0 is not an ephemeral uid. 
869 */ 870 mutex_enter(&zp->z_acl_lock); 871 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | 872 (S_IXUSR >> 6))) != 0 && 873 (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && 874 secpolicy_vnode_setid_retain(vp, cr, 875 (zp->z_phys->zp_mode & S_ISUID) != 0 && 876 zp->z_phys->zp_uid == 0) != 0) { 877 zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); 878 } 879 mutex_exit(&zp->z_acl_lock); 880 881 /* 882 * Update time stamp. NOTE: This marks the bonus buffer as 883 * dirty, so we don't have to do it again for zp_size. 884 */ 885 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 886 887 /* 888 * Update the file size (zp_size) if it has changed; 889 * account for possible concurrent updates. 890 */ 891 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) 892 (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, 893 uio->uio_loffset); 894 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 895 dmu_tx_commit(tx); 896 897 if (error != 0) 898 break; 899 ASSERT(tx_bytes == nbytes); 900 n -= nbytes; 901 } 902 903 zfs_range_unlock(rl); 904 905 /* 906 * If we're in replay mode, or we made no progress, return error. 907 * Otherwise, it's at least a partial write, so it's successful. 908 */ 909 if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { 910 ZFS_EXIT(zfsvfs); 911 return (error); 912 } 913 914 if (ioflag & (FSYNC | FDSYNC)) 915 zil_commit(zilog, zp->z_last_itx, zp->z_id); 916 917 ZFS_EXIT(zfsvfs); 918 return (0); 919} 920 921void 922zfs_get_done(dmu_buf_t *db, void *vzgd) 923{ 924 zgd_t *zgd = (zgd_t *)vzgd; 925 rl_t *rl = zgd->zgd_rl; 926 vnode_t *vp = ZTOV(rl->r_zp); 927 objset_t *os = rl->r_zp->z_zfsvfs->z_os; 928 int vfslocked; 929 930 vfslocked = VFS_LOCK_GIANT(vp->v_vfsp); 931 dmu_buf_rele(db, vzgd); 932 zfs_range_unlock(rl); 933 /* 934 * Release the vnode asynchronously as we currently have the 935 * txg stopped from syncing. 
936 */ 937 VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 938 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 939 kmem_free(zgd, sizeof (zgd_t)); 940 VFS_UNLOCK_GIANT(vfslocked); 941} 942 943/* 944 * Get data to generate a TX_WRITE intent log record. 945 */ 946int 947zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 948{ 949 zfsvfs_t *zfsvfs = arg; 950 objset_t *os = zfsvfs->z_os; 951 znode_t *zp; 952 uint64_t off = lr->lr_offset; 953 dmu_buf_t *db; 954 rl_t *rl; 955 zgd_t *zgd; 956 int dlen = lr->lr_length; /* length of user data */ 957 int error = 0; 958 959 ASSERT(zio); 960 ASSERT(dlen != 0); 961 962 /* 963 * Nothing to do if the file has been removed 964 */ 965 if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) 966 return (ENOENT); 967 if (zp->z_unlinked) { 968 /* 969 * Release the vnode asynchronously as we currently have the 970 * txg stopped from syncing. 971 */ 972 VN_RELE_ASYNC(ZTOV(zp), 973 dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 974 return (ENOENT); 975 } 976 977 /* 978 * Write records come in two flavors: immediate and indirect. 979 * For small writes it's cheaper to store the data with the 980 * log record (immediate); for large writes it's cheaper to 981 * sync the data and get a pointer to it (indirect) so that 982 * we don't have to write the data twice. 983 */ 984 if (buf != NULL) { /* immediate write */ 985 rl = zfs_range_lock(zp, off, dlen, RL_READER); 986 /* test for truncation needs to be done while range locked */ 987 if (off >= zp->z_phys->zp_size) { 988 error = ENOENT; 989 goto out; 990 } 991 VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); 992 } else { /* indirect write */ 993 uint64_t boff; /* block starting offset */ 994 995 /* 996 * Have to lock the whole block to ensure when it's 997 * written out and it's checksum is being calculated 998 * that no one can change the data. We need to re-check 999 * blocksize after we get the lock in case it's changed! 
1000 */ 1001 for (;;) { 1002 if (ISP2(zp->z_blksz)) { 1003 boff = P2ALIGN_TYPED(off, zp->z_blksz, 1004 uint64_t); 1005 } else { 1006 boff = 0; 1007 } 1008 dlen = zp->z_blksz; 1009 rl = zfs_range_lock(zp, boff, dlen, RL_READER); 1010 if (zp->z_blksz == dlen) 1011 break; 1012 zfs_range_unlock(rl); 1013 } 1014 /* test for truncation needs to be done while range locked */ 1015 if (off >= zp->z_phys->zp_size) { 1016 error = ENOENT; 1017 goto out; 1018 } 1019 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); 1020 zgd->zgd_rl = rl; 1021 zgd->zgd_zilog = zfsvfs->z_log; 1022 zgd->zgd_bp = &lr->lr_blkptr; 1023 VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); 1024 ASSERT(boff == db->db_offset); 1025 lr->lr_blkoff = off - boff; 1026 error = dmu_sync(zio, db, &lr->lr_blkptr, 1027 lr->lr_common.lrc_txg, zfs_get_done, zgd); 1028 ASSERT((error && error != EINPROGRESS) || 1029 lr->lr_length <= zp->z_blksz); 1030 if (error == 0) 1031 zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); 1032 /* 1033 * If we get EINPROGRESS, then we need to wait for a 1034 * write IO initiated by dmu_sync() to complete before 1035 * we can release this dbuf. We will finish everything 1036 * up in the zfs_get_done() callback. 1037 */ 1038 if (error == EINPROGRESS) 1039 return (0); 1040 dmu_buf_rele(db, zgd); 1041 kmem_free(zgd, sizeof (zgd_t)); 1042 } 1043out: 1044 zfs_range_unlock(rl); 1045 /* 1046 * Release the vnode asynchronously as we currently have the 1047 * txg stopped from syncing. 
 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
	return (error);
}

/*
 * Check access permissions on a vnode.
 *
 * Dispatches to the ACE-mask-aware check when the caller passed ACE
 * flags (V_ACE_MASK set in flag), otherwise falls back to the classic
 * rwx-style check.  Returns 0 on success or an errno on failure.
 */
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 * IN:	dvp	- vnode of directory to search.
 *	nm	- name of entry to lookup.
 *	pnp	- full pathname to lookup [UNUSED].
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	rdir	- root directory vnode [UNUSED].
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	direntflags - directory lookup flags
 *	realpnp	- returned pathname.
 *
 * OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	NA
 *
 * NOTE: this is the FreeBSD VOP_LOOKUP-shaped entry point; cnp/nameiop
 * carry the namei state that the illumos original received via
 * pnp/direntflags/realpnp (here stubbed to NULL).  td is unused.
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error;
	/* Placeholders for the illumos-only out-parameters. */
	int *direntflags = NULL;
	void *realpnp = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}
#endif

		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (ENOTDIR);
	}

	/*
	 * Check accessibility of directory.
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Reject names that are not valid UTF-8 when the fs demands it. */
	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0) {
		/*
		 * Convert device special files
		 */
		if (IS_DEVVP(*vpp)) {
			vnode_t *svp;

			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
			VN_RELE(*vpp);
			if (svp == NULL)
				error = ENOSYS;
			else
				*vpp = svp;
		}
	}

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				/*
				 * Missing last component on CREATE/RENAME is
				 * the normal "go ahead and make it" answer.
				 */
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}
	/*
	 * Lock the found vnode per the caller's cn_lkflags.  For ".." we
	 * must drop the parent lock first to preserve lock ordering, then
	 * reacquire it at its previous level.
	 */
	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
		int ltype = 0;

		if (cnp->cn_flags & ISDOTDOT) {
			ltype = VOP_ISLOCKED(dvp);
			VOP_UNLOCK(dvp, 0);
		}
		ZFS_EXIT(zfsvfs);
		error = vn_lock(*vpp, cnp->cn_lkflags);
		if (cnp->cn_flags & ISDOTDOT)
			vn_lock(dvp, ltype | LK_RETRY);
		if (error != 0) {
			VN_RELE(*vpp);
			*vpp = NULL;
			return (error);
		}
	} else {
		ZFS_EXIT(zfsvfs);
	}

#ifdef FREEBSD_NAMECACHE
	/*
	 * Insert name into cache (as non-existent) if appropriate.
	 */
	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
		cache_enter(dvp, *vpp, cnp);
	/*
	 * Insert name into cache if appropriate.
	 */
	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}
#endif

	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 * IN:	dvp	- vnode of directory to put new file entry in.
 *	name	- name of new file entry.
 *	vap	- attributes of new file.
 *	excl	- flag indicating exclusive or non-exclusive mode.
 *	mode	- mode to open file with.
 *	cr	- credentials of caller.
 *	flag	- large file flag [UNUSED].
 *	ct	- caller context
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created or trunc'd entry.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	zfs_acl_t	*aclp = NULL;
	zfs_fuid_info_t	*fuidp = NULL;
	/* FreeBSD port: no vsecp/flag arguments; keep the illumos logic dead. */
	void		*vsecp = NULL;
	int		flag = 0;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
top:
	*vpp = NULL;

	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (strcmp(name, "..") == 0)
				error = EISDIR;
			ZFS_EXIT(zfsvfs);
			if (aclp)
				zfs_acl_free(aclp);
			return (error);
		}
	}
	if (vsecp && aclp == NULL) {
		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
		if (error) {
			ZFS_EXIT(zfsvfs);
			if (dl)
				zfs_dirent_unlock(dl);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */
		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			error = EINVAL;
			goto out;
		}

		/*
		 * Declare every object the transaction may touch: the new
		 * znode's bonus, possibly the FUID table, the parent's bonus
		 * and ZAP entry, and (if ACLs are inherited/supplied) the
		 * spill for the new ACL.
		 */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
		    IS_EPHEMERAL(crgetgid(cr))) {
			if (zfsvfs->z_fuid_obj == 0) {
				dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
				    FUID_SIZE_ESTIMATE(zfsvfs));
				dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
				    FALSE, NULL);
			} else {
				dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
				dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
				    FUID_SIZE_ESTIMATE(zfsvfs));
			}
		}
		dmu_tx_hold_bonus(tx, dzp->z_id);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, SPA_MAXBLOCKSIZE);
		}
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			/* Drop the dirent lock before waiting, then retry. */
			zfs_dirent_unlock(dl);
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			if (aclp)
				zfs_acl_free(aclp);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
		(void) zfs_link_create(dl, zp, tx, ZNEW);
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, fuidp, vap);
		if (fuidp)
			zfs_fuid_info_free(fuidp);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = EEXIST;
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = EISDIR;
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				/*
				 * NOTE(review): 'ct' is not visibly declared
				 * in this FreeBSD signature or its locals —
				 * presumably provided by a macro elsewhere;
				 * verify before touching this call.
				 */
				vnevent_create(ZTOV(zp), ct);
			}
		}
	}
out:
	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		*vpp = ZTOV(zp);
		/*
		 * If vnode is for a device return a specfs vnode instead.
		 */
		if (IS_DEVVP(*vpp)) {
			struct vnode *svp;

			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
			VN_RELE(*vpp);
			if (svp == NULL) {
				error = ENOSYS;
			}
			*vpp = svp;
		}
	}
	if (aclp)
		zfs_acl_free(aclp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 * IN:	dvp	- vnode of directory to remove entry from.
 *	name	- name of entry to remove.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp = NULL;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = EPERM;
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	/*
	 * NOTE(review): may_delete_now is forced FALSE here (and the
	 * delete_now computation below is compiled out with "0 &&"), so
	 * removed files always go through the unlinked set rather than
	 * being destroyed inline.  This looks like a deliberate choice of
	 * this FreeBSD port — confirm before re-enabling either path.
	 */
	may_delete_now = FALSE;

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (may_delete_now) {
		toobig =
		    zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
		/* XXX - do we need this if we are deleting? */
		dmu_tx_hold_bonus(tx, xattr_obj);
	}

	/* are there any additional acls */
	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
	    may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	/* Disabled: see the may_delete_now note above. */
	if (0 && unlinked) {
		VI_LOCK(vp);
		delete_now = may_delete_now && !toobig &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    zp->z_phys->zp_xattr == xattr_obj &&
		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
		VI_UNLOCK(vp);
	}

	if (delete_now) {
		/*
		 * Inline destruction: push any xattr dir onto the unlinked
		 * set, then delete this znode within the same transaction.
		 */
		if (zp->z_phys->zp_xattr) {
			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
			ASSERT3U(error, ==, 0);
			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
			dmu_buf_will_dirty(xzp->z_dbuf, tx);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_phys->zp_links = 0;
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);
			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
		}
		mutex_enter(&zp->z_lock);
		VI_LOCK(vp);
		vp->v_count--;
		ASSERT3U(vp->v_count, ==, 0);
		VI_UNLOCK(vp);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
	} else if (unlinked) {
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);

	if (!delete_now) {
		VN_RELE(vp);
	} else if (xzp) {
		/* this rele is delayed to prevent nesting transactions */
		VN_RELE(ZTOV(xzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 * IN:	dvp	- vnode of directory to add subdir to.
 *	dirname	- name of new directory.
 *	vap	- attributes of new directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created directory.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
    caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	zfs_acl_t	*aclp = NULL;
	zfs_fuid_info_t	*fuidp = NULL;
	int		zf = ZNEW;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
	    IS_EPHEMERAL(crgetgid(cr))))
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* No subdirectories inside extended attribute directories. */
	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	if (vap->va_mask & AT_XVATTR)
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

	/*
	 * First make sure the new directory doesn't exist.
	 */
top:
	*vpp = NULL;

	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (vsecp && aclp == NULL) {
		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
		if (error) {
			zfs_dirent_unlock(dl);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
	/*
	 * Add a new entry to the directory.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
	    IS_EPHEMERAL(crgetgid(cr))) {
		if (zfsvfs->z_fuid_obj == 0) {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
		} else {
			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
		}
	}
	if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, SPA_MAXBLOCKSIZE);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/* Drop the dirent lock before waiting, then retry. */
		zfs_dirent_unlock(dl);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		if (aclp)
			zfs_acl_free(aclp);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);

	if (aclp)
		zfs_acl_free(aclp);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap);

	if (fuidp)
		zfs_fuid_info_free(fuidp);
	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 * IN:	dvp	- vnode of directory to remove from.
 *	name	- name of directory to be removed.
 *	cwd	- vnode of current working directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zflg = ZEXISTS;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	if (vp == cwd) {
		error = EINVAL;
		goto out;
	}

	vnevent_rmdir(vp, dvp, name, ct);

	/*
	 * Grab a lock on the directory to make sure that noone is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/* Release everything in reverse order before retrying. */
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef FREEBSD_NAMECACHE
	cache_purge(dvp);
#endif

	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
#ifdef FREEBSD_NAMECACHE
	cache_purge(vp);
#endif
out:
	zfs_dirent_unlock(dl);

	VN_RELE(vp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as
many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure.
 *
 * IN:	vp	- vnode of directory to read.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *	eofp	- set to true if end-of-file detected.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 *
 * FreeBSD: ncookies/cookies, when non-NULL, receive a malloc'd array of
 * seek cookies (one per returned entry) for NFS-style restartable reads;
 * the array is freed here on error, otherwise ownership passes to the
 * caller.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;
	uint8_t		type;
	int		ncooks;
	u_long		*cooks = NULL;
	int		flags = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 * A kernel-space single-iovec uio is filled in place; anything else
	 * goes through a temporary buffer and uiomove().
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 */
		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
		*cookies = cooks;
		*ncookies = ncooks;
	}
	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
#ifdef TODO
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);
#else
	check_sysattrs = 0;
#endif

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_phys->zp_parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				/* ENOENT from the cursor means end of dir. */
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = ENXIO;
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);

			if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
#else
				panic("%s:%u: TODO", __func__, __LINE__);
#endif
			}
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = EINVAL;
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			odp->d_namlen = strlen(zap.za_name);
			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
			odp->d_type = type;
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0);

		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		/* Record the post-entry offset as this entry's seek cookie. */
		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	/* On failure the cookie array is ours to free, not the caller's. */
	if (error != 0 && cookies != NULL) {
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}

/* Per-thread fsync hint consumed via zfs_fsyncer_key TSD. */
ulong_t zfs_fsync_sync_cnt = 4;

/*
 * Flush all pending log records for this file to stable storage by
 * committing the ZIL up to the file's last itx.
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (0);
}


/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 * IN:	vp	- vnode of file.
 *	vap	- va_mask identifies requested attributes.
 *		  If AT_XVATTR set, then optional attrs are requested
 *	flags	- ATTR_NOACLCHECK (CIFS server context)
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vap	- attribute values.
 *
 * RETURN:	0 (always succeeds)
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_phys_t *pzp;
	int	error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t links;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ?
	    B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	pzp = zp->z_phys;

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
	    (pzp->zp_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	mutex_enter(&zp->z_lock);
	vap->va_type = IFTOVT(pzp->zp_mode);
	vap->va_mode = pzp->zp_mode & ~S_IFMT;
	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
//	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
	vap->va_nodeid = zp->z_id;
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = pzp->zp_links + 1;	/* account for the .zfs dir */
	else
		links = pzp->zp_links;
	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
	vap->va_size = pzp->zp_size;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((pzp->zp_flags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((pzp->zp_flags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((pzp->zp_flags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((pzp->zp_flags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((pzp->zp_flags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG &&
		    (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			/*
			 * Only VREG files have anti-virus scanstamps, so we
			 * won't conflict with symlinks in the bonus buffer.
			 */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			if (len <= doi.doi_bonus_size) {
				/*
				 * pzp points to the start of the
				 * znode_phys_t. pzp + 1 points to the
				 * first byte after the znode_phys_t.
				 */
				(void) memcpy(xoap->xoa_av_scanstamp,
				    pzp + 1,
				    sizeof (xoap->xoa_av_scanstamp));
				XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
			ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
			XVA_SET_RTN(xvap, XAT_CREATETIME);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);

	mutex_exit(&zp->z_lock);

	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 * IN:	vp	- vnode of file to be modified.
 *	vap	- new attribute values.
 *		  If AT_XVATTR set, then optional attrs are being set
 *	flags	- ATTR_UTIME set if non-default time values provided.
 *		- ATTR_NOACLCHECK (CIFS context only).
 *	cr	- credentials of caller.
 *		ct	- caller context
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime updated, mtime updated if size changed.
 */
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	znode_phys_t	*pzp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	vattr_t		oldva;
	uint_t		mask = vap->va_mask;
	uint_t		saved_mask;
	uint64_t	saved_mode;
	int		trim_mask = 0;
	uint64_t	new_mode;
	znode_t		*attrzp;
	int		need_policy = FALSE;
	int		err;
	zfs_fuid_info_t	*fuidp = NULL;
	xvattr_t	*xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t	*xoap;
	zfs_acl_t	*aclp = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	if (mask == 0)
		return (0);

	if (mask & AT_NOSET)
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	pzp = zp->z_phys;
	zilog = zfsvfs->z_log;

	/*
	 * Make sure that if we have ephemeral uid/gid or xvattr specified
	 * that file system is at proper version level
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
	    (mask & AT_XVATTR))) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	if (mask & AT_SIZE && vp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (EISDIR);
	}

	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * If this is an xvattr_t, then get a pointer to the structure of
	 * optional attributes.  If this is NULL, then we have a vattr_t.
	 */
	xoap = xva_getxoptattr(xvap);

	/*
	 * Immutable files can only alter immutable bit and atime
	 */
	if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	/*
	 * Verify timestamps doesn't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32bit syscalls can't
	 * handle times greater than 2039.  This check should be removed
	 * once large timestamps are fully supported.
	 */
	if (mask & (AT_ATIME | AT_MTIME)) {
		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
			ZFS_EXIT(zfsvfs);
			return (EOVERFLOW);
		}
	}

	/* Restart point when dmu_tx_assign() asks us to wait and retry. */
top:
	attrzp = NULL;

	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (EROFS);
	}

	/*
	 * First validate permissions
	 */

	if (mask & AT_SIZE) {
		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		/*
		 * XXX - Note, we are not providing any open
		 * mode flags here (like FNDELAY), so we may
		 * block if there are locks present... this
		 * should be addressed in openat().
		 */
		/* XXX - would it be OK to generate a log record here? */
		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	if (mask & (AT_ATIME|AT_MTIME) ||
	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
	    XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
		    skipaclchk, cr);

	if (mask & (AT_UID|AT_GID)) {
		int	idmask = (mask & (AT_UID|AT_GID));
		int	take_owner;
		int	take_group;

		/*
		 * NOTE: even if a new mode is being set,
		 * we may clear S_ISUID/S_ISGID bits.
		 */

		if (!(mask & AT_MODE))
			vap->va_mode = pzp->zp_mode;

		/*
		 * Take ownership or chgrp to group we are a member of
		 */

		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
		take_group = (mask & AT_GID) &&
		    zfs_groupmember(zfsvfs, vap->va_gid, cr);

		/*
		 * If both AT_UID and AT_GID are set then take_owner and
		 * take_group must both be set in order to allow taking
		 * ownership.
		 *
		 * Otherwise, send the check through secpolicy_vnode_setattr()
		 *
		 */

		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
		    ((idmask == AT_UID) && take_owner) ||
		    ((idmask == AT_GID) && take_group)) {
			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
			    skipaclchk, cr) == 0) {
				/*
				 * Remove setuid/setgid for non-privileged users
				 */
				secpolicy_setid_clear(vap, vp, cr);
				trim_mask = (mask & (AT_UID|AT_GID));
			} else {
				need_policy = TRUE;
			}
		} else {
			need_policy = TRUE;
		}
	}

	mutex_enter(&zp->z_lock);
	oldva.va_mode = pzp->zp_mode;
	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
	if (mask & AT_XVATTR) {
		/*
		 * Any request to change a read-only system flag forces a
		 * full policy check below.
		 */
		if ((need_policy == FALSE) &&
		    (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) &&
		    xoap->xoa_appendonly !=
		    ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) &&
		    xoap->xoa_nounlink !=
		    ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) &&
		    xoap->xoa_immutable !=
		    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_NODUMP) &&
		    xoap->xoa_nodump !=
		    ((pzp->zp_flags & ZFS_NODUMP) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) &&
		    xoap->xoa_av_modified !=
		    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) ||
		    ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
		    ((vp->v_type != VREG && xoap->xoa_av_quarantined) ||
		    xoap->xoa_av_quarantined !=
		    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) ||
		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
		    (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
			need_policy = TRUE;
		}
	}

	mutex_exit(&zp->z_lock);

	if (mask & AT_MODE) {
		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
			err = secpolicy_setid_setsticky_clear(vp, vap,
			    &oldva, cr);
			if (err) {
				ZFS_EXIT(zfsvfs);
				return (err);
			}
			trim_mask |= AT_MODE;
		} else {
			need_policy = TRUE;
		}
	}

	if (need_policy) {
		/*
		 * If trim_mask is set then take ownership
		 * has been granted or write_acl is present and user
		 * has the ability to modify mode.  In that case remove
		 * UID|GID and or MODE from mask so that
		 * secpolicy_vnode_setattr() doesn't revoke it.
		 */

		if (trim_mask) {
			saved_mask = vap->va_mask;
			vap->va_mask &= ~trim_mask;
			if (trim_mask & AT_MODE) {
				/*
				 * Save the mode, as secpolicy_vnode_setattr()
				 * will overwrite it with ova.va_mode.
				 */
				saved_mode = vap->va_mode;
			}
		}
		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}

		if (trim_mask) {
			vap->va_mask |= saved_mask;
			if (trim_mask & AT_MODE) {
				/*
				 * Recover the mode after
				 * secpolicy_vnode_setattr().
				 */
				vap->va_mode = saved_mode;
			}
		}
	}

	/*
	 * secpolicy_vnode_setattr, or take ownership may have
	 * changed va_mask
	 */
	mask = vap->va_mask;

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) {
		/* Ephemeral ids require FUID table updates in this tx. */
		if (zfsvfs->z_fuid_obj == 0) {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
		} else {
			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
		}
	}

	if (mask & AT_MODE) {
		uint64_t pmode = pzp->zp_mode;

		/* Preserve the file-type bits; only permission bits change. */
		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) {
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		if (pzp->zp_acl.z_acl_extern_obj) {
			/* Are we upgrading ACL from old V0 format to new V1 */
			if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
			    pzp->zp_acl.z_acl_version ==
			    ZFS_ACL_VERSION_INITIAL) {
				dmu_tx_hold_free(tx,
				    pzp->zp_acl.z_acl_extern_obj, 0,
				    DMU_OBJECT_END);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
				    0, aclp->z_acl_bytes);
			} else {
				dmu_tx_hold_write(tx,
				    pzp->zp_acl.z_acl_extern_obj, 0,
				    aclp->z_acl_bytes);
			}
		} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, aclp->z_acl_bytes);
		}
	}

	/* Owner changes must also be applied to the extended-attr dir. */
	if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) {
		err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
		if (err) {
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			if (aclp)
				zfs_acl_free(aclp);
			return (err);
		}
		dmu_tx_hold_bonus(tx, attrzp->z_id);
	}

	err = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (err) {
		if (attrzp)
			VN_RELE(ZTOV(attrzp));

		if (aclp) {
			zfs_acl_free(aclp);
			aclp = NULL;
		}

		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (err);
	}

	dmu_buf_will_dirty(zp->z_dbuf, tx);

	/*
	 * Set each attribute requested.
	 * We group settings according to the locks they need to acquire.
	 *
	 * Note: you cannot set ctime directly, although it will be
	 * updated as a side-effect of calling this function.
	 */

	mutex_enter(&zp->z_lock);

	if (mask & AT_MODE) {
		mutex_enter(&zp->z_acl_lock);
		zp->z_phys->zp_mode = new_mode;
		err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
		ASSERT3U(err, ==, 0);
		mutex_exit(&zp->z_acl_lock);
	}

	if (attrzp)
		mutex_enter(&attrzp->z_lock);

	if (mask & AT_UID) {
		pzp->zp_uid = zfs_fuid_create(zfsvfs,
		    vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
		if (attrzp) {
			attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs,
			    vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
		}
	}

	if (mask & AT_GID) {
		pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid,
		    cr, ZFS_GROUP, tx, &fuidp);
		if (attrzp)
			attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs,
			    vap->va_gid, cr, ZFS_GROUP, tx, &fuidp);
	}

	if (aclp)
		zfs_acl_free(aclp);

	if (attrzp)
		mutex_exit(&attrzp->z_lock);

	if (mask & AT_ATIME)
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);

	if (mask & AT_MTIME)
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);

	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
	if (mask & AT_SIZE)
		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
	else if (mask != 0)
		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
	/*
	 * Do this after setting timestamps to prevent timestamp
	 * update from toggling bit
	 */

	if (xoap && (mask & AT_XVATTR)) {
		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			ASSERT(vp->v_type == VREG);

			/* Grow the bonus buffer if necessary. */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			if (len > doi.doi_bonus_size)
				VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
		}
		zfs_xvattr_set(zp, xvap);
	}

	if (mask != 0)
		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

	if (fuidp)
		zfs_fuid_info_free(fuidp);
	mutex_exit(&zp->z_lock);

	if (attrzp)
		VN_RELE(ZTOV(attrzp));

	dmu_tx_commit(tx);

	ZFS_EXIT(zfsvfs);
	return (err);
}

/* One element of the lock chain built by zfs_rename_lock(). */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;

/*
 * Drop locks and release vnodes that were held by zfs_rename_lock().
 */
static void
zfs_rename_unlock(zfs_zlock_t **zlpp)
{
	zfs_zlock_t *zl;

	while ((zl = *zlpp) != NULL) {
		if (zl->zl_znode != NULL)
			VN_RELE(ZTOV(zl->zl_znode));
		rw_exit(zl->zl_rwlock);
		*zlpp = zl->zl_next;
		kmem_free(zl, sizeof (*zl));
	}
}

/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	*oidp = &zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = &zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (*oidp == szp->z_id)		/* We're a descendant of szp */
			return (EINVAL);

		if (*oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
			if (error)
				return (error);
			zl->zl_znode = zp;
		}
		oidp = &zp->z_phys->zp_parent;
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdvp	- Source directory containing the "old entry".
 *		snm	- Old entry name.
 *		tdvp	- Target directory to contain the "new entry".
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr;
	int		error = 0;
	int		zflg = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(sdzp);
	zilog = zfsvfs->z_log;

	/*
	 * Make sure we have the real vp for the target directory.
	 */
	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
		tdvp = realvp;

	/* Cross-filesystem renames are not supported. */
	if (tdvp->v_vfsp != sdvp->v_vfsp) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}

	tdzp = VTOZ(tdvp);
	ZFS_VERIFY_ZP(tdzp);
	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

	/* Restart point when dmu_tx_assign() asks us to wait and retry. */
top:
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		/*
		 * First compare the two name arguments without
		 * considering any case folding.
		 */
		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
		ASSERT(error == 0 || !zfsvfs->z_utf8);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
		/*
		 * If the file system is case-folding, then we may
		 * have some more checking to do.  A case-folding file
		 * system is either supporting mixed case sensitivity
		 * access or is completely case-insensitive.  Note
		 * that the file system is always case preserving.
		 *
		 * In mixed sensitivity mode case sensitive behavior
		 * is the default.  FIGNORECASE must be used to
		 * explicitly request case insensitive behavior.
		 *
		 * If the source and target names provided differ only
		 * by case (e.g., a request to rename 'tim' to 'Tim'),
		 * we will treat this as a special case in the
		 * case-insensitive mode: as long as the source name
		 * is an exact match, we will allow this to proceed as
		 * a name-change request.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
		    flags & FIGNORECASE)) &&
		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
		    &error) == 0) {
			/*
			 * case preserving rename request, require exact
			 * name matches
			 */
			zflg |= ZCIEXACT;
			zflg &= ~ZCILOOK;
		}
	}

	/*
	 * If the source and destination directories are the same, we should
	 * grab the z_name_lock of that directory only once.
	 */
	if (sdzp == tdzp) {
		zflg |= ZHAVELOCK;
		rw_enter(&sdzp->z_name_lock, RW_READER);
	}

	if (cmp < 0) {
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
		    ZEXISTS | zflg, NULL, NULL);
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
	} else {
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, zflg, NULL, NULL);
		serr = zfs_dirent_lock(&sdl,
		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
		    NULL, NULL);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
			serr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		if (strcmp(tnm, "..") == 0)
			terr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = ENOTDIR;
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
	if (tzp)
		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp)
		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
	if (tzp)
		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		if (zl != NULL)
			zfs_rename_unlock(&zl);
		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

	if (error == 0) {
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;

			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			ASSERT(error == 0);

			zfs_log_rename(zilog, tx,
			    TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
			    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);

			/* Update path information for the target vnode */
			vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
		}
#ifdef FREEBSD_NAMECACHE
		if (error == 0) {
			cache_purge(sdvp);
			cache_purge(tdvp);
		}
#endif
	}

	dmu_tx_commit(tx);
out:
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	if (sdzp == tdzp)
		rw_exit(&sdzp->z_name_lock);

	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 *	IN:	dvp	- Directory to contain new symbolic link.
 *		link	- Name for new symlink entry.
 *		vap	- Attributes of new entry.
 *		target	- Target path of new symlink.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
    cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	int		len = strlen(link);
	int		error;
	int		zflg = ZNEW;
	zfs_fuid_info_t	*fuidp = NULL;
	/*
	 * NOTE(review): flags is a local fixed at 0 here (the FreeBSD entry
	 * point takes no case flags), so the FIGNORECASE tests below are
	 * always false on this platform.
	 */
	int		flags = 0;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
	/* Restart point when dmu_tx_assign() asks us to wait and retry. */
top:
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (ENAMETOOLONG);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_bonus(tx, dzp->z_id);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
	if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
		/* Ephemeral ids require FUID table updates in this tx. */
		if (zfsvfs->z_fuid_obj == 0) {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
		} else {
			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
		}
	}
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	dmu_buf_will_dirty(dzp->z_dbuf, tx);

	/*
	 * Create a new object for the symlink.
	 * Put the link content into bonus buffer if it will fit;
	 * otherwise, store it just like any other file data.
	 */
	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
		/* Short target: stored inline after the znode_phys_t. */
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
		if (len != 0)
			bcopy(link, zp->z_phys + 1, len);
	} else {
		dmu_buf_t *dbp;

		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
		/*
		 * Nothing can access the znode yet so no locking needed
		 * for growing the znode's blocksize.
		 */
		zfs_grow_blocksize(zp, len, tx);

		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
		    zp->z_id, 0, FTAG, &dbp));
		dmu_buf_will_dirty(dbp, tx);

		ASSERT3U(len, <=, dbp->db_size);
		bcopy(link, dbp->db_data, len);
		dmu_buf_rele(dbp, FTAG);
	}
	zp->z_phys->zp_size = len;

	/*
	 * Insert the new object into the directory.
	 * NOTE(review): the zfs_link_create() return value is discarded
	 * here; no goto targets the label below, it only marks the start
	 * of the common completion path.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);
out:
	if (error == 0) {
		uint64_t txtype = TX_SYMLINK;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
		*vpp = ZTOV(zp);
	}
	if (fuidp)
		zfs_fuid_info_free(fuidp);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 *	IN:	vp	- vnode of symbolic link.
 *		uio	- structure to contain the link path.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- structure to contain the link path.
3562 * 3563 * RETURN: 0 if success 3564 * error code if failure 3565 * 3566 * Timestamps: 3567 * vp - atime updated 3568 */ 3569/* ARGSUSED */ 3570static int 3571zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 3572{ 3573 znode_t *zp = VTOZ(vp); 3574 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3575 size_t bufsz; 3576 int error; 3577 3578 ZFS_ENTER(zfsvfs); 3579 ZFS_VERIFY_ZP(zp); 3580 3581 bufsz = (size_t)zp->z_phys->zp_size; 3582 if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { 3583 error = uiomove(zp->z_phys + 1, 3584 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 3585 } else { 3586 dmu_buf_t *dbp; 3587 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); 3588 if (error) { 3589 ZFS_EXIT(zfsvfs); 3590 return (error); 3591 } 3592 error = uiomove(dbp->db_data, 3593 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 3594 dmu_buf_rele(dbp, FTAG); 3595 } 3596 3597 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 3598 ZFS_EXIT(zfsvfs); 3599 return (error); 3600} 3601 3602/* 3603 * Insert a new entry into directory tdvp referencing svp. 3604 * 3605 * IN: tdvp - Directory to contain new entry. 3606 * svp - vnode of new entry. 3607 * name - name of new entry. 3608 * cr - credentials of caller. 
3609 * ct - caller context 3610 * 3611 * RETURN: 0 if success 3612 * error code if failure 3613 * 3614 * Timestamps: 3615 * tdvp - ctime|mtime updated 3616 * svp - ctime updated 3617 */ 3618/* ARGSUSED */ 3619static int 3620zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 3621 caller_context_t *ct, int flags) 3622{ 3623 znode_t *dzp = VTOZ(tdvp); 3624 znode_t *tzp, *szp; 3625 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3626 zilog_t *zilog; 3627 zfs_dirlock_t *dl; 3628 dmu_tx_t *tx; 3629 vnode_t *realvp; 3630 int error; 3631 int zf = ZNEW; 3632 uid_t owner; 3633 3634 ASSERT(tdvp->v_type == VDIR); 3635 3636 ZFS_ENTER(zfsvfs); 3637 ZFS_VERIFY_ZP(dzp); 3638 zilog = zfsvfs->z_log; 3639 3640 if (VOP_REALVP(svp, &realvp, ct) == 0) 3641 svp = realvp; 3642 3643 if (svp->v_vfsp != tdvp->v_vfsp) { 3644 ZFS_EXIT(zfsvfs); 3645 return (EXDEV); 3646 } 3647 szp = VTOZ(svp); 3648 ZFS_VERIFY_ZP(szp); 3649 3650 if (zfsvfs->z_utf8 && u8_validate(name, 3651 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3652 ZFS_EXIT(zfsvfs); 3653 return (EILSEQ); 3654 } 3655 if (flags & FIGNORECASE) 3656 zf |= ZCILOOK; 3657 3658top: 3659 /* 3660 * We do not support links between attributes and non-attributes 3661 * because of the potential security risk of creating links 3662 * into "normal" file space in order to circumvent restrictions 3663 * imposed in attribute space. 3664 */ 3665 if ((szp->z_phys->zp_flags & ZFS_XATTR) != 3666 (dzp->z_phys->zp_flags & ZFS_XATTR)) { 3667 ZFS_EXIT(zfsvfs); 3668 return (EINVAL); 3669 } 3670 3671 /* 3672 * POSIX dictates that we return EPERM here. 3673 * Better choices include ENOTSUP or EISDIR. 
3674 */ 3675 if (svp->v_type == VDIR) { 3676 ZFS_EXIT(zfsvfs); 3677 return (EPERM); 3678 } 3679 3680 owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); 3681 if (owner != crgetuid(cr) && 3682 secpolicy_basic_link(svp, cr) != 0) { 3683 ZFS_EXIT(zfsvfs); 3684 return (EPERM); 3685 } 3686 3687 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 3688 ZFS_EXIT(zfsvfs); 3689 return (error); 3690 } 3691 3692 /* 3693 * Attempt to lock directory; fail if entry already exists. 3694 */ 3695 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); 3696 if (error) { 3697 ZFS_EXIT(zfsvfs); 3698 return (error); 3699 } 3700 3701 tx = dmu_tx_create(zfsvfs->z_os); 3702 dmu_tx_hold_bonus(tx, szp->z_id); 3703 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3704 error = dmu_tx_assign(tx, zfsvfs->z_assign); 3705 if (error) { 3706 zfs_dirent_unlock(dl); 3707 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 3708 dmu_tx_wait(tx); 3709 dmu_tx_abort(tx); 3710 goto top; 3711 } 3712 dmu_tx_abort(tx); 3713 ZFS_EXIT(zfsvfs); 3714 return (error); 3715 } 3716 3717 error = zfs_link_create(dl, szp, tx, 0); 3718 3719 if (error == 0) { 3720 uint64_t txtype = TX_LINK; 3721 if (flags & FIGNORECASE) 3722 txtype |= TX_CI; 3723 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 3724 } 3725 3726 dmu_tx_commit(tx); 3727 3728 zfs_dirent_unlock(dl); 3729 3730 if (error == 0) { 3731 vnevent_link(svp, ct); 3732 } 3733 3734 ZFS_EXIT(zfsvfs); 3735 return (error); 3736} 3737 3738/*ARGSUSED*/ 3739void 3740zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 3741{ 3742 znode_t *zp = VTOZ(vp); 3743 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3744 int error; 3745 3746 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 3747 if (zp->z_dbuf == NULL) { 3748 /* 3749 * The fs has been unmounted, or we did a 3750 * suspend/resume and this file no longer exists. 
3751 */ 3752 VI_LOCK(vp); 3753 vp->v_count = 0; /* count arrives as 1 */ 3754 VI_UNLOCK(vp); 3755 vrecycle(vp, curthread); 3756 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3757 return; 3758 } 3759 3760 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 3761 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 3762 3763 dmu_tx_hold_bonus(tx, zp->z_id); 3764 error = dmu_tx_assign(tx, TXG_WAIT); 3765 if (error) { 3766 dmu_tx_abort(tx); 3767 } else { 3768 dmu_buf_will_dirty(zp->z_dbuf, tx); 3769 mutex_enter(&zp->z_lock); 3770 zp->z_atime_dirty = 0; 3771 mutex_exit(&zp->z_lock); 3772 dmu_tx_commit(tx); 3773 } 3774 } 3775 3776 zfs_zinactive(zp); 3777 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3778} 3779 3780CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); 3781CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); 3782 3783/*ARGSUSED*/ 3784static int 3785zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 3786{ 3787 znode_t *zp = VTOZ(vp); 3788 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3789 uint32_t gen; 3790 uint64_t object = zp->z_id; 3791 zfid_short_t *zfid; 3792 int size, i; 3793 3794 ZFS_ENTER(zfsvfs); 3795 ZFS_VERIFY_ZP(zp); 3796 gen = (uint32_t)zp->z_gen; 3797 3798 size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; 3799 fidp->fid_len = size; 3800 3801 zfid = (zfid_short_t *)fidp; 3802 3803 zfid->zf_len = size; 3804 3805 for (i = 0; i < sizeof (zfid->zf_object); i++) 3806 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 3807 3808 /* Must have a non-zero generation number to distinguish from .zfs */ 3809 if (gen == 0) 3810 gen = 1; 3811 for (i = 0; i < sizeof (zfid->zf_gen); i++) 3812 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 3813 3814 if (size == LONG_FID_LEN) { 3815 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 3816 zfid_long_t *zlfid; 3817 3818 zlfid = (zfid_long_t *)fidp; 3819 3820 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 3821 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 3822 3823 /* XXX - this should be the generation number for the objset */ 3824 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 3825 zlfid->zf_setgen[i] = 0; 3826 } 3827 3828 ZFS_EXIT(zfsvfs); 3829 return (0); 3830} 3831 3832static int 3833zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 3834 caller_context_t *ct) 3835{ 3836 znode_t *zp, *xzp; 3837 zfsvfs_t *zfsvfs; 3838 zfs_dirlock_t *dl; 3839 int error; 3840 3841 switch (cmd) { 3842 case _PC_LINK_MAX: 3843 *valp = INT_MAX; 3844 return (0); 3845 3846 case _PC_FILESIZEBITS: 3847 *valp = 64; 3848 return (0); 3849 3850#if 0 3851 case _PC_XATTR_EXISTS: 3852 zp = VTOZ(vp); 3853 zfsvfs = zp->z_zfsvfs; 3854 ZFS_ENTER(zfsvfs); 3855 ZFS_VERIFY_ZP(zp); 3856 *valp = 0; 3857 error = zfs_dirent_lock(&dl, zp, "", &xzp, 3858 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); 3859 if (error == 0) { 3860 zfs_dirent_unlock(dl); 3861 if (!zfs_dirempty(xzp)) 3862 *valp = 1; 3863 VN_RELE(ZTOV(xzp)); 3864 } else if (error == ENOENT) { 3865 /* 3866 * If there aren't extended attributes, it's the 3867 * same as having zero of them. 
3868 */ 3869 error = 0; 3870 } 3871 ZFS_EXIT(zfsvfs); 3872 return (error); 3873#endif 3874 3875 case _PC_ACL_EXTENDED: 3876 *valp = 0; 3877 return (0); 3878 3879 case _PC_ACL_NFS4: 3880 *valp = 1; 3881 return (0); 3882 3883 case _PC_ACL_PATH_MAX: 3884 *valp = ACL_MAX_ENTRIES; 3885 return (0); 3886 3887 case _PC_MIN_HOLE_SIZE: 3888 *valp = (int)SPA_MINBLOCKSIZE; 3889 return (0); 3890 3891 default: 3892 return (EOPNOTSUPP); 3893 } 3894} 3895 3896/*ARGSUSED*/ 3897static int 3898zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 3899 caller_context_t *ct) 3900{ 3901 znode_t *zp = VTOZ(vp); 3902 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3903 int error; 3904 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 3905 3906 ZFS_ENTER(zfsvfs); 3907 ZFS_VERIFY_ZP(zp); 3908 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 3909 ZFS_EXIT(zfsvfs); 3910 3911 return (error); 3912} 3913 3914/*ARGSUSED*/ 3915static int 3916zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 3917 caller_context_t *ct) 3918{ 3919 znode_t *zp = VTOZ(vp); 3920 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3921 int error; 3922 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; 3923 3924 ZFS_ENTER(zfsvfs); 3925 ZFS_VERIFY_ZP(zp); 3926 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 3927 ZFS_EXIT(zfsvfs); 3928 return (error); 3929} 3930 3931static int 3932zfs_freebsd_open(ap) 3933 struct vop_open_args /* { 3934 struct vnode *a_vp; 3935 int a_mode; 3936 struct ucred *a_cred; 3937 struct thread *a_td; 3938 } */ *ap; 3939{ 3940 vnode_t *vp = ap->a_vp; 3941 znode_t *zp = VTOZ(vp); 3942 int error; 3943 3944 error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); 3945 if (error == 0) 3946 vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td); 3947 return (error); 3948} 3949 3950static int 3951zfs_freebsd_close(ap) 3952 struct vop_close_args /* { 3953 struct vnode *a_vp; 3954 int a_fflag; 3955 struct ucred *a_cred; 3956 struct thread *a_td; 3957 } */ *ap; 3958{ 3959 3960 return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL)); 3961} 3962 3963static int 3964zfs_freebsd_ioctl(ap) 3965 struct vop_ioctl_args /* { 3966 struct vnode *a_vp; 3967 u_long a_command; 3968 caddr_t a_data; 3969 int a_fflag; 3970 struct ucred *cred; 3971 struct thread *td; 3972 } */ *ap; 3973{ 3974 3975 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, 3976 ap->a_fflag, ap->a_cred, NULL, NULL)); 3977} 3978 3979static int 3980zfs_freebsd_read(ap) 3981 struct vop_read_args /* { 3982 struct vnode *a_vp; 3983 struct uio *a_uio; 3984 int a_ioflag; 3985 struct ucred *a_cred; 3986 } */ *ap; 3987{ 3988 3989 return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); 3990} 3991 3992static int 3993zfs_freebsd_write(ap) 3994 struct vop_write_args /* { 3995 struct vnode *a_vp; 3996 struct uio *a_uio; 3997 int a_ioflag; 3998 struct ucred *a_cred; 3999 } */ *ap; 4000{ 4001 4002 if (vn_rlimit_fsize(ap->a_vp, ap->a_uio, ap->a_uio->uio_td)) 4003 return (EFBIG); 4004 4005 return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); 4006} 4007 4008static int 4009zfs_freebsd_access(ap) 4010 struct vop_access_args /* { 4011 struct 
vnode *a_vp; 4012 accmode_t a_accmode; 4013 struct ucred *a_cred; 4014 struct thread *a_td; 4015 } */ *ap; 4016{ 4017 accmode_t accmode; 4018 int error = 0; 4019 4020 /* 4021 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, 4022 */ 4023 accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); 4024 if (accmode != 0) 4025 error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); 4026 4027 /* 4028 * VADMIN has to be handled by vaccess(). 4029 */ 4030 if (error == 0) { 4031 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); 4032 if (accmode != 0) { 4033 vnode_t *vp = ap->a_vp; 4034 znode_t *zp = VTOZ(vp); 4035 znode_phys_t *zphys = zp->z_phys; 4036 4037 error = vaccess(vp->v_type, zphys->zp_mode, 4038 zphys->zp_uid, zphys->zp_gid, accmode, ap->a_cred, 4039 NULL); 4040 } 4041 } 4042 4043 return (error); 4044} 4045 4046static int 4047zfs_freebsd_lookup(ap) 4048 struct vop_lookup_args /* { 4049 struct vnode *a_dvp; 4050 struct vnode **a_vpp; 4051 struct componentname *a_cnp; 4052 } */ *ap; 4053{ 4054 struct componentname *cnp = ap->a_cnp; 4055 char nm[NAME_MAX + 1]; 4056 4057 ASSERT(cnp->cn_namelen < sizeof(nm)); 4058 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); 4059 4060 return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, 4061 cnp->cn_cred, cnp->cn_thread, 0)); 4062} 4063 4064static int 4065zfs_freebsd_create(ap) 4066 struct vop_create_args /* { 4067 struct vnode *a_dvp; 4068 struct vnode **a_vpp; 4069 struct componentname *a_cnp; 4070 struct vattr *a_vap; 4071 } */ *ap; 4072{ 4073 struct componentname *cnp = ap->a_cnp; 4074 vattr_t *vap = ap->a_vap; 4075 int mode; 4076 4077 ASSERT(cnp->cn_flags & SAVENAME); 4078 4079 vattr_init_mask(vap); 4080 mode = vap->va_mode & ALLPERMS; 4081 4082 return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, 4083 ap->a_vpp, cnp->cn_cred, cnp->cn_thread)); 4084} 4085 4086static int 4087zfs_freebsd_remove(ap) 4088 struct vop_remove_args /* { 4089 struct vnode *a_dvp; 4090 
struct vnode *a_vp; 4091 struct componentname *a_cnp; 4092 } */ *ap; 4093{ 4094 4095 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 4096 4097 return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr, 4098 ap->a_cnp->cn_cred, NULL, 0)); 4099} 4100 4101static int 4102zfs_freebsd_mkdir(ap) 4103 struct vop_mkdir_args /* { 4104 struct vnode *a_dvp; 4105 struct vnode **a_vpp; 4106 struct componentname *a_cnp; 4107 struct vattr *a_vap; 4108 } */ *ap; 4109{ 4110 vattr_t *vap = ap->a_vap; 4111 4112 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 4113 4114 vattr_init_mask(vap); 4115 4116 return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, 4117 ap->a_cnp->cn_cred, NULL, 0, NULL)); 4118} 4119 4120static int 4121zfs_freebsd_rmdir(ap) 4122 struct vop_rmdir_args /* { 4123 struct vnode *a_dvp; 4124 struct vnode *a_vp; 4125 struct componentname *a_cnp; 4126 } */ *ap; 4127{ 4128 struct componentname *cnp = ap->a_cnp; 4129 4130 ASSERT(cnp->cn_flags & SAVENAME); 4131 4132 return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0)); 4133} 4134 4135static int 4136zfs_freebsd_readdir(ap) 4137 struct vop_readdir_args /* { 4138 struct vnode *a_vp; 4139 struct uio *a_uio; 4140 struct ucred *a_cred; 4141 int *a_eofflag; 4142 int *a_ncookies; 4143 u_long **a_cookies; 4144 } */ *ap; 4145{ 4146 4147 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, 4148 ap->a_ncookies, ap->a_cookies)); 4149} 4150 4151static int 4152zfs_freebsd_fsync(ap) 4153 struct vop_fsync_args /* { 4154 struct vnode *a_vp; 4155 int a_waitfor; 4156 struct thread *a_td; 4157 } */ *ap; 4158{ 4159 4160 vop_stdfsync(ap); 4161 return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); 4162} 4163 4164static int 4165zfs_freebsd_getattr(ap) 4166 struct vop_getattr_args /* { 4167 struct vnode *a_vp; 4168 struct vattr *a_vap; 4169 struct ucred *a_cred; 4170 struct thread *a_td; 4171 } */ *ap; 4172{ 4173 vattr_t *vap = ap->a_vap; 4174 xvattr_t xvap; 4175 u_long fflags = 0; 4176 int error; 4177 4178 
xva_init(&xvap); 4179 xvap.xva_vattr = *vap; 4180 xvap.xva_vattr.va_mask |= AT_XVATTR; 4181 4182 /* Convert chflags into ZFS-type flags. */ 4183 /* XXX: what about SF_SETTABLE?. */ 4184 XVA_SET_REQ(&xvap, XAT_IMMUTABLE); 4185 XVA_SET_REQ(&xvap, XAT_APPENDONLY); 4186 XVA_SET_REQ(&xvap, XAT_NOUNLINK); 4187 XVA_SET_REQ(&xvap, XAT_NODUMP); 4188 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); 4189 if (error != 0) 4190 return (error); 4191 4192 /* Convert ZFS xattr into chflags. */ 4193#define FLAG_CHECK(fflag, xflag, xfield) do { \ 4194 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ 4195 fflags |= (fflag); \ 4196} while (0) 4197 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, 4198 xvap.xva_xoptattrs.xoa_immutable); 4199 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, 4200 xvap.xva_xoptattrs.xoa_appendonly); 4201 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, 4202 xvap.xva_xoptattrs.xoa_nounlink); 4203 FLAG_CHECK(UF_NODUMP, XAT_NODUMP, 4204 xvap.xva_xoptattrs.xoa_nodump); 4205#undef FLAG_CHECK 4206 *vap = xvap.xva_vattr; 4207 vap->va_flags = fflags; 4208 return (0); 4209} 4210 4211static int 4212zfs_freebsd_setattr(ap) 4213 struct vop_setattr_args /* { 4214 struct vnode *a_vp; 4215 struct vattr *a_vap; 4216 struct ucred *a_cred; 4217 struct thread *a_td; 4218 } */ *ap; 4219{ 4220 vnode_t *vp = ap->a_vp; 4221 vattr_t *vap = ap->a_vap; 4222 cred_t *cred = ap->a_cred; 4223 xvattr_t xvap; 4224 u_long fflags; 4225 uint64_t zflags; 4226 4227 vattr_init_mask(vap); 4228 vap->va_mask &= ~AT_NOSET; 4229 4230 xva_init(&xvap); 4231 xvap.xva_vattr = *vap; 4232 4233 zflags = VTOZ(vp)->z_phys->zp_flags; 4234 4235 if (vap->va_flags != VNOVAL) { 4236 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; 4237 int error; 4238 4239 if (zfsvfs->z_use_fuids == B_FALSE) 4240 return (EOPNOTSUPP); 4241 4242 fflags = vap->va_flags; 4243 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0) 4244 return (EOPNOTSUPP); 4245 /* 4246 * Unprivileged processes are not permitted to unset system 4247 * 
flags, or modify flags if any system flags are set. 4248 * Privileged non-jail processes may not modify system flags 4249 * if securelevel > 0 and any existing system flags are set. 4250 * Privileged jail processes behave like privileged non-jail 4251 * processes if the security.jail.chflags_allowed sysctl is 4252 * is non-zero; otherwise, they behave like unprivileged 4253 * processes. 4254 */ 4255 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || 4256 priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { 4257 if (zflags & 4258 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 4259 error = securelevel_gt(cred, 0); 4260 if (error != 0) 4261 return (error); 4262 } 4263 } else { 4264 /* 4265 * Callers may only modify the file flags on objects they 4266 * have VADMIN rights for. 4267 */ 4268 if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) 4269 return (error); 4270 if (zflags & 4271 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 4272 return (EPERM); 4273 } 4274 if (fflags & 4275 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { 4276 return (EPERM); 4277 } 4278 } 4279 4280#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ 4281 if (((fflags & (fflag)) && !(zflags & (zflag))) || \ 4282 ((zflags & (zflag)) && !(fflags & (fflag)))) { \ 4283 XVA_SET_REQ(&xvap, (xflag)); \ 4284 (xfield) = ((fflags & (fflag)) != 0); \ 4285 } \ 4286} while (0) 4287 /* Convert chflags into ZFS-type flags. */ 4288 /* XXX: what about SF_SETTABLE?. 
*/ 4289 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, 4290 xvap.xva_xoptattrs.xoa_immutable); 4291 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, 4292 xvap.xva_xoptattrs.xoa_appendonly); 4293 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, 4294 xvap.xva_xoptattrs.xoa_nounlink); 4295 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, 4296 xvap.xva_xoptattrs.xoa_nodump); 4297#undef FLAG_CHANGE 4298 } 4299 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); 4300} 4301 4302static int 4303zfs_freebsd_rename(ap) 4304 struct vop_rename_args /* { 4305 struct vnode *a_fdvp; 4306 struct vnode *a_fvp; 4307 struct componentname *a_fcnp; 4308 struct vnode *a_tdvp; 4309 struct vnode *a_tvp; 4310 struct componentname *a_tcnp; 4311 } */ *ap; 4312{ 4313 vnode_t *fdvp = ap->a_fdvp; 4314 vnode_t *fvp = ap->a_fvp; 4315 vnode_t *tdvp = ap->a_tdvp; 4316 vnode_t *tvp = ap->a_tvp; 4317 int error; 4318 4319 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); 4320 ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); 4321 4322 error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp, 4323 ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0); 4324 4325 if (tdvp == tvp) 4326 VN_RELE(tdvp); 4327 else 4328 VN_URELE(tdvp); 4329 if (tvp) 4330 VN_URELE(tvp); 4331 VN_RELE(fdvp); 4332 VN_RELE(fvp); 4333 4334 return (error); 4335} 4336 4337static int 4338zfs_freebsd_symlink(ap) 4339 struct vop_symlink_args /* { 4340 struct vnode *a_dvp; 4341 struct vnode **a_vpp; 4342 struct componentname *a_cnp; 4343 struct vattr *a_vap; 4344 char *a_target; 4345 } */ *ap; 4346{ 4347 struct componentname *cnp = ap->a_cnp; 4348 vattr_t *vap = ap->a_vap; 4349 4350 ASSERT(cnp->cn_flags & SAVENAME); 4351 4352 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. 
*/ 4353 vattr_init_mask(vap); 4354 4355 return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, 4356 ap->a_target, cnp->cn_cred, cnp->cn_thread)); 4357} 4358 4359static int 4360zfs_freebsd_readlink(ap) 4361 struct vop_readlink_args /* { 4362 struct vnode *a_vp; 4363 struct uio *a_uio; 4364 struct ucred *a_cred; 4365 } */ *ap; 4366{ 4367 4368 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); 4369} 4370 4371static int 4372zfs_freebsd_link(ap) 4373 struct vop_link_args /* { 4374 struct vnode *a_tdvp; 4375 struct vnode *a_vp; 4376 struct componentname *a_cnp; 4377 } */ *ap; 4378{ 4379 struct componentname *cnp = ap->a_cnp; 4380 4381 ASSERT(cnp->cn_flags & SAVENAME); 4382 4383 return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); 4384} 4385 4386static int 4387zfs_freebsd_inactive(ap) 4388 struct vop_inactive_args /* { 4389 struct vnode *a_vp; 4390 struct thread *a_td; 4391 } */ *ap; 4392{ 4393 vnode_t *vp = ap->a_vp; 4394 4395 zfs_inactive(vp, ap->a_td->td_ucred, NULL); 4396 return (0); 4397} 4398 4399static void 4400zfs_reclaim_complete(void *arg, int pending) 4401{ 4402 znode_t *zp = arg; 4403 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4404 4405 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4406 if (zp->z_dbuf != NULL) { 4407 ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); 4408 zfs_znode_dmu_fini(zp); 4409 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); 4410 } 4411 zfs_znode_free(zp); 4412 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4413 /* 4414 * If the file system is being unmounted, there is a process waiting 4415 * for us, wake it up. 
4416 */ 4417 if (zfsvfs->z_unmounted) 4418 wakeup_one(zfsvfs); 4419} 4420 4421static int 4422zfs_freebsd_reclaim(ap) 4423 struct vop_reclaim_args /* { 4424 struct vnode *a_vp; 4425 struct thread *a_td; 4426 } */ *ap; 4427{ 4428 vnode_t *vp = ap->a_vp; 4429 znode_t *zp = VTOZ(vp); 4430 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4431 4432 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4433 4434 ASSERT(zp != NULL); 4435 4436 /* 4437 * Destroy the vm object and flush associated pages. 4438 */ 4439 vnode_destroy_vobject(vp); 4440 4441 mutex_enter(&zp->z_lock); 4442 ASSERT(zp->z_phys != NULL); 4443 zp->z_vnode = NULL; 4444 mutex_exit(&zp->z_lock); 4445 4446 if (zp->z_unlinked) 4447 ; /* Do nothing. */ 4448 else if (zp->z_dbuf == NULL) 4449 zfs_znode_free(zp); 4450 else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ { 4451 int locked; 4452 4453 locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 : 4454 ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id); 4455 if (locked == 0) { 4456 /* 4457 * Lock can't be obtained due to deadlock possibility, 4458 * so defer znode destruction. 
4459 */ 4460 TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp); 4461 taskqueue_enqueue(taskqueue_thread, &zp->z_task); 4462 } else { 4463 zfs_znode_dmu_fini(zp); 4464 if (locked == 1) 4465 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); 4466 zfs_znode_free(zp); 4467 } 4468 } 4469 VI_LOCK(vp); 4470 vp->v_data = NULL; 4471 ASSERT(vp->v_holdcnt >= 1); 4472 VI_UNLOCK(vp); 4473 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4474 return (0); 4475} 4476 4477static int 4478zfs_freebsd_fid(ap) 4479 struct vop_fid_args /* { 4480 struct vnode *a_vp; 4481 struct fid *a_fid; 4482 } */ *ap; 4483{ 4484 4485 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); 4486} 4487 4488static int 4489zfs_freebsd_pathconf(ap) 4490 struct vop_pathconf_args /* { 4491 struct vnode *a_vp; 4492 int a_name; 4493 register_t *a_retval; 4494 } */ *ap; 4495{ 4496 ulong_t val; 4497 int error; 4498 4499 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); 4500 if (error == 0) 4501 *ap->a_retval = val; 4502 else if (error == EOPNOTSUPP) 4503 error = vop_stdpathconf(ap); 4504 return (error); 4505} 4506 4507static int 4508zfs_freebsd_fifo_pathconf(ap) 4509 struct vop_pathconf_args /* { 4510 struct vnode *a_vp; 4511 int a_name; 4512 register_t *a_retval; 4513 } */ *ap; 4514{ 4515 4516 switch (ap->a_name) { 4517 case _PC_ACL_EXTENDED: 4518 case _PC_ACL_NFS4: 4519 case _PC_ACL_PATH_MAX: 4520 case _PC_MAC_PRESENT: 4521 return (zfs_freebsd_pathconf(ap)); 4522 default: 4523 return (fifo_specops.vop_pathconf(ap)); 4524 } 4525} 4526 4527/* 4528 * FreeBSD's extended attributes namespace defines file name prefix for ZFS' 4529 * extended attribute name: 4530 * 4531 * NAMESPACE PREFIX 4532 * system freebsd:system: 4533 * user (none, can be used to access ZFS fsattr(5) attributes 4534 * created on Solaris) 4535 */ 4536static int 4537zfs_create_attrname(int attrnamespace, const char *name, char *attrname, 4538 size_t size) 4539{ 4540 const char *namespace, *prefix, *suffix; 4541 4542 /* We don't allow '/' 
character in attribute name. */ 4543 if (strchr(name, '/') != NULL) 4544 return (EINVAL); 4545 /* We don't allow attribute names that start with "freebsd:" string. */ 4546 if (strncmp(name, "freebsd:", 8) == 0) 4547 return (EINVAL); 4548 4549 bzero(attrname, size); 4550 4551 switch (attrnamespace) { 4552 case EXTATTR_NAMESPACE_USER: 4553#if 0 4554 prefix = "freebsd:"; 4555 namespace = EXTATTR_NAMESPACE_USER_STRING; 4556 suffix = ":"; 4557#else 4558 /* 4559 * This is the default namespace by which we can access all 4560 * attributes created on Solaris. 4561 */ 4562 prefix = namespace = suffix = ""; 4563#endif 4564 break; 4565 case EXTATTR_NAMESPACE_SYSTEM: 4566 prefix = "freebsd:"; 4567 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; 4568 suffix = ":"; 4569 break; 4570 case EXTATTR_NAMESPACE_EMPTY: 4571 default: 4572 return (EINVAL); 4573 } 4574 if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, 4575 name) >= size) { 4576 return (ENAMETOOLONG); 4577 } 4578 return (0); 4579} 4580 4581/* 4582 * Vnode operating to retrieve a named extended attribute. 
 */
/*
 * Vnode operation to retrieve a named extended attribute.
 *
 * ZFS stores each extended attribute as a regular file inside the
 * object's hidden extended attribute directory, so this is implemented
 * as: look up the xattr directory, vn_open() the attribute file by its
 * mapped name, then either stat it (size query) or read it.
 */
static int
zfs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrname[255];
	struct vattr va;
	vnode_t *xvp = NULL, *vp;
	int error, flags;

	/* Reading an attribute requires read access to the file itself. */
	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);

	/* Map <namespace, name> onto the on-disk attribute file name. */
	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	    sizeof(attrname));
	if (error != 0)
		return (error);

	ZFS_ENTER(zfsvfs);

	/* Look up the hidden extended attribute directory. */
	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	flags = FREAD;
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
	    xvp, td);
	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		/* A missing attribute file means "no such attribute". */
		if (error == ENOENT)
			error = ENOATTR;
		return (error);
	}

	/*
	 * A size query takes precedence over a read; callers pass either
	 * a_size (extattr_get with NULL buffer) or a_uio, not both.
	 */
	if (ap->a_size != NULL) {
		error = VOP_GETATTR(vp, &va, ap->a_cred);
		if (error == 0)
			*ap->a_size = (size_t)va.va_size;
	} else if (ap->a_uio != NULL)
		error = VOP_READ(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);

	VOP_UNLOCK(vp, 0);
	vn_close(vp, flags, ap->a_cred, td);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Vnode operation to remove a named attribute.
4654 */ 4655int 4656zfs_deleteextattr(struct vop_deleteextattr_args *ap) 4657/* 4658vop_deleteextattr { 4659 IN struct vnode *a_vp; 4660 IN int a_attrnamespace; 4661 IN const char *a_name; 4662 IN struct ucred *a_cred; 4663 IN struct thread *a_td; 4664}; 4665*/ 4666{ 4667 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 4668 struct thread *td = ap->a_td; 4669 struct nameidata nd; 4670 char attrname[255]; 4671 struct vattr va; 4672 vnode_t *xvp = NULL, *vp; 4673 int error, flags; 4674 4675 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 4676 ap->a_cred, ap->a_td, VWRITE); 4677 if (error != 0) 4678 return (error); 4679 4680 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 4681 sizeof(attrname)); 4682 if (error != 0) 4683 return (error); 4684 4685 ZFS_ENTER(zfsvfs); 4686 4687 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 4688 LOOKUP_XATTR); 4689 if (error != 0) { 4690 ZFS_EXIT(zfsvfs); 4691 return (error); 4692 } 4693 4694 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE, 4695 UIO_SYSSPACE, attrname, xvp, td); 4696 error = namei(&nd); 4697 vp = nd.ni_vp; 4698 NDFREE(&nd, NDF_ONLY_PNBUF); 4699 if (error != 0) { 4700 ZFS_EXIT(zfsvfs); 4701 if (error == ENOENT) 4702 error = ENOATTR; 4703 return (error); 4704 } 4705 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); 4706 4707 vput(nd.ni_dvp); 4708 if (vp == nd.ni_dvp) 4709 vrele(vp); 4710 else 4711 vput(vp); 4712 ZFS_EXIT(zfsvfs); 4713 4714 return (error); 4715} 4716 4717/* 4718 * Vnode operation to set a named attribute. 
4719 */ 4720static int 4721zfs_setextattr(struct vop_setextattr_args *ap) 4722/* 4723vop_setextattr { 4724 IN struct vnode *a_vp; 4725 IN int a_attrnamespace; 4726 IN const char *a_name; 4727 INOUT struct uio *a_uio; 4728 IN struct ucred *a_cred; 4729 IN struct thread *a_td; 4730}; 4731*/ 4732{ 4733 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 4734 struct thread *td = ap->a_td; 4735 struct nameidata nd; 4736 char attrname[255]; 4737 struct vattr va; 4738 vnode_t *xvp = NULL, *vp; 4739 int error, flags; 4740 4741 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 4742 ap->a_cred, ap->a_td, VWRITE); 4743 if (error != 0) 4744 return (error); 4745 4746 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 4747 sizeof(attrname)); 4748 if (error != 0) 4749 return (error); 4750 4751 ZFS_ENTER(zfsvfs); 4752 4753 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 4754 LOOKUP_XATTR | CREATE_XATTR_DIR); 4755 if (error != 0) { 4756 ZFS_EXIT(zfsvfs); 4757 return (error); 4758 } 4759 4760 flags = FFLAGS(O_WRONLY | O_CREAT); 4761 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname, 4762 xvp, td); 4763 error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); 4764 vp = nd.ni_vp; 4765 NDFREE(&nd, NDF_ONLY_PNBUF); 4766 if (error != 0) { 4767 ZFS_EXIT(zfsvfs); 4768 return (error); 4769 } 4770 4771 VATTR_NULL(&va); 4772 va.va_size = 0; 4773 error = VOP_SETATTR(vp, &va, ap->a_cred); 4774 if (error == 0) 4775 VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred); 4776 4777 VOP_UNLOCK(vp, 0); 4778 vn_close(vp, flags, ap->a_cred, td); 4779 ZFS_EXIT(zfsvfs); 4780 4781 return (error); 4782} 4783 4784/* 4785 * Vnode operation to retrieve extended attributes on a vnode. 
 */
static int
zfs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrprefix[16];
	/* One dirent's worth of buffer; entries are read one at a time. */
	u_char dirbuf[sizeof(struct dirent)];
	struct dirent *dp;
	struct iovec aiov;
	struct uio auio, *uio = ap->a_uio;
	size_t *sizep = ap->a_size;
	size_t plen;
	vnode_t *xvp = NULL, *vp;
	int done, error, eof, pos;

	/* The caller must be allowed to read this attribute namespace. */
	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);

	/*
	 * Build just the namespace prefix (empty attribute name); it is
	 * stripped from each matching entry before it is returned.
	 */
	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
	    sizeof(attrprefix));
	if (error != 0)
		return (error);
	plen = strlen(attrprefix);

	ZFS_ENTER(zfsvfs);

	if (sizep != NULL)
		*sizep = 0;

	/* Look up the hidden EA directory; do not create it here. */
	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		/*
		 * ENOATTR means that the EA directory does not yet exist,
		 * i.e. there are no extended attributes there.  That is an
		 * empty listing, not an error.
		 */
		if (error == ENOATTR)
			error = 0;
		return (error);
	}

	/* Re-lookup "." to get the EA directory vnode locked (shared). */
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
	    UIO_SYSSPACE, ".", xvp, td);
	error = namei(&nd);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Kernel-space uio used to read the EA directory entries. */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;

	do {
		u_char nlen;

		aiov.iov_base = (void *)dirbuf;
		aiov.iov_len = sizeof(dirbuf);
		auio.uio_resid = sizeof(dirbuf);
		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
		/* Bytes actually produced by this VOP_READDIR() call. */
		done = sizeof(dirbuf) - auio.uio_resid;
		if (error != 0)
			break;
		/* Walk every dirent packed into this buffer. */
		for (pos = 0; pos < done;) {
			dp = (struct dirent *)(dirbuf + pos);
			pos += dp->d_reclen;
			/*
			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
			 * is what we get when attribute was created on Solaris.
			 */
			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
				continue;
			/*
			 * With an empty prefix, skip FreeBSD-namespaced
			 * entries; otherwise keep only entries matching the
			 * requested namespace prefix.
			 */
			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
				continue;
			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
				continue;
			nlen = dp->d_namlen - plen;
			if (sizep != NULL)
				*sizep += 1 + nlen;
			else if (uio != NULL) {
				/*
				 * Format of extattr name entry is one byte for
				 * length and the rest for name.
				 */
				error = uiomove(&nlen, 1, uio->uio_rw, uio);
				if (error == 0) {
					error = uiomove(dp->d_name + plen, nlen,
					    uio->uio_rw, uio);
				}
				if (error != 0)
					break;
			}
		}
	} while (!eof && error == 0);

	vput(vp);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Fetch the NFSv4 ACL of a vnode into *ap->a_aclp.  Only ACL_TYPE_NFS4
 * is supported; the ace_t list obtained from ZFS is converted to a
 * FreeBSD struct acl and the ZFS-allocated buffer is freed.
 */
int
zfs_freebsd_getacl(ap)
	struct vop_getacl_args /* {
		struct vnode *vp;
		acl_type_t type;
		struct acl *aclp;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{
	int error;
	vsecattr_t vsecattr;

	if (ap->a_type != ACL_TYPE_NFS4)
		return (EINVAL);

	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
	/* Note: intentional assignment inside the condition (K&R idiom). */
	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
		return (error);

	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
	/* zfs_getsecattr() allocated the ace list; release it either way. */
	if (vsecattr.vsa_aclentp != NULL)
		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);

	return (error);
}

/*
 * Set the NFSv4 ACL of a vnode from *ap->a_aclp.  The FreeBSD struct acl
 * is validated, converted to an ace_t list, and handed to ZFS.
 */
int
zfs_freebsd_setacl(ap)
	struct vop_setacl_args /* {
		struct vnode *vp;
		acl_type_t type;
		struct acl *aclp;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{
	int error;
	vsecattr_t vsecattr;
	int aclbsize;	/* size of acl list in bytes */
	aclent_t *aaclp;

	if (ap->a_type != ACL_TYPE_NFS4)
		return (EINVAL);

	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
		return (EINVAL);

	/*
	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
	 * splitting every entry into two and appending "canonical six"
	 * entries at the end.  Don't allow for setting an ACL that would
	 * cause chmod(2) to run out of ACL entries.
	 */
	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
		return (ENOSPC);

	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
	if (error != 0)
		return (error);

	vsecattr.vsa_mask = VSA_ACE;
	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
	/* Keep a copy of the pointer so it can be freed after the call. */
	aaclp = vsecattr.vsa_aclentp;
	vsecattr.vsa_aclentsz = aclbsize;

	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
	kmem_free(aaclp, aclbsize);

	return (error);
}

/*
 * ACL editing/checking is not implemented for ZFS on FreeBSD.
 */
int
zfs_freebsd_aclcheck(ap)
	struct vop_aclcheck_args /* {
		struct vnode *vp;
		acl_type_t type;
		struct acl *aclp;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{

	return (EOPNOTSUPP);
}

struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;

/*
 * Vnode operations table for regular ZFS vnodes (files, directories,
 * symlinks).  Unhandled operations fall through to default_vnodeops.
 */
struct vop_vector zfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_access =		zfs_freebsd_access,
#ifdef FREEBSD_NAMECACHE
	/* Go through the name cache first; fall back to the real lookup. */
	.vop_lookup =		vfs_cache_lookup,
	.vop_cachedlookup =	zfs_freebsd_lookup,
#else
	.vop_lookup =		zfs_freebsd_lookup,
#endif
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_create =		zfs_freebsd_create,
	.vop_mknod =		zfs_freebsd_create,
	.vop_mkdir =		zfs_freebsd_mkdir,
	.vop_readdir =		zfs_freebsd_readdir,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_open =		zfs_freebsd_open,
	.vop_close =		zfs_freebsd_close,
	.vop_rmdir =		zfs_freebsd_rmdir,
	.vop_ioctl =		zfs_freebsd_ioctl,
	.vop_link =		zfs_freebsd_link,
	.vop_symlink =		zfs_freebsd_symlink,
	.vop_readlink =		zfs_freebsd_readlink,
	.vop_read =		zfs_freebsd_read,
	.vop_write =		zfs_freebsd_write,
	.vop_remove =		zfs_freebsd_remove,
	.vop_rename =		zfs_freebsd_rename,
	.vop_pathconf =		zfs_freebsd_pathconf,
	.vop_bmap =		VOP_EOPNOTSUPP,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getextattr =	zfs_getextattr,
	.vop_deleteextattr =	zfs_deleteextattr,
	.vop_setextattr =	zfs_setextattr,
	.vop_listextattr =	zfs_listextattr,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
};

/*
 * Vnode operations table for ZFS FIFOs.  Data movement is handled by the
 * generic fifo code (fifo_specops); read/write must never reach ZFS, hence
 * VOP_PANIC.  Metadata operations still go through ZFS.
 */
struct vop_vector zfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_access =		zfs_freebsd_access,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_write =		VOP_PANIC,
	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
};