zfs_vnops.c revision 207745
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26/* Portions Copyright 2007 Jeremy Teo */ 27 28#include <sys/types.h> 29#include <sys/param.h> 30#include <sys/time.h> 31#include <sys/systm.h> 32#include <sys/sysmacros.h> 33#include <sys/resource.h> 34#include <sys/vfs.h> 35#include <sys/vnode.h> 36#include <sys/file.h> 37#include <sys/stat.h> 38#include <sys/kmem.h> 39#include <sys/taskq.h> 40#include <sys/uio.h> 41#include <sys/atomic.h> 42#include <sys/namei.h> 43#include <sys/mman.h> 44#include <sys/cmn_err.h> 45#include <sys/errno.h> 46#include <sys/unistd.h> 47#include <sys/zfs_dir.h> 48#include <sys/zfs_ioctl.h> 49#include <sys/fs/zfs.h> 50#include <sys/dmu.h> 51#include <sys/spa.h> 52#include <sys/txg.h> 53#include <sys/dbuf.h> 54#include <sys/zap.h> 55#include <sys/dirent.h> 56#include <sys/policy.h> 57#include <sys/sunddi.h> 58#include <sys/filio.h> 59#include <sys/zfs_ctldir.h> 60#include <sys/zfs_fuid.h> 61#include <sys/dnlc.h> 62#include <sys/zfs_rlock.h> 63#include <sys/extdirent.h> 64#include <sys/kidmap.h> 65#include <sys/bio.h> 66#include 
<sys/buf.h> 67#include <sys/sf_buf.h> 68#include <sys/sched.h> 69#include <sys/acl.h> 70 71/* 72 * Programming rules. 73 * 74 * Each vnode op performs some logical unit of work. To do this, the ZPL must 75 * properly lock its in-core state, create a DMU transaction, do the work, 76 * record this work in the intent log (ZIL), commit the DMU transaction, 77 * and wait for the intent log to commit if it is a synchronous operation. 78 * Moreover, the vnode ops must work in both normal and log replay context. 79 * The ordering of events is important to avoid deadlocks and references 80 * to freed memory. The example below illustrates the following Big Rules: 81 * 82 * (1) A check must be made in each zfs thread for a mounted file system. 83 * This is done avoiding races using ZFS_ENTER(zfsvfs). 84 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 85 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 86 * can return EIO from the calling function. 87 * 88 * (2) VN_RELE() should always be the last thing except for zil_commit() 89 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 90 * First, if it's the last reference, the vnode/znode 91 * can be freed, so the zp may point to freed memory. Second, the last 92 * reference will call zfs_zinactive(), which may induce a lot of work -- 93 * pushing cached pages (which acquires range locks) and syncing out 94 * cached atime changes. Third, zfs_zinactive() may require a new tx, 95 * which could deadlock the system if you were already holding one. 96 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 97 * 98 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 99 * as they can span dmu_tx_assign() calls. 100 * 101 * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). 102 * In normal operation, this will be TXG_NOWAIT. During ZIL replay, 103 * it will be a specific txg. Either way, dmu_tx_assign() never blocks. 
104 * This is critical because we don't want to block while holding locks. 105 * Note, in particular, that if a lock is sometimes acquired before 106 * the tx assigns, and sometimes after (e.g. z_lock), then failing to 107 * use a non-blocking assign can deadlock the system. The scenario: 108 * 109 * Thread A has grabbed a lock before calling dmu_tx_assign(). 110 * Thread B is in an already-assigned tx, and blocks for this lock. 111 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 112 * forever, because the previous txg can't quiesce until B's tx commits. 113 * 114 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 115 * then drop all locks, call dmu_tx_wait(), and try again. 116 * 117 * (5) If the operation succeeded, generate the intent log entry for it 118 * before dropping locks. This ensures that the ordering of events 119 * in the intent log matches the order in which they actually occurred. 120 * 121 * (6) At the end of each vnode op, the DMU tx must always commit, 122 * regardless of whether there were any errors. 123 * 124 * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) 125 * to ensure that synchronous semantics are provided when necessary. 126 * 127 * In general, this is how things should be ordered in each vnode op: 128 * 129 * ZFS_ENTER(zfsvfs); // exit if unmounted 130 * top: 131 * zfs_dirent_lock(&dl, ...) 
// lock directory entry (may VN_HOLD()) 132 * rw_enter(...); // grab any other locks you need 133 * tx = dmu_tx_create(...); // get DMU tx 134 * dmu_tx_hold_*(); // hold each object you might modify 135 * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign 136 * if (error) { 137 * rw_exit(...); // drop locks 138 * zfs_dirent_unlock(dl); // unlock directory entry 139 * VN_RELE(...); // release held vnodes 140 * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 141 * dmu_tx_wait(tx); 142 * dmu_tx_abort(tx); 143 * goto top; 144 * } 145 * dmu_tx_abort(tx); // abort DMU tx 146 * ZFS_EXIT(zfsvfs); // finished in zfs 147 * return (error); // really out of space 148 * } 149 * error = do_real_work(); // do whatever this VOP does 150 * if (error == 0) 151 * zfs_log_*(...); // on success, make ZIL entry 152 * dmu_tx_commit(tx); // commit DMU tx -- error or not 153 * rw_exit(...); // drop locks 154 * zfs_dirent_unlock(dl); // unlock directory entry 155 * VN_RELE(...); // release held vnodes 156 * zil_commit(zilog, seq, foid); // synchronous when necessary 157 * ZFS_EXIT(zfsvfs); // finished in zfs 158 * return (error); // done, report error 159 */ 160 161/* ARGSUSED */ 162static int 163zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 164{ 165 znode_t *zp = VTOZ(*vpp); 166 167 if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && 168 ((flag & FAPPEND) == 0)) { 169 return (EPERM); 170 } 171 172 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 173 ZTOV(zp)->v_type == VREG && 174 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 175 zp->z_phys->zp_size > 0) 176 if (fs_vscan(*vpp, cr, 0) != 0) 177 return (EACCES); 178 179 /* Keep a count of the synchronous opens in the znode */ 180 if (flag & (FSYNC | FDSYNC)) 181 atomic_inc_32(&zp->z_sync_cnt); 182 183 return (0); 184} 185 186/* ARGSUSED */ 187static int 188zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 189 caller_context_t *ct) 190{ 191 znode_t *zp = 
VTOZ(vp); 192 193 /* Decrement the synchronous opens in the znode */ 194 if ((flag & (FSYNC | FDSYNC)) && (count == 1)) 195 atomic_dec_32(&zp->z_sync_cnt); 196 197 /* 198 * Clean up any locks held by this process on the vp. 199 */ 200 cleanlocks(vp, ddi_get_pid(), 0); 201 cleanshares(vp, ddi_get_pid()); 202 203 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 204 ZTOV(zp)->v_type == VREG && 205 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 206 zp->z_phys->zp_size > 0) 207 VERIFY(fs_vscan(vp, cr, 1) == 0); 208 209 return (0); 210} 211 212/* 213 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 214 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 215 */ 216static int 217zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) 218{ 219 znode_t *zp = VTOZ(vp); 220 uint64_t noff = (uint64_t)*off; /* new offset */ 221 uint64_t file_sz; 222 int error; 223 boolean_t hole; 224 225 file_sz = zp->z_phys->zp_size; 226 if (noff >= file_sz) { 227 return (ENXIO); 228 } 229 230 if (cmd == _FIO_SEEK_HOLE) 231 hole = B_TRUE; 232 else 233 hole = B_FALSE; 234 235 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); 236 237 /* end of file? */ 238 if ((error == ESRCH) || (noff > file_sz)) { 239 /* 240 * Handle the virtual hole at the end of file. 241 */ 242 if (hole) { 243 *off = file_sz; 244 return (0); 245 } 246 return (ENXIO); 247 } 248 249 if (noff < *off) 250 return (error); 251 *off = noff; 252 return (error); 253} 254 255/* ARGSUSED */ 256static int 257zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, 258 int *rvalp, caller_context_t *ct) 259{ 260 offset_t off; 261 int error; 262 zfsvfs_t *zfsvfs; 263 znode_t *zp; 264 265 switch (com) { 266 case _FIOFFS: 267 return (0); 268 269 /* 270 * The following two ioctls are used by bfu. Faking out, 271 * necessary to avoid bfu errors. 
272 */ 273 case _FIOGDIO: 274 case _FIOSDIO: 275 return (0); 276 277 case _FIO_SEEK_DATA: 278 case _FIO_SEEK_HOLE: 279 if (ddi_copyin((void *)data, &off, sizeof (off), flag)) 280 return (EFAULT); 281 282 zp = VTOZ(vp); 283 zfsvfs = zp->z_zfsvfs; 284 ZFS_ENTER(zfsvfs); 285 ZFS_VERIFY_ZP(zp); 286 287 /* offset parameter is in/out */ 288 error = zfs_holey(vp, com, &off); 289 ZFS_EXIT(zfsvfs); 290 if (error) 291 return (error); 292 if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) 293 return (EFAULT); 294 return (0); 295 } 296 return (ENOTTY); 297} 298 299/* 300 * When a file is memory mapped, we must keep the IO data synchronized 301 * between the DMU cache and the memory mapped pages. What this means: 302 * 303 * On Write: If we find a memory mapped page, we write to *both* 304 * the page and the dmu buffer. 305 * 306 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 307 * the file is memory mapped. 308 */ 309static int 310mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) 311{ 312 znode_t *zp = VTOZ(vp); 313 objset_t *os = zp->z_zfsvfs->z_os; 314 vm_object_t obj; 315 vm_page_t m; 316 struct sf_buf *sf; 317 int64_t start, off; 318 int len = nbytes; 319 int error = 0; 320 uint64_t dirbytes; 321 322 ASSERT(vp->v_mount != NULL); 323 obj = vp->v_object; 324 ASSERT(obj != NULL); 325 326 start = uio->uio_loffset; 327 off = start & PAGEOFFSET; 328 dirbytes = 0; 329 VM_OBJECT_LOCK(obj); 330 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 331 uint64_t bytes = MIN(PAGESIZE - off, len); 332 uint64_t fsize; 333 334again: 335 if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && 336 vm_page_is_valid(m, (vm_offset_t)off, bytes)) { 337 uint64_t woff; 338 caddr_t va; 339 340 if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb")) 341 goto again; 342 fsize = obj->un_pager.vnp.vnp_size; 343 vm_page_busy(m); 344 vm_page_lock_queues(); 345 vm_page_undirty(m); 346 vm_page_unlock_queues(); 347 VM_OBJECT_UNLOCK(obj); 348 if (dirbytes > 0) { 349 
error = dmu_write_uio(os, zp->z_id, uio, 350 dirbytes, tx); 351 dirbytes = 0; 352 } 353 if (error == 0) { 354 sched_pin(); 355 sf = sf_buf_alloc(m, SFB_CPUPRIVATE); 356 va = (caddr_t)sf_buf_kva(sf); 357 woff = uio->uio_loffset - off; 358 error = uiomove(va + off, bytes, UIO_WRITE, uio); 359 /* 360 * The uiomove() above could have been partially 361 * successful, that's why we call dmu_write() 362 * below unconditionally. The page was marked 363 * non-dirty above and we would lose the changes 364 * without doing so. If the uiomove() failed 365 * entirely, well, we just write what we got 366 * before one more time. 367 */ 368 dmu_write(os, zp->z_id, woff, 369 MIN(PAGESIZE, fsize - woff), va, tx); 370 sf_buf_free(sf); 371 sched_unpin(); 372 } 373 VM_OBJECT_LOCK(obj); 374 vm_page_wakeup(m); 375 } else { 376 if (__predict_false(obj->cache != NULL)) { 377 vm_page_cache_free(obj, OFF_TO_IDX(start), 378 OFF_TO_IDX(start) + 1); 379 } 380 dirbytes += bytes; 381 } 382 len -= bytes; 383 off = 0; 384 if (error) 385 break; 386 } 387 VM_OBJECT_UNLOCK(obj); 388 if (error == 0 && dirbytes > 0) 389 error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx); 390 return (error); 391} 392 393/* 394 * When a file is memory mapped, we must keep the IO data synchronized 395 * between the DMU cache and the memory mapped pages. What this means: 396 * 397 * On Read: We "read" preferentially from memory mapped pages, 398 * else we default from the dmu buffer. 399 * 400 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 401 * the file is memory mapped. 
402 */ 403static int 404mappedread(vnode_t *vp, int nbytes, uio_t *uio) 405{ 406 znode_t *zp = VTOZ(vp); 407 objset_t *os = zp->z_zfsvfs->z_os; 408 vm_object_t obj; 409 vm_page_t m; 410 struct sf_buf *sf; 411 int64_t start, off; 412 caddr_t va; 413 int len = nbytes; 414 int error = 0; 415 uint64_t dirbytes; 416 417 ASSERT(vp->v_mount != NULL); 418 obj = vp->v_object; 419 ASSERT(obj != NULL); 420 421 start = uio->uio_loffset; 422 off = start & PAGEOFFSET; 423 dirbytes = 0; 424 VM_OBJECT_LOCK(obj); 425 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 426 uint64_t bytes = MIN(PAGESIZE - off, len); 427 428again: 429 if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && 430 vm_page_is_valid(m, (vm_offset_t)off, bytes)) { 431 if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) 432 goto again; 433 vm_page_busy(m); 434 VM_OBJECT_UNLOCK(obj); 435 if (dirbytes > 0) { 436 error = dmu_read_uio(os, zp->z_id, uio, 437 dirbytes); 438 dirbytes = 0; 439 } 440 if (error == 0) { 441 sched_pin(); 442 sf = sf_buf_alloc(m, SFB_CPUPRIVATE); 443 va = (caddr_t)sf_buf_kva(sf); 444 error = uiomove(va + off, bytes, UIO_READ, uio); 445 sf_buf_free(sf); 446 sched_unpin(); 447 } 448 VM_OBJECT_LOCK(obj); 449 vm_page_wakeup(m); 450 } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) { 451 /* 452 * The code below is here to make sendfile(2) work 453 * correctly with ZFS. As pointed out by ups@ 454 * sendfile(2) should be changed to use VOP_GETPAGES(), 455 * but it pessimize performance of sendfile/UFS, that's 456 * why I handle this special case in ZFS code. 
457 */ 458 if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) 459 goto again; 460 vm_page_busy(m); 461 VM_OBJECT_UNLOCK(obj); 462 if (dirbytes > 0) { 463 error = dmu_read_uio(os, zp->z_id, uio, 464 dirbytes); 465 dirbytes = 0; 466 } 467 if (error == 0) { 468 sched_pin(); 469 sf = sf_buf_alloc(m, SFB_CPUPRIVATE); 470 va = (caddr_t)sf_buf_kva(sf); 471 error = dmu_read(os, zp->z_id, start + off, 472 bytes, (void *)(va + off)); 473 sf_buf_free(sf); 474 sched_unpin(); 475 } 476 VM_OBJECT_LOCK(obj); 477 vm_page_wakeup(m); 478 if (error == 0) 479 uio->uio_resid -= bytes; 480 } else { 481 dirbytes += bytes; 482 } 483 len -= bytes; 484 off = 0; 485 if (error) 486 break; 487 } 488 VM_OBJECT_UNLOCK(obj); 489 if (error == 0 && dirbytes > 0) 490 error = dmu_read_uio(os, zp->z_id, uio, dirbytes); 491 return (error); 492} 493 494offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 495 496/* 497 * Read bytes from specified file into supplied buffer. 498 * 499 * IN: vp - vnode of file to be read from. 500 * uio - structure supplying read location, range info, 501 * and return buffer. 502 * ioflag - SYNC flags; used to provide FRSYNC semantics. 503 * cr - credentials of caller. 504 * ct - caller context 505 * 506 * OUT: uio - updated offset and range, buffer filled. 
507 * 508 * RETURN: 0 if success 509 * error code if failure 510 * 511 * Side Effects: 512 * vp - atime updated if byte count > 0 513 */ 514/* ARGSUSED */ 515static int 516zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 517{ 518 znode_t *zp = VTOZ(vp); 519 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 520 objset_t *os; 521 ssize_t n, nbytes; 522 int error; 523 rl_t *rl; 524 525 ZFS_ENTER(zfsvfs); 526 ZFS_VERIFY_ZP(zp); 527 os = zfsvfs->z_os; 528 529 if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { 530 ZFS_EXIT(zfsvfs); 531 return (EACCES); 532 } 533 534 /* 535 * Validate file offset 536 */ 537 if (uio->uio_loffset < (offset_t)0) { 538 ZFS_EXIT(zfsvfs); 539 return (EINVAL); 540 } 541 542 /* 543 * Fasttrack empty reads 544 */ 545 if (uio->uio_resid == 0) { 546 ZFS_EXIT(zfsvfs); 547 return (0); 548 } 549 550 /* 551 * Check for mandatory locks 552 */ 553 if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { 554 if (error = chklock(vp, FREAD, 555 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 556 ZFS_EXIT(zfsvfs); 557 return (error); 558 } 559 } 560 561 /* 562 * If we're in FRSYNC mode, sync out this znode before reading it. 563 */ 564 if (ioflag & FRSYNC) 565 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); 566 567 /* 568 * Lock the range against changes. 569 */ 570 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); 571 572 /* 573 * If we are reading past end-of-file we can skip 574 * to the end; but we might still need to set atime. 
575 */ 576 if (uio->uio_loffset >= zp->z_phys->zp_size) { 577 error = 0; 578 goto out; 579 } 580 581 ASSERT(uio->uio_loffset < zp->z_phys->zp_size); 582 n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); 583 584 while (n > 0) { 585 nbytes = MIN(n, zfs_read_chunk_size - 586 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 587 588 if (vn_has_cached_data(vp)) 589 error = mappedread(vp, nbytes, uio); 590 else 591 error = dmu_read_uio(os, zp->z_id, uio, nbytes); 592 if (error) { 593 /* convert checksum errors into IO errors */ 594 if (error == ECKSUM) 595 error = EIO; 596 break; 597 } 598 599 n -= nbytes; 600 } 601 602out: 603 zfs_range_unlock(rl); 604 605 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 606 ZFS_EXIT(zfsvfs); 607 return (error); 608} 609 610/* 611 * Fault in the pages of the first n bytes specified by the uio structure. 612 * 1 byte in each page is touched and the uio struct is unmodified. 613 * Any error will exit this routine as this is only a best 614 * attempt to get the pages resident. This is a copy of ufs_trans_touch(). 615 */ 616static void 617zfs_prefault_write(ssize_t n, struct uio *uio) 618{ 619 struct iovec *iov; 620 ulong_t cnt, incr; 621 caddr_t p; 622 623 if (uio->uio_segflg != UIO_USERSPACE) 624 return; 625 626 iov = uio->uio_iov; 627 628 while (n) { 629 cnt = MIN(iov->iov_len, n); 630 if (cnt == 0) { 631 /* empty iov entry */ 632 iov++; 633 continue; 634 } 635 n -= cnt; 636 /* 637 * touch each page in this segment. 638 */ 639 p = iov->iov_base; 640 while (cnt) { 641 if (fubyte(p) == -1) 642 return; 643 incr = MIN(cnt, PAGESIZE); 644 p += incr; 645 cnt -= incr; 646 } 647 /* 648 * touch the last byte in case it straddles a page. 649 */ 650 p--; 651 if (fubyte(p) == -1) 652 return; 653 iov++; 654 } 655} 656 657/* 658 * Write the bytes to a file. 659 * 660 * IN: vp - vnode of file to be written to. 661 * uio - structure supplying write location, range info, 662 * and data buffer. 663 * ioflag - IO_APPEND flag set if in append mode. 
664 * cr - credentials of caller. 665 * ct - caller context (NFS/CIFS fem monitor only) 666 * 667 * OUT: uio - updated offset and range. 668 * 669 * RETURN: 0 if success 670 * error code if failure 671 * 672 * Timestamps: 673 * vp - ctime|mtime updated if byte count > 0 674 */ 675/* ARGSUSED */ 676static int 677zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 678{ 679 znode_t *zp = VTOZ(vp); 680 rlim64_t limit = MAXOFFSET_T; 681 ssize_t start_resid = uio->uio_resid; 682 ssize_t tx_bytes; 683 uint64_t end_size; 684 dmu_tx_t *tx; 685 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 686 zilog_t *zilog; 687 offset_t woff; 688 ssize_t n, nbytes; 689 rl_t *rl; 690 int max_blksz = zfsvfs->z_max_blksz; 691 uint64_t pflags; 692 int error; 693 694 /* 695 * Fasttrack empty write 696 */ 697 n = start_resid; 698 if (n == 0) 699 return (0); 700 701 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 702 limit = MAXOFFSET_T; 703 704 ZFS_ENTER(zfsvfs); 705 ZFS_VERIFY_ZP(zp); 706 707 /* 708 * If immutable or not appending then return EPERM 709 */ 710 pflags = zp->z_phys->zp_flags; 711 if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || 712 ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 713 (uio->uio_loffset < zp->z_phys->zp_size))) { 714 ZFS_EXIT(zfsvfs); 715 return (EPERM); 716 } 717 718 zilog = zfsvfs->z_log; 719 720 /* 721 * Pre-fault the pages to ensure slow (eg NFS) pages 722 * don't hold up txg. 723 */ 724 zfs_prefault_write(n, uio); 725 726 /* 727 * If in append mode, set the io offset pointer to eof. 728 */ 729 if (ioflag & IO_APPEND) { 730 /* 731 * Range lock for a file append: 732 * The value for the start of range will be determined by 733 * zfs_range_lock() (to guarantee append semantics). 734 * If this write will cause the block size to increase, 735 * zfs_range_lock() will lock the entire file, so we must 736 * later reduce the range after we grow the block size. 
737 */ 738 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 739 if (rl->r_len == UINT64_MAX) { 740 /* overlocked, zp_size can't change */ 741 woff = uio->uio_loffset = zp->z_phys->zp_size; 742 } else { 743 woff = uio->uio_loffset = rl->r_off; 744 } 745 } else { 746 woff = uio->uio_loffset; 747 /* 748 * Validate file offset 749 */ 750 if (woff < 0) { 751 ZFS_EXIT(zfsvfs); 752 return (EINVAL); 753 } 754 755 /* 756 * If we need to grow the block size then zfs_range_lock() 757 * will lock a wider range than we request here. 758 * Later after growing the block size we reduce the range. 759 */ 760 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 761 } 762 763 if (woff >= limit) { 764 zfs_range_unlock(rl); 765 ZFS_EXIT(zfsvfs); 766 return (EFBIG); 767 } 768 769 if ((woff + n) > limit || woff > (limit - n)) 770 n = limit - woff; 771 772 /* 773 * Check for mandatory locks 774 */ 775 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && 776 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 777 zfs_range_unlock(rl); 778 ZFS_EXIT(zfsvfs); 779 return (error); 780 } 781 end_size = MAX(zp->z_phys->zp_size, woff + n); 782 783 /* 784 * Write the file in reasonable size chunks. Each chunk is written 785 * in a separate transaction; this keeps the intent log records small 786 * and allows us to do more fine-grained space accounting. 787 */ 788 while (n > 0) { 789 /* 790 * Start a transaction. 791 */ 792 woff = uio->uio_loffset; 793 tx = dmu_tx_create(zfsvfs->z_os); 794 dmu_tx_hold_bonus(tx, zp->z_id); 795 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 796 error = dmu_tx_assign(tx, zfsvfs->z_assign); 797 if (error) { 798 if (error == ERESTART && 799 zfsvfs->z_assign == TXG_NOWAIT) { 800 dmu_tx_wait(tx); 801 dmu_tx_abort(tx); 802 continue; 803 } 804 dmu_tx_abort(tx); 805 break; 806 } 807 808 /* 809 * If zfs_range_lock() over-locked we grow the blocksize 810 * and then reduce the lock range. 
This will only happen 811 * on the first iteration since zfs_range_reduce() will 812 * shrink down r_len to the appropriate size. 813 */ 814 if (rl->r_len == UINT64_MAX) { 815 uint64_t new_blksz; 816 817 if (zp->z_blksz > max_blksz) { 818 ASSERT(!ISP2(zp->z_blksz)); 819 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); 820 } else { 821 new_blksz = MIN(end_size, max_blksz); 822 } 823 zfs_grow_blocksize(zp, new_blksz, tx); 824 zfs_range_reduce(rl, woff, n); 825 } 826 827 /* 828 * XXX - should we really limit each write to z_max_blksz? 829 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 830 */ 831 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 832 833 if (woff + nbytes > zp->z_phys->zp_size) 834 vnode_pager_setsize(vp, woff + nbytes); 835 836 rw_enter(&zp->z_map_lock, RW_READER); 837 838 tx_bytes = uio->uio_resid; 839 if (vn_has_cached_data(vp)) { 840 rw_exit(&zp->z_map_lock); 841 error = mappedwrite(vp, nbytes, uio, tx); 842 } else { 843 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, 844 uio, nbytes, tx); 845 rw_exit(&zp->z_map_lock); 846 } 847 tx_bytes -= uio->uio_resid; 848 849 /* 850 * If we made no progress, we're done. If we made even 851 * partial progress, update the znode and ZIL accordingly. 852 */ 853 if (tx_bytes == 0) { 854 dmu_tx_commit(tx); 855 ASSERT(error != 0); 856 break; 857 } 858 859 /* 860 * Clear Set-UID/Set-GID bits on successful write if not 861 * privileged and at least one of the excute bits is set. 862 * 863 * It would be nice to to this after all writes have 864 * been done, but that would still expose the ISUID/ISGID 865 * to another app after the partial write is committed. 866 * 867 * Note: we don't call zfs_fuid_map_id() here because 868 * user 0 is not an ephemeral uid. 
869 */ 870 mutex_enter(&zp->z_acl_lock); 871 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | 872 (S_IXUSR >> 6))) != 0 && 873 (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && 874 secpolicy_vnode_setid_retain(vp, cr, 875 (zp->z_phys->zp_mode & S_ISUID) != 0 && 876 zp->z_phys->zp_uid == 0) != 0) { 877 zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); 878 } 879 mutex_exit(&zp->z_acl_lock); 880 881 /* 882 * Update time stamp. NOTE: This marks the bonus buffer as 883 * dirty, so we don't have to do it again for zp_size. 884 */ 885 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 886 887 /* 888 * Update the file size (zp_size) if it has changed; 889 * account for possible concurrent updates. 890 */ 891 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) 892 (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, 893 uio->uio_loffset); 894 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 895 dmu_tx_commit(tx); 896 897 if (error != 0) 898 break; 899 ASSERT(tx_bytes == nbytes); 900 n -= nbytes; 901 } 902 903 zfs_range_unlock(rl); 904 905 /* 906 * If we're in replay mode, or we made no progress, return error. 907 * Otherwise, it's at least a partial write, so it's successful. 908 */ 909 if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { 910 ZFS_EXIT(zfsvfs); 911 return (error); 912 } 913 914 if (ioflag & (FSYNC | FDSYNC)) 915 zil_commit(zilog, zp->z_last_itx, zp->z_id); 916 917 ZFS_EXIT(zfsvfs); 918 return (0); 919} 920 921void 922zfs_get_done(dmu_buf_t *db, void *vzgd) 923{ 924 zgd_t *zgd = (zgd_t *)vzgd; 925 rl_t *rl = zgd->zgd_rl; 926 vnode_t *vp = ZTOV(rl->r_zp); 927 objset_t *os = rl->r_zp->z_zfsvfs->z_os; 928 int vfslocked; 929 930 vfslocked = VFS_LOCK_GIANT(vp->v_vfsp); 931 dmu_buf_rele(db, vzgd); 932 zfs_range_unlock(rl); 933 /* 934 * Release the vnode asynchronously as we currently have the 935 * txg stopped from syncing. 
936 */ 937 VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 938 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 939 kmem_free(zgd, sizeof (zgd_t)); 940 VFS_UNLOCK_GIANT(vfslocked); 941} 942 943/* 944 * Get data to generate a TX_WRITE intent log record. 945 */ 946int 947zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 948{ 949 zfsvfs_t *zfsvfs = arg; 950 objset_t *os = zfsvfs->z_os; 951 znode_t *zp; 952 uint64_t off = lr->lr_offset; 953 dmu_buf_t *db; 954 rl_t *rl; 955 zgd_t *zgd; 956 int dlen = lr->lr_length; /* length of user data */ 957 int error = 0; 958 959 ASSERT(zio); 960 ASSERT(dlen != 0); 961 962 /* 963 * Nothing to do if the file has been removed 964 */ 965 if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) 966 return (ENOENT); 967 if (zp->z_unlinked) { 968 /* 969 * Release the vnode asynchronously as we currently have the 970 * txg stopped from syncing. 971 */ 972 VN_RELE_ASYNC(ZTOV(zp), 973 dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 974 return (ENOENT); 975 } 976 977 /* 978 * Write records come in two flavors: immediate and indirect. 979 * For small writes it's cheaper to store the data with the 980 * log record (immediate); for large writes it's cheaper to 981 * sync the data and get a pointer to it (indirect) so that 982 * we don't have to write the data twice. 983 */ 984 if (buf != NULL) { /* immediate write */ 985 rl = zfs_range_lock(zp, off, dlen, RL_READER); 986 /* test for truncation needs to be done while range locked */ 987 if (off >= zp->z_phys->zp_size) { 988 error = ENOENT; 989 goto out; 990 } 991 VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); 992 } else { /* indirect write */ 993 uint64_t boff; /* block starting offset */ 994 995 /* 996 * Have to lock the whole block to ensure when it's 997 * written out and it's checksum is being calculated 998 * that no one can change the data. We need to re-check 999 * blocksize after we get the lock in case it's changed! 
1000 */ 1001 for (;;) { 1002 if (ISP2(zp->z_blksz)) { 1003 boff = P2ALIGN_TYPED(off, zp->z_blksz, 1004 uint64_t); 1005 } else { 1006 boff = 0; 1007 } 1008 dlen = zp->z_blksz; 1009 rl = zfs_range_lock(zp, boff, dlen, RL_READER); 1010 if (zp->z_blksz == dlen) 1011 break; 1012 zfs_range_unlock(rl); 1013 } 1014 /* test for truncation needs to be done while range locked */ 1015 if (off >= zp->z_phys->zp_size) { 1016 error = ENOENT; 1017 goto out; 1018 } 1019 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); 1020 zgd->zgd_rl = rl; 1021 zgd->zgd_zilog = zfsvfs->z_log; 1022 zgd->zgd_bp = &lr->lr_blkptr; 1023 VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); 1024 ASSERT(boff == db->db_offset); 1025 lr->lr_blkoff = off - boff; 1026 error = dmu_sync(zio, db, &lr->lr_blkptr, 1027 lr->lr_common.lrc_txg, zfs_get_done, zgd); 1028 ASSERT((error && error != EINPROGRESS) || 1029 lr->lr_length <= zp->z_blksz); 1030 if (error == 0) 1031 zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); 1032 /* 1033 * If we get EINPROGRESS, then we need to wait for a 1034 * write IO initiated by dmu_sync() to complete before 1035 * we can release this dbuf. We will finish everything 1036 * up in the zfs_get_done() callback. 1037 */ 1038 if (error == EINPROGRESS) 1039 return (0); 1040 dmu_buf_rele(db, zgd); 1041 kmem_free(zgd, sizeof (zgd_t)); 1042 } 1043out: 1044 zfs_range_unlock(rl); 1045 /* 1046 * Release the vnode asynchronously as we currently have the 1047 * txg stopped from syncing. 
1048 */ 1049 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 1050 return (error); 1051} 1052 1053/*ARGSUSED*/ 1054static int 1055zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, 1056 caller_context_t *ct) 1057{ 1058 znode_t *zp = VTOZ(vp); 1059 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1060 int error; 1061 1062 ZFS_ENTER(zfsvfs); 1063 ZFS_VERIFY_ZP(zp); 1064 1065 if (flag & V_ACE_MASK) 1066 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); 1067 else 1068 error = zfs_zaccess_rwx(zp, mode, flag, cr); 1069 1070 ZFS_EXIT(zfsvfs); 1071 return (error); 1072} 1073 1074/* 1075 * Lookup an entry in a directory, or an extended attribute directory. 1076 * If it exists, return a held vnode reference for it. 1077 * 1078 * IN: dvp - vnode of directory to search. 1079 * nm - name of entry to lookup. 1080 * pnp - full pathname to lookup [UNUSED]. 1081 * flags - LOOKUP_XATTR set if looking for an attribute. 1082 * rdir - root directory vnode [UNUSED]. 1083 * cr - credentials of caller. 1084 * ct - caller context 1085 * direntflags - directory lookup flags 1086 * realpnp - returned pathname. 1087 * 1088 * OUT: vpp - vnode of located entry, NULL if not found. 1089 * 1090 * RETURN: 0 if success 1091 * error code if failure 1092 * 1093 * Timestamps: 1094 * NA 1095 */ 1096/* ARGSUSED */ 1097static int 1098zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, 1099 int nameiop, cred_t *cr, kthread_t *td, int flags) 1100{ 1101 znode_t *zdp = VTOZ(dvp); 1102 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1103 int error; 1104 int *direntflags = NULL; 1105 void *realpnp = NULL; 1106 1107 ZFS_ENTER(zfsvfs); 1108 ZFS_VERIFY_ZP(zdp); 1109 1110 *vpp = NULL; 1111 1112 if (flags & LOOKUP_XATTR) { 1113#ifdef TODO 1114 /* 1115 * If the xattr property is off, refuse the lookup request. 1116 */ 1117 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1118 ZFS_EXIT(zfsvfs); 1119 return (EINVAL); 1120 } 1121#endif 1122 1123 /* 1124 * We don't allow recursive attributes.. 
1125 * Maybe someday we will. 1126 */ 1127 if (zdp->z_phys->zp_flags & ZFS_XATTR) { 1128 ZFS_EXIT(zfsvfs); 1129 return (EINVAL); 1130 } 1131 1132 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1133 ZFS_EXIT(zfsvfs); 1134 return (error); 1135 } 1136 1137 /* 1138 * Do we have permission to get into attribute directory? 1139 */ 1140 1141 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1142 B_FALSE, cr)) { 1143 VN_RELE(*vpp); 1144 *vpp = NULL; 1145 } 1146 1147 ZFS_EXIT(zfsvfs); 1148 return (error); 1149 } 1150 1151 if (dvp->v_type != VDIR) { 1152 ZFS_EXIT(zfsvfs); 1153 return (ENOTDIR); 1154 } 1155 1156 /* 1157 * Check accessibility of directory. 1158 */ 1159 1160 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1161 ZFS_EXIT(zfsvfs); 1162 return (error); 1163 } 1164 1165 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1166 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1167 ZFS_EXIT(zfsvfs); 1168 return (EILSEQ); 1169 } 1170 1171 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); 1172 if (error == 0) { 1173 /* 1174 * Convert device special files 1175 */ 1176 if (IS_DEVVP(*vpp)) { 1177 vnode_t *svp; 1178 1179 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1180 VN_RELE(*vpp); 1181 if (svp == NULL) 1182 error = ENOSYS; 1183 else 1184 *vpp = svp; 1185 } 1186 } 1187 1188 /* Translate errors and add SAVENAME when needed. */ 1189 if (cnp->cn_flags & ISLASTCN) { 1190 switch (nameiop) { 1191 case CREATE: 1192 case RENAME: 1193 if (error == ENOENT) { 1194 error = EJUSTRETURN; 1195 cnp->cn_flags |= SAVENAME; 1196 break; 1197 } 1198 /* FALLTHROUGH */ 1199 case DELETE: 1200 if (error == 0) 1201 cnp->cn_flags |= SAVENAME; 1202 break; 1203 } 1204 } 1205 if (error == 0 && (nm[0] != '.' 
|| nm[1] != '\0')) { 1206 int ltype = 0; 1207 1208 if (cnp->cn_flags & ISDOTDOT) { 1209 ltype = VOP_ISLOCKED(dvp); 1210 VOP_UNLOCK(dvp, 0); 1211 } 1212 ZFS_EXIT(zfsvfs); 1213 error = vn_lock(*vpp, cnp->cn_lkflags); 1214 if (cnp->cn_flags & ISDOTDOT) 1215 vn_lock(dvp, ltype | LK_RETRY); 1216 if (error != 0) { 1217 VN_RELE(*vpp); 1218 *vpp = NULL; 1219 return (error); 1220 } 1221 } else { 1222 ZFS_EXIT(zfsvfs); 1223 } 1224 1225#ifdef FREEBSD_NAMECACHE 1226 /* 1227 * Insert name into cache (as non-existent) if appropriate. 1228 */ 1229 if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) 1230 cache_enter(dvp, *vpp, cnp); 1231 /* 1232 * Insert name into cache if appropriate. 1233 */ 1234 if (error == 0 && (cnp->cn_flags & MAKEENTRY)) { 1235 if (!(cnp->cn_flags & ISLASTCN) || 1236 (nameiop != DELETE && nameiop != RENAME)) { 1237 cache_enter(dvp, *vpp, cnp); 1238 } 1239 } 1240#endif 1241 1242 return (error); 1243} 1244 1245/* 1246 * Attempt to create a new entry in a directory. If the entry 1247 * already exists, truncate the file if permissible, else return 1248 * an error. Return the vp of the created or trunc'd file. 1249 * 1250 * IN: dvp - vnode of directory to put new file entry in. 1251 * name - name of new file entry. 1252 * vap - attributes of new file. 1253 * excl - flag indicating exclusive or non-exclusive mode. 1254 * mode - mode to open file with. 1255 * cr - credentials of caller. 1256 * flag - large file flag [UNUSED]. 1257 * ct - caller context 1258 * vsecp - ACL to be set 1259 * 1260 * OUT: vpp - vnode of created or trunc'd entry. 
1261 * 1262 * RETURN: 0 if success 1263 * error code if failure 1264 * 1265 * Timestamps: 1266 * dvp - ctime|mtime updated if new entry created 1267 * vp - ctime|mtime always, atime if new 1268 */ 1269 1270/* ARGSUSED */ 1271static int 1272zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, 1273 vnode_t **vpp, cred_t *cr, kthread_t *td) 1274{ 1275 znode_t *zp, *dzp = VTOZ(dvp); 1276 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1277 zilog_t *zilog; 1278 objset_t *os; 1279 zfs_dirlock_t *dl; 1280 dmu_tx_t *tx; 1281 int error; 1282 zfs_acl_t *aclp = NULL; 1283 zfs_fuid_info_t *fuidp = NULL; 1284 void *vsecp = NULL; 1285 int flag = 0; 1286 1287 /* 1288 * If we have an ephemeral id, ACL, or XVATTR then 1289 * make sure file system is at proper version 1290 */ 1291 1292 if (zfsvfs->z_use_fuids == B_FALSE && 1293 (vsecp || (vap->va_mask & AT_XVATTR) || 1294 IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr)))) 1295 return (EINVAL); 1296 1297 ZFS_ENTER(zfsvfs); 1298 ZFS_VERIFY_ZP(dzp); 1299 os = zfsvfs->z_os; 1300 zilog = zfsvfs->z_log; 1301 1302 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 1303 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1304 ZFS_EXIT(zfsvfs); 1305 return (EILSEQ); 1306 } 1307 1308 if (vap->va_mask & AT_XVATTR) { 1309 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, 1310 crgetuid(cr), cr, vap->va_type)) != 0) { 1311 ZFS_EXIT(zfsvfs); 1312 return (error); 1313 } 1314 } 1315top: 1316 *vpp = NULL; 1317 1318 if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) 1319 vap->va_mode &= ~S_ISVTX; 1320 1321 if (*name == '\0') { 1322 /* 1323 * Null component name refers to the directory itself. 
1324 */ 1325 VN_HOLD(dvp); 1326 zp = dzp; 1327 dl = NULL; 1328 error = 0; 1329 } else { 1330 /* possible VN_HOLD(zp) */ 1331 int zflg = 0; 1332 1333 if (flag & FIGNORECASE) 1334 zflg |= ZCILOOK; 1335 1336 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1337 NULL, NULL); 1338 if (error) { 1339 if (strcmp(name, "..") == 0) 1340 error = EISDIR; 1341 ZFS_EXIT(zfsvfs); 1342 if (aclp) 1343 zfs_acl_free(aclp); 1344 return (error); 1345 } 1346 } 1347 if (vsecp && aclp == NULL) { 1348 error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); 1349 if (error) { 1350 ZFS_EXIT(zfsvfs); 1351 if (dl) 1352 zfs_dirent_unlock(dl); 1353 return (error); 1354 } 1355 } 1356 1357 if (zp == NULL) { 1358 uint64_t txtype; 1359 1360 /* 1361 * Create a new file object and update the directory 1362 * to reference it. 1363 */ 1364 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 1365 goto out; 1366 } 1367 1368 /* 1369 * We only support the creation of regular files in 1370 * extended attribute directories. 
1371 */ 1372 if ((dzp->z_phys->zp_flags & ZFS_XATTR) && 1373 (vap->va_type != VREG)) { 1374 error = EINVAL; 1375 goto out; 1376 } 1377 1378 tx = dmu_tx_create(os); 1379 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1380 if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || 1381 IS_EPHEMERAL(crgetgid(cr))) { 1382 if (zfsvfs->z_fuid_obj == 0) { 1383 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1384 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1385 FUID_SIZE_ESTIMATE(zfsvfs)); 1386 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 1387 FALSE, NULL); 1388 } else { 1389 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 1390 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 1391 FUID_SIZE_ESTIMATE(zfsvfs)); 1392 } 1393 } 1394 dmu_tx_hold_bonus(tx, dzp->z_id); 1395 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 1396 if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { 1397 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1398 0, SPA_MAXBLOCKSIZE); 1399 } 1400 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1401 if (error) { 1402 zfs_dirent_unlock(dl); 1403 if (error == ERESTART && 1404 zfsvfs->z_assign == TXG_NOWAIT) { 1405 dmu_tx_wait(tx); 1406 dmu_tx_abort(tx); 1407 goto top; 1408 } 1409 dmu_tx_abort(tx); 1410 ZFS_EXIT(zfsvfs); 1411 if (aclp) 1412 zfs_acl_free(aclp); 1413 return (error); 1414 } 1415 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); 1416 (void) zfs_link_create(dl, zp, tx, ZNEW); 1417 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 1418 if (flag & FIGNORECASE) 1419 txtype |= TX_CI; 1420 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 1421 vsecp, fuidp, vap); 1422 if (fuidp) 1423 zfs_fuid_info_free(fuidp); 1424 dmu_tx_commit(tx); 1425 } else { 1426 int aflags = (flag & FAPPEND) ? V_APPEND : 0; 1427 1428 /* 1429 * A directory entry already exists for this name. 1430 */ 1431 /* 1432 * Can't truncate an existing file if in exclusive mode. 1433 */ 1434 if (excl == EXCL) { 1435 error = EEXIST; 1436 goto out; 1437 } 1438 /* 1439 * Can't open a directory for writing. 
1440 */ 1441 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { 1442 error = EISDIR; 1443 goto out; 1444 } 1445 /* 1446 * Verify requested access to file. 1447 */ 1448 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { 1449 goto out; 1450 } 1451 1452 mutex_enter(&dzp->z_lock); 1453 dzp->z_seq++; 1454 mutex_exit(&dzp->z_lock); 1455 1456 /* 1457 * Truncate regular files if requested. 1458 */ 1459 if ((ZTOV(zp)->v_type == VREG) && 1460 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { 1461 /* we can't hold any locks when calling zfs_freesp() */ 1462 zfs_dirent_unlock(dl); 1463 dl = NULL; 1464 error = zfs_freesp(zp, 0, 0, mode, TRUE); 1465 if (error == 0) { 1466 vnevent_create(ZTOV(zp), ct); 1467 } 1468 } 1469 } 1470out: 1471 if (dl) 1472 zfs_dirent_unlock(dl); 1473 1474 if (error) { 1475 if (zp) 1476 VN_RELE(ZTOV(zp)); 1477 } else { 1478 *vpp = ZTOV(zp); 1479 /* 1480 * If vnode is for a device return a specfs vnode instead. 1481 */ 1482 if (IS_DEVVP(*vpp)) { 1483 struct vnode *svp; 1484 1485 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1486 VN_RELE(*vpp); 1487 if (svp == NULL) { 1488 error = ENOSYS; 1489 } 1490 *vpp = svp; 1491 } 1492 } 1493 if (aclp) 1494 zfs_acl_free(aclp); 1495 1496 ZFS_EXIT(zfsvfs); 1497 return (error); 1498} 1499 1500/* 1501 * Remove an entry from a directory. 1502 * 1503 * IN: dvp - vnode of directory to remove entry from. 1504 * name - name of entry to remove. 1505 * cr - credentials of caller. 
*		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 *
 * In this revision immediate deletion is compiled out: may_delete_now is
 * forced to FALSE and the delete_now computation is guarded by "0 &&",
 * so delete_now keeps its initial FALSE value and an unlinked znode is
 * always handed to zfs_unlinked_add().
 */
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp = NULL;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/*
	 * For a case-insensitive remove, allocate a pathname buffer that is
	 * passed to zfs_dirent_lock() and later used for dnlc_remove().
	 */
	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = EPERM;
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	/* Hard-wired off; every may_delete_now test below is false. */
	may_delete_now = FALSE;

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (may_delete_now) {
		toobig =
		    zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
		/* XXX - do we need this if we are deleting? */
		dmu_tx_hold_bonus(tx, xattr_obj);
	}

	/* are there any additional acls */
	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
	    may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/*
		 * Drop the entry lock and the vnode hold before either
		 * retrying from "top" (which takes both again) or failing.
		 */
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	/* Disabled by the constant 0: delete_now stays FALSE. */
	if (0 && unlinked) {
		VI_LOCK(vp);
		delete_now = may_delete_now && !toobig &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    zp->z_phys->zp_xattr == xattr_obj &&
		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
		VI_UNLOCK(vp);
	}

	if (delete_now) {
		if (zp->z_phys->zp_xattr) {
			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
			ASSERT3U(error, ==, 0);
			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
			dmu_buf_will_dirty(xzp->z_dbuf, tx);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_phys->zp_links = 0;
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);
			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
		}
		mutex_enter(&zp->z_lock);
		VI_LOCK(vp);
		vp->v_count--;
		ASSERT3U(vp->v_count, ==, 0);
		VI_UNLOCK(vp);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
	} else if (unlinked) {
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);

	if (!delete_now) {
		VN_RELE(vp);
	} else if (xzp) {
		/* this rele is delayed to prevent nesting transactions */
		VN_RELE(ZTOV(xzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 *	IN:	dvp	- vnode of directory to add subdir to.
 *		dirname	- name of new directory.
 *		vap	- attributes of new directory.
 *		cr	- credentials of caller.
1703 * ct - caller context 1704 * vsecp - ACL to be set 1705 * 1706 * OUT: vpp - vnode of created directory. 1707 * 1708 * RETURN: 0 if success 1709 * error code if failure 1710 * 1711 * Timestamps: 1712 * dvp - ctime|mtime updated 1713 * vp - ctime|mtime|atime updated 1714 */ 1715/*ARGSUSED*/ 1716static int 1717zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, 1718 caller_context_t *ct, int flags, vsecattr_t *vsecp) 1719{ 1720 znode_t *zp, *dzp = VTOZ(dvp); 1721 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1722 zilog_t *zilog; 1723 zfs_dirlock_t *dl; 1724 uint64_t txtype; 1725 dmu_tx_t *tx; 1726 int error; 1727 zfs_acl_t *aclp = NULL; 1728 zfs_fuid_info_t *fuidp = NULL; 1729 int zf = ZNEW; 1730 1731 ASSERT(vap->va_type == VDIR); 1732 1733 /* 1734 * If we have an ephemeral id, ACL, or XVATTR then 1735 * make sure file system is at proper version 1736 */ 1737 1738 if (zfsvfs->z_use_fuids == B_FALSE && 1739 (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))|| 1740 IS_EPHEMERAL(crgetgid(cr)))) 1741 return (EINVAL); 1742 1743 ZFS_ENTER(zfsvfs); 1744 ZFS_VERIFY_ZP(dzp); 1745 zilog = zfsvfs->z_log; 1746 1747 if (dzp->z_phys->zp_flags & ZFS_XATTR) { 1748 ZFS_EXIT(zfsvfs); 1749 return (EINVAL); 1750 } 1751 1752 if (zfsvfs->z_utf8 && u8_validate(dirname, 1753 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1754 ZFS_EXIT(zfsvfs); 1755 return (EILSEQ); 1756 } 1757 if (flags & FIGNORECASE) 1758 zf |= ZCILOOK; 1759 1760 if (vap->va_mask & AT_XVATTR) 1761 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, 1762 crgetuid(cr), cr, vap->va_type)) != 0) { 1763 ZFS_EXIT(zfsvfs); 1764 return (error); 1765 } 1766 1767 /* 1768 * First make sure the new directory doesn't exist. 
1769 */ 1770top: 1771 *vpp = NULL; 1772 1773 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 1774 NULL, NULL)) { 1775 ZFS_EXIT(zfsvfs); 1776 return (error); 1777 } 1778 1779 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { 1780 zfs_dirent_unlock(dl); 1781 ZFS_EXIT(zfsvfs); 1782 return (error); 1783 } 1784 1785 if (vsecp && aclp == NULL) { 1786 error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); 1787 if (error) { 1788 zfs_dirent_unlock(dl); 1789 ZFS_EXIT(zfsvfs); 1790 return (error); 1791 } 1792 } 1793 /* 1794 * Add a new entry to the directory. 1795 */ 1796 tx = dmu_tx_create(zfsvfs->z_os); 1797 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1798 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1799 if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || 1800 IS_EPHEMERAL(crgetgid(cr))) { 1801 if (zfsvfs->z_fuid_obj == 0) { 1802 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1803 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1804 FUID_SIZE_ESTIMATE(zfsvfs)); 1805 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); 1806 } else { 1807 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 1808 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 1809 FUID_SIZE_ESTIMATE(zfsvfs)); 1810 } 1811 } 1812 if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) 1813 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1814 0, SPA_MAXBLOCKSIZE); 1815 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1816 if (error) { 1817 zfs_dirent_unlock(dl); 1818 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1819 dmu_tx_wait(tx); 1820 dmu_tx_abort(tx); 1821 goto top; 1822 } 1823 dmu_tx_abort(tx); 1824 ZFS_EXIT(zfsvfs); 1825 if (aclp) 1826 zfs_acl_free(aclp); 1827 return (error); 1828 } 1829 1830 /* 1831 * Create new node. 1832 */ 1833 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); 1834 1835 if (aclp) 1836 zfs_acl_free(aclp); 1837 1838 /* 1839 * Now put new name in parent dir. 
1840 */ 1841 (void) zfs_link_create(dl, zp, tx, ZNEW); 1842 1843 *vpp = ZTOV(zp); 1844 1845 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 1846 if (flags & FIGNORECASE) 1847 txtype |= TX_CI; 1848 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); 1849 1850 if (fuidp) 1851 zfs_fuid_info_free(fuidp); 1852 dmu_tx_commit(tx); 1853 1854 zfs_dirent_unlock(dl); 1855 1856 ZFS_EXIT(zfsvfs); 1857 return (0); 1858} 1859 1860/* 1861 * Remove a directory subdir entry. If the current working 1862 * directory is the same as the subdir to be removed, the 1863 * remove will fail. 1864 * 1865 * IN: dvp - vnode of directory to remove from. 1866 * name - name of directory to be removed. 1867 * cwd - vnode of current working directory. 1868 * cr - credentials of caller. 1869 * ct - caller context 1870 * flags - case flags 1871 * 1872 * RETURN: 0 if success 1873 * error code if failure 1874 * 1875 * Timestamps: 1876 * dvp - ctime|mtime updated 1877 */ 1878/*ARGSUSED*/ 1879static int 1880zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, 1881 caller_context_t *ct, int flags) 1882{ 1883 znode_t *dzp = VTOZ(dvp); 1884 znode_t *zp; 1885 vnode_t *vp; 1886 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1887 zilog_t *zilog; 1888 zfs_dirlock_t *dl; 1889 dmu_tx_t *tx; 1890 int error; 1891 int zflg = ZEXISTS; 1892 1893 ZFS_ENTER(zfsvfs); 1894 ZFS_VERIFY_ZP(dzp); 1895 zilog = zfsvfs->z_log; 1896 1897 if (flags & FIGNORECASE) 1898 zflg |= ZCILOOK; 1899top: 1900 zp = NULL; 1901 1902 /* 1903 * Attempt to lock directory; fail if entry doesn't exist. 
1904 */ 1905 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1906 NULL, NULL)) { 1907 ZFS_EXIT(zfsvfs); 1908 return (error); 1909 } 1910 1911 vp = ZTOV(zp); 1912 1913 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1914 goto out; 1915 } 1916 1917 if (vp->v_type != VDIR) { 1918 error = ENOTDIR; 1919 goto out; 1920 } 1921 1922 if (vp == cwd) { 1923 error = EINVAL; 1924 goto out; 1925 } 1926 1927 vnevent_rmdir(vp, dvp, name, ct); 1928 1929 /* 1930 * Grab a lock on the directory to make sure that noone is 1931 * trying to add (or lookup) entries while we are removing it. 1932 */ 1933 rw_enter(&zp->z_name_lock, RW_WRITER); 1934 1935 /* 1936 * Grab a lock on the parent pointer to make sure we play well 1937 * with the treewalk and directory rename code. 1938 */ 1939 rw_enter(&zp->z_parent_lock, RW_WRITER); 1940 1941 tx = dmu_tx_create(zfsvfs->z_os); 1942 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1943 dmu_tx_hold_bonus(tx, zp->z_id); 1944 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1945 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1946 if (error) { 1947 rw_exit(&zp->z_parent_lock); 1948 rw_exit(&zp->z_name_lock); 1949 zfs_dirent_unlock(dl); 1950 VN_RELE(vp); 1951 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1952 dmu_tx_wait(tx); 1953 dmu_tx_abort(tx); 1954 goto top; 1955 } 1956 dmu_tx_abort(tx); 1957 ZFS_EXIT(zfsvfs); 1958 return (error); 1959 } 1960 1961#ifdef FREEBSD_NAMECACHE 1962 cache_purge(dvp); 1963#endif 1964 1965 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 1966 1967 if (error == 0) { 1968 uint64_t txtype = TX_RMDIR; 1969 if (flags & FIGNORECASE) 1970 txtype |= TX_CI; 1971 zfs_log_remove(zilog, tx, txtype, dzp, name); 1972 } 1973 1974 dmu_tx_commit(tx); 1975 1976 rw_exit(&zp->z_parent_lock); 1977 rw_exit(&zp->z_name_lock); 1978#ifdef FREEBSD_NAMECACHE 1979 cache_purge(vp); 1980#endif 1981out: 1982 zfs_dirent_unlock(dl); 1983 1984 VN_RELE(vp); 1985 1986 ZFS_EXIT(zfsvfs); 1987 return (error); 1988} 1989 1990/* 1991 * Read as 
many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 *	IN:	vp	- vnode of directory to read.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *		eofp	- set to true if end-of-file detected; may be NULL.
 *		ncookies - if not NULL, receives the number of cookies
 *			  stored in *cookies.
 *		cookies	- receives an array allocated from M_TEMP holding,
 *			  for each entry returned, the offset of the entry
 *			  that follows it.  Freed and set to NULL here if
 *			  an error is returned.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 *
 * "flags" is a local fixed at 0 in this signature, so the extended
 * (edirent) entry format below is never selected.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;
	uint8_t		type;
	int		ncooks;
	u_long		*cooks = NULL;
	int		flags = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	/*
	 * A single kernel-space iovec is filled in place; anything else is
	 * staged in outbuf and copied out with uiomove() after the loop.
	 */
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 */
		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
		*cookies = cooks;
		*ncookies = ncooks;
	}
	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
#ifdef TODO
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);
#else
	check_sysattrs = 0;
#endif

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_phys->zp_parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				/* ENOENT from the cursor means end of directory. */
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = ENXIO;
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);

			if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
#else
				panic("%s:%u: TODO", __func__, __LINE__);
#endif
			}
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = EINVAL;
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			odp->d_namlen = strlen(zap.za_name);
			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
			odp->d_type = type;
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0);

		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		/* Record the offset of the entry that follows this one. */
		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	/* On failure hand no cookie array back to the caller. */
	if (error != 0 && cookies != NULL) {
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}

/* Value stored in the zfs_fsyncer_key thread-specific slot by zfs_fsync(). */
ulong_t zfs_fsync_sync_cnt = 4;

/*
 * Commit a file's intent-log records.
 *
 * Stores zfs_fsync_sync_cnt in the zfs_fsyncer_key thread-specific slot,
 * then calls zil_commit() with the znode's last itx and object id.
 * syncflag, cr and ct are not used.  Always returns 0.
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (0);
}


/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 *	IN:	vp	- vnode of file.
 *		vap	- va_mask identifies requested attributes.
 *			  If AT_XVATTR set, then optional attrs are requested
 *		flags	- ATTR_NOACLCHECK (CIFS server context)
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	vap	- attribute values.
 *
 *	RETURN:	0 if success
 *		error from the ACE_READ_ATTRIBUTES access check, which is
 *		    made only when the ACL is not trivial and the caller
 *		    is not the owner
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_phys_t *pzp;
	int	error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t links;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	pzp = zp->z_phys;

	mutex_enter(&zp->z_lock);

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
	    (pzp->zp_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			mutex_exit(&zp->z_lock);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	vap->va_type = IFTOVT(pzp->zp_mode);
	vap->va_mode = pzp->zp_mode & ~S_IFMT;
	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
//	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
	vap->va_nodeid = zp->z_id;
	/* Report one extra link on a root vnode when zfs_show_ctldir() is true. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = pzp->zp_links + 1;
	else
		links = pzp->zp_links;
	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
	vap->va_size = pzp->zp_size;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((pzp->zp_flags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((pzp->zp_flags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((pzp->zp_flags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((pzp->zp_flags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((pzp->zp_flags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG &&
		    (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			/*
			 * Only VREG files have anti-virus scanstamps, so we
			 * won't conflict with symlinks in the bonus buffer.
			 */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			if (len <= doi.doi_bonus_size) {
				/*
				 * pzp points to the start of the
				 * znode_phys_t. pzp + 1 points to the
				 * first byte after the znode_phys_t.
				 */
				(void) memcpy(xoap->xoa_av_scanstamp,
				    pzp + 1,
				    sizeof (xoap->xoa_av_scanstamp));
				XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
			ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
			XVA_SET_RTN(xvap, XAT_CREATETIME);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);

	mutex_exit(&zp->z_lock);

	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 *	IN:	vp	- vnode of file to be modified.
 *		vap	- new attribute values.
 *			  If AT_XVATTR set, then optional attrs are being set
 *		flags	- ATTR_UTIME set if non-default time values provided.
 *			- ATTR_NOACLCHECK (CIFS context only).
 *		cr	- credentials of caller.
2520 * ct - caller context 2521 * 2522 * RETURN: 0 if success 2523 * error code if failure 2524 * 2525 * Timestamps: 2526 * vp - ctime updated, mtime updated if size changed. 2527 */ 2528/* ARGSUSED */ 2529static int 2530zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2531 caller_context_t *ct) 2532{ 2533 znode_t *zp = VTOZ(vp); 2534 znode_phys_t *pzp; 2535 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2536 zilog_t *zilog; 2537 dmu_tx_t *tx; 2538 vattr_t oldva; 2539 uint_t mask = vap->va_mask; 2540 uint_t saved_mask; 2541 uint64_t saved_mode; 2542 int trim_mask = 0; 2543 uint64_t new_mode; 2544 znode_t *attrzp; 2545 int need_policy = FALSE; 2546 int err; 2547 zfs_fuid_info_t *fuidp = NULL; 2548 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2549 xoptattr_t *xoap; 2550 zfs_acl_t *aclp = NULL; 2551 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2552 2553 if (mask == 0) 2554 return (0); 2555 2556 if (mask & AT_NOSET) 2557 return (EINVAL); 2558 2559 ZFS_ENTER(zfsvfs); 2560 ZFS_VERIFY_ZP(zp); 2561 2562 pzp = zp->z_phys; 2563 zilog = zfsvfs->z_log; 2564 2565 /* 2566 * Make sure that if we have ephemeral uid/gid or xvattr specified 2567 * that file system is at proper version level 2568 */ 2569 2570 if (zfsvfs->z_use_fuids == B_FALSE && 2571 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 2572 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || 2573 (mask & AT_XVATTR))) { 2574 ZFS_EXIT(zfsvfs); 2575 return (EINVAL); 2576 } 2577 2578 if (mask & AT_SIZE && vp->v_type == VDIR) { 2579 ZFS_EXIT(zfsvfs); 2580 return (EISDIR); 2581 } 2582 2583 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { 2584 ZFS_EXIT(zfsvfs); 2585 return (EINVAL); 2586 } 2587 2588 /* 2589 * If this is an xvattr_t, then get a pointer to the structure of 2590 * optional attributes. If this is NULL, then we have a vattr_t. 
2591 */ 2592 xoap = xva_getxoptattr(xvap); 2593 2594 /* 2595 * Immutable files can only alter immutable bit and atime 2596 */ 2597 if ((pzp->zp_flags & ZFS_IMMUTABLE) && 2598 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || 2599 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 2600 ZFS_EXIT(zfsvfs); 2601 return (EPERM); 2602 } 2603 2604 if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) { 2605 ZFS_EXIT(zfsvfs); 2606 return (EPERM); 2607 } 2608 2609 /* 2610 * Verify timestamps doesn't overflow 32 bits. 2611 * ZFS can handle large timestamps, but 32bit syscalls can't 2612 * handle times greater than 2039. This check should be removed 2613 * once large timestamps are fully supported. 2614 */ 2615 if (mask & (AT_ATIME | AT_MTIME)) { 2616 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2617 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2618 ZFS_EXIT(zfsvfs); 2619 return (EOVERFLOW); 2620 } 2621 } 2622 2623top: 2624 attrzp = NULL; 2625 2626 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 2627 ZFS_EXIT(zfsvfs); 2628 return (EROFS); 2629 } 2630 2631 /* 2632 * First validate permissions 2633 */ 2634 2635 if (mask & AT_SIZE) { 2636 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); 2637 if (err) { 2638 ZFS_EXIT(zfsvfs); 2639 return (err); 2640 } 2641 /* 2642 * XXX - Note, we are not providing any open 2643 * mode flags here (like FNDELAY), so we may 2644 * block if there are locks present... this 2645 * should be addressed in openat(). 2646 */ 2647 /* XXX - would it be OK to generate a log record here? 
*/ 2648 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2649 if (err) { 2650 ZFS_EXIT(zfsvfs); 2651 return (err); 2652 } 2653 } 2654 2655 if (mask & (AT_ATIME|AT_MTIME) || 2656 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2657 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2658 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2659 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2660 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) 2661 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2662 skipaclchk, cr); 2663 2664 if (mask & (AT_UID|AT_GID)) { 2665 int idmask = (mask & (AT_UID|AT_GID)); 2666 int take_owner; 2667 int take_group; 2668 2669 /* 2670 * NOTE: even if a new mode is being set, 2671 * we may clear S_ISUID/S_ISGID bits. 2672 */ 2673 2674 if (!(mask & AT_MODE)) 2675 vap->va_mode = pzp->zp_mode; 2676 2677 /* 2678 * Take ownership or chgrp to group we are a member of 2679 */ 2680 2681 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 2682 take_group = (mask & AT_GID) && 2683 zfs_groupmember(zfsvfs, vap->va_gid, cr); 2684 2685 /* 2686 * If both AT_UID and AT_GID are set then take_owner and 2687 * take_group must both be set in order to allow taking 2688 * ownership. 
2689 * 2690 * Otherwise, send the check through secpolicy_vnode_setattr() 2691 * 2692 */ 2693 2694 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 2695 ((idmask == AT_UID) && take_owner) || 2696 ((idmask == AT_GID) && take_group)) { 2697 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2698 skipaclchk, cr) == 0) { 2699 /* 2700 * Remove setuid/setgid for non-privileged users 2701 */ 2702 secpolicy_setid_clear(vap, vp, cr); 2703 trim_mask = (mask & (AT_UID|AT_GID)); 2704 } else { 2705 need_policy = TRUE; 2706 } 2707 } else { 2708 need_policy = TRUE; 2709 } 2710 } 2711 2712 mutex_enter(&zp->z_lock); 2713 oldva.va_mode = pzp->zp_mode; 2714 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2715 if (mask & AT_XVATTR) { 2716 if ((need_policy == FALSE) && 2717 (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && 2718 xoap->xoa_appendonly != 2719 ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) || 2720 (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && 2721 xoap->xoa_nounlink != 2722 ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) || 2723 (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && 2724 xoap->xoa_immutable != 2725 ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) || 2726 (XVA_ISSET_REQ(xvap, XAT_NODUMP) && 2727 xoap->xoa_nodump != 2728 ((pzp->zp_flags & ZFS_NODUMP) != 0)) || 2729 (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && 2730 xoap->xoa_av_modified != 2731 ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) || 2732 ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) && 2733 ((vp->v_type != VREG && xoap->xoa_av_quarantined) || 2734 xoap->xoa_av_quarantined != 2735 ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) || 2736 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || 2737 (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2738 need_policy = TRUE; 2739 } 2740 } 2741 2742 mutex_exit(&zp->z_lock); 2743 2744 if (mask & AT_MODE) { 2745 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 2746 err = secpolicy_setid_setsticky_clear(vp, vap, 2747 &oldva, cr); 2748 if (err) { 2749 ZFS_EXIT(zfsvfs); 2750 return (err); 2751 } 2752 trim_mask |= AT_MODE; 2753 } 
else { 2754 need_policy = TRUE; 2755 } 2756 } 2757 2758 if (need_policy) { 2759 /* 2760 * If trim_mask is set then take ownership 2761 * has been granted or write_acl is present and user 2762 * has the ability to modify mode. In that case remove 2763 * UID|GID and or MODE from mask so that 2764 * secpolicy_vnode_setattr() doesn't revoke it. 2765 */ 2766 2767 if (trim_mask) { 2768 saved_mask = vap->va_mask; 2769 vap->va_mask &= ~trim_mask; 2770 if (trim_mask & AT_MODE) { 2771 /* 2772 * Save the mode, as secpolicy_vnode_setattr() 2773 * will overwrite it with ova.va_mode. 2774 */ 2775 saved_mode = vap->va_mode; 2776 } 2777 } 2778 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 2779 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 2780 if (err) { 2781 ZFS_EXIT(zfsvfs); 2782 return (err); 2783 } 2784 2785 if (trim_mask) { 2786 vap->va_mask |= saved_mask; 2787 if (trim_mask & AT_MODE) { 2788 /* 2789 * Recover the mode after 2790 * secpolicy_vnode_setattr(). 2791 */ 2792 vap->va_mode = saved_mode; 2793 } 2794 } 2795 } 2796 2797 /* 2798 * secpolicy_vnode_setattr, or take ownership may have 2799 * changed va_mask 2800 */ 2801 mask = vap->va_mask; 2802 2803 tx = dmu_tx_create(zfsvfs->z_os); 2804 dmu_tx_hold_bonus(tx, zp->z_id); 2805 if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 2806 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) { 2807 if (zfsvfs->z_fuid_obj == 0) { 2808 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 2809 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 2810 FUID_SIZE_ESTIMATE(zfsvfs)); 2811 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); 2812 } else { 2813 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 2814 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 2815 FUID_SIZE_ESTIMATE(zfsvfs)); 2816 } 2817 } 2818 2819 if (mask & AT_MODE) { 2820 uint64_t pmode = pzp->zp_mode; 2821 2822 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 2823 2824 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { 2825 dmu_tx_abort(tx); 2826 ZFS_EXIT(zfsvfs); 2827 
return (err); 2828 } 2829 if (pzp->zp_acl.z_acl_extern_obj) { 2830 /* Are we upgrading ACL from old V0 format to new V1 */ 2831 if (zfsvfs->z_version <= ZPL_VERSION_FUID && 2832 pzp->zp_acl.z_acl_version == 2833 ZFS_ACL_VERSION_INITIAL) { 2834 dmu_tx_hold_free(tx, 2835 pzp->zp_acl.z_acl_extern_obj, 0, 2836 DMU_OBJECT_END); 2837 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2838 0, aclp->z_acl_bytes); 2839 } else { 2840 dmu_tx_hold_write(tx, 2841 pzp->zp_acl.z_acl_extern_obj, 0, 2842 aclp->z_acl_bytes); 2843 } 2844 } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2845 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2846 0, aclp->z_acl_bytes); 2847 } 2848 } 2849 2850 if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) { 2851 err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); 2852 if (err) { 2853 dmu_tx_abort(tx); 2854 ZFS_EXIT(zfsvfs); 2855 if (aclp) 2856 zfs_acl_free(aclp); 2857 return (err); 2858 } 2859 dmu_tx_hold_bonus(tx, attrzp->z_id); 2860 } 2861 2862 err = dmu_tx_assign(tx, zfsvfs->z_assign); 2863 if (err) { 2864 if (attrzp) 2865 VN_RELE(ZTOV(attrzp)); 2866 2867 if (aclp) { 2868 zfs_acl_free(aclp); 2869 aclp = NULL; 2870 } 2871 2872 if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 2873 dmu_tx_wait(tx); 2874 dmu_tx_abort(tx); 2875 goto top; 2876 } 2877 dmu_tx_abort(tx); 2878 ZFS_EXIT(zfsvfs); 2879 return (err); 2880 } 2881 2882 dmu_buf_will_dirty(zp->z_dbuf, tx); 2883 2884 /* 2885 * Set each attribute requested. 2886 * We group settings according to the locks they need to acquire. 2887 * 2888 * Note: you cannot set ctime directly, although it will be 2889 * updated as a side-effect of calling this function. 
2890 */ 2891 2892 mutex_enter(&zp->z_lock); 2893 2894 if (mask & AT_MODE) { 2895 mutex_enter(&zp->z_acl_lock); 2896 zp->z_phys->zp_mode = new_mode; 2897 err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); 2898 ASSERT3U(err, ==, 0); 2899 mutex_exit(&zp->z_acl_lock); 2900 } 2901 2902 if (attrzp) 2903 mutex_enter(&attrzp->z_lock); 2904 2905 if (mask & AT_UID) { 2906 pzp->zp_uid = zfs_fuid_create(zfsvfs, 2907 vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); 2908 if (attrzp) { 2909 attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs, 2910 vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); 2911 } 2912 } 2913 2914 if (mask & AT_GID) { 2915 pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid, 2916 cr, ZFS_GROUP, tx, &fuidp); 2917 if (attrzp) 2918 attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs, 2919 vap->va_gid, cr, ZFS_GROUP, tx, &fuidp); 2920 } 2921 2922 if (aclp) 2923 zfs_acl_free(aclp); 2924 2925 if (attrzp) 2926 mutex_exit(&attrzp->z_lock); 2927 2928 if (mask & AT_ATIME) 2929 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); 2930 2931 if (mask & AT_MTIME) 2932 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); 2933 2934 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ 2935 if (mask & AT_SIZE) 2936 zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); 2937 else if (mask != 0) 2938 zfs_time_stamper_locked(zp, STATE_CHANGED, tx); 2939 /* 2940 * Do this after setting timestamps to prevent timestamp 2941 * update from toggling bit 2942 */ 2943 2944 if (xoap && (mask & AT_XVATTR)) { 2945 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 2946 size_t len; 2947 dmu_object_info_t doi; 2948 2949 ASSERT(vp->v_type == VREG); 2950 2951 /* Grow the bonus buffer if necessary. 
*/ 2952 dmu_object_info_from_db(zp->z_dbuf, &doi); 2953 len = sizeof (xoap->xoa_av_scanstamp) + 2954 sizeof (znode_phys_t); 2955 if (len > doi.doi_bonus_size) 2956 VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0); 2957 } 2958 zfs_xvattr_set(zp, xvap); 2959 } 2960 2961 if (mask != 0) 2962 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 2963 2964 if (fuidp) 2965 zfs_fuid_info_free(fuidp); 2966 mutex_exit(&zp->z_lock); 2967 2968 if (attrzp) 2969 VN_RELE(ZTOV(attrzp)); 2970 2971 dmu_tx_commit(tx); 2972 2973 ZFS_EXIT(zfsvfs); 2974 return (err); 2975} 2976 2977typedef struct zfs_zlock { 2978 krwlock_t *zl_rwlock; /* lock we acquired */ 2979 znode_t *zl_znode; /* znode we held */ 2980 struct zfs_zlock *zl_next; /* next in list */ 2981} zfs_zlock_t; 2982 2983/* 2984 * Drop locks and release vnodes that were held by zfs_rename_lock(). 2985 */ 2986static void 2987zfs_rename_unlock(zfs_zlock_t **zlpp) 2988{ 2989 zfs_zlock_t *zl; 2990 2991 while ((zl = *zlpp) != NULL) { 2992 if (zl->zl_znode != NULL) 2993 VN_RELE(ZTOV(zl->zl_znode)); 2994 rw_exit(zl->zl_rwlock); 2995 *zlpp = zl->zl_next; 2996 kmem_free(zl, sizeof (*zl)); 2997 } 2998} 2999 3000/* 3001 * Search back through the directory tree, using the ".." entries. 3002 * Lock each directory in the chain to prevent concurrent renames. 3003 * Fail any attempt to move a directory into one of its own descendants. 3004 * XXX - z_parent_lock can overlap with map or grow locks 3005 */ 3006static int 3007zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 3008{ 3009 zfs_zlock_t *zl; 3010 znode_t *zp = tdzp; 3011 uint64_t rootid = zp->z_zfsvfs->z_root; 3012 uint64_t *oidp = &zp->z_id; 3013 krwlock_t *rwlp = &szp->z_parent_lock; 3014 krw_t rw = RW_WRITER; 3015 3016 /* 3017 * First pass write-locks szp and compares to zp->z_id. 3018 * Later passes read-lock zp and compare to zp->z_parent. 3019 */ 3020 do { 3021 if (!rw_tryenter(rwlp, rw)) { 3022 /* 3023 * Another thread is renaming in this path. 
3024 * Note that if we are a WRITER, we don't have any 3025 * parent_locks held yet. 3026 */ 3027 if (rw == RW_READER && zp->z_id > szp->z_id) { 3028 /* 3029 * Drop our locks and restart 3030 */ 3031 zfs_rename_unlock(&zl); 3032 *zlpp = NULL; 3033 zp = tdzp; 3034 oidp = &zp->z_id; 3035 rwlp = &szp->z_parent_lock; 3036 rw = RW_WRITER; 3037 continue; 3038 } else { 3039 /* 3040 * Wait for other thread to drop its locks 3041 */ 3042 rw_enter(rwlp, rw); 3043 } 3044 } 3045 3046 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 3047 zl->zl_rwlock = rwlp; 3048 zl->zl_znode = NULL; 3049 zl->zl_next = *zlpp; 3050 *zlpp = zl; 3051 3052 if (*oidp == szp->z_id) /* We're a descendant of szp */ 3053 return (EINVAL); 3054 3055 if (*oidp == rootid) /* We've hit the top */ 3056 return (0); 3057 3058 if (rw == RW_READER) { /* i.e. not the first pass */ 3059 int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); 3060 if (error) 3061 return (error); 3062 zl->zl_znode = zp; 3063 } 3064 oidp = &zp->z_phys->zp_parent; 3065 rwlp = &zp->z_parent_lock; 3066 rw = RW_READER; 3067 3068 } while (zp->z_id != sdzp->z_id); 3069 3070 return (0); 3071} 3072 3073/* 3074 * Move an entry from the provided source directory to the target 3075 * directory. Change the entry name as indicated. 3076 * 3077 * IN: sdvp - Source directory containing the "old entry". 3078 * snm - Old entry name. 3079 * tdvp - Target directory to contain the "new entry". 3080 * tnm - New entry name. 3081 * cr - credentials of caller. 
3082 * ct - caller context 3083 * flags - case flags 3084 * 3085 * RETURN: 0 if success 3086 * error code if failure 3087 * 3088 * Timestamps: 3089 * sdvp,tdvp - ctime|mtime updated 3090 */ 3091/*ARGSUSED*/ 3092static int 3093zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, 3094 caller_context_t *ct, int flags) 3095{ 3096 znode_t *tdzp, *szp, *tzp; 3097 znode_t *sdzp = VTOZ(sdvp); 3098 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; 3099 zilog_t *zilog; 3100 vnode_t *realvp; 3101 zfs_dirlock_t *sdl, *tdl; 3102 dmu_tx_t *tx; 3103 zfs_zlock_t *zl; 3104 int cmp, serr, terr; 3105 int error = 0; 3106 int zflg = 0; 3107 3108 ZFS_ENTER(zfsvfs); 3109 ZFS_VERIFY_ZP(sdzp); 3110 zilog = zfsvfs->z_log; 3111 3112 /* 3113 * Make sure we have the real vp for the target directory. 3114 */ 3115 if (VOP_REALVP(tdvp, &realvp, ct) == 0) 3116 tdvp = realvp; 3117 3118 if (tdvp->v_vfsp != sdvp->v_vfsp) { 3119 ZFS_EXIT(zfsvfs); 3120 return (EXDEV); 3121 } 3122 3123 tdzp = VTOZ(tdvp); 3124 ZFS_VERIFY_ZP(tdzp); 3125 if (zfsvfs->z_utf8 && u8_validate(tnm, 3126 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3127 ZFS_EXIT(zfsvfs); 3128 return (EILSEQ); 3129 } 3130 3131 if (flags & FIGNORECASE) 3132 zflg |= ZCILOOK; 3133 3134top: 3135 szp = NULL; 3136 tzp = NULL; 3137 zl = NULL; 3138 3139 /* 3140 * This is to prevent the creation of links into attribute space 3141 * by renaming a linked file into/outof an attribute directory. 3142 * See the comment in zfs_link() for why this is considered bad. 3143 */ 3144 if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != 3145 (sdzp->z_phys->zp_flags & ZFS_XATTR)) { 3146 ZFS_EXIT(zfsvfs); 3147 return (EINVAL); 3148 } 3149 3150 /* 3151 * Lock source and target directory entries. To prevent deadlock, 3152 * a lock ordering must be defined. We lock the directory with 3153 * the smallest object id first, or if it's a tie, the one with 3154 * the lexically first name. 
3155 */ 3156 if (sdzp->z_id < tdzp->z_id) { 3157 cmp = -1; 3158 } else if (sdzp->z_id > tdzp->z_id) { 3159 cmp = 1; 3160 } else { 3161 /* 3162 * First compare the two name arguments without 3163 * considering any case folding. 3164 */ 3165 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 3166 3167 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 3168 ASSERT(error == 0 || !zfsvfs->z_utf8); 3169 if (cmp == 0) { 3170 /* 3171 * POSIX: "If the old argument and the new argument 3172 * both refer to links to the same existing file, 3173 * the rename() function shall return successfully 3174 * and perform no other action." 3175 */ 3176 ZFS_EXIT(zfsvfs); 3177 return (0); 3178 } 3179 /* 3180 * If the file system is case-folding, then we may 3181 * have some more checking to do. A case-folding file 3182 * system is either supporting mixed case sensitivity 3183 * access or is completely case-insensitive. Note 3184 * that the file system is always case preserving. 3185 * 3186 * In mixed sensitivity mode case sensitive behavior 3187 * is the default. FIGNORECASE must be used to 3188 * explicitly request case insensitive behavior. 3189 * 3190 * If the source and target names provided differ only 3191 * by case (e.g., a request to rename 'tim' to 'Tim'), 3192 * we will treat this as a special case in the 3193 * case-insensitive mode: as long as the source name 3194 * is an exact match, we will allow this to proceed as 3195 * a name-change request. 
3196 */ 3197 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 3198 (zfsvfs->z_case == ZFS_CASE_MIXED && 3199 flags & FIGNORECASE)) && 3200 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 3201 &error) == 0) { 3202 /* 3203 * case preserving rename request, require exact 3204 * name matches 3205 */ 3206 zflg |= ZCIEXACT; 3207 zflg &= ~ZCILOOK; 3208 } 3209 } 3210 3211 if (cmp < 0) { 3212 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 3213 ZEXISTS | zflg, NULL, NULL); 3214 terr = zfs_dirent_lock(&tdl, 3215 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 3216 } else { 3217 terr = zfs_dirent_lock(&tdl, 3218 tdzp, tnm, &tzp, zflg, NULL, NULL); 3219 serr = zfs_dirent_lock(&sdl, 3220 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 3221 NULL, NULL); 3222 } 3223 3224 if (serr) { 3225 /* 3226 * Source entry invalid or not there. 3227 */ 3228 if (!terr) { 3229 zfs_dirent_unlock(tdl); 3230 if (tzp) 3231 VN_RELE(ZTOV(tzp)); 3232 } 3233 if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) 3234 serr = EINVAL; 3235 ZFS_EXIT(zfsvfs); 3236 return (serr); 3237 } 3238 if (terr) { 3239 zfs_dirent_unlock(sdl); 3240 VN_RELE(ZTOV(szp)); 3241 if (strcmp(tnm, "..") == 0) 3242 terr = EINVAL; 3243 ZFS_EXIT(zfsvfs); 3244 return (terr); 3245 } 3246 3247 /* 3248 * Must have write access at the source to remove the old entry 3249 * and write access at the target to create the new entry. 3250 * Note that if target and source are the same, this can be 3251 * done in a single check. 3252 */ 3253 3254 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 3255 goto out; 3256 3257 if (ZTOV(szp)->v_type == VDIR) { 3258 /* 3259 * Check to make sure rename is valid. 3260 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 3261 */ 3262 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) 3263 goto out; 3264 } 3265 3266 /* 3267 * Does target exist? 3268 */ 3269 if (tzp) { 3270 /* 3271 * Source and target must be the same type. 
3272 */ 3273 if (ZTOV(szp)->v_type == VDIR) { 3274 if (ZTOV(tzp)->v_type != VDIR) { 3275 error = ENOTDIR; 3276 goto out; 3277 } 3278 } else { 3279 if (ZTOV(tzp)->v_type == VDIR) { 3280 error = EISDIR; 3281 goto out; 3282 } 3283 } 3284 /* 3285 * POSIX dictates that when the source and target 3286 * entries refer to the same file object, rename 3287 * must do nothing and exit without error. 3288 */ 3289 if (szp->z_id == tzp->z_id) { 3290 error = 0; 3291 goto out; 3292 } 3293 } 3294 3295 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); 3296 if (tzp) 3297 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); 3298 3299 /* 3300 * notify the target directory if it is not the same 3301 * as source directory. 3302 */ 3303 if (tdvp != sdvp) { 3304 vnevent_rename_dest_dir(tdvp, ct); 3305 } 3306 3307 tx = dmu_tx_create(zfsvfs->z_os); 3308 dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ 3309 dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ 3310 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 3311 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 3312 if (sdzp != tdzp) 3313 dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ 3314 if (tzp) 3315 dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ 3316 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3317 error = dmu_tx_assign(tx, zfsvfs->z_assign); 3318 if (error) { 3319 if (zl != NULL) 3320 zfs_rename_unlock(&zl); 3321 zfs_dirent_unlock(sdl); 3322 zfs_dirent_unlock(tdl); 3323 VN_RELE(ZTOV(szp)); 3324 if (tzp) 3325 VN_RELE(ZTOV(tzp)); 3326 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 3327 dmu_tx_wait(tx); 3328 dmu_tx_abort(tx); 3329 goto top; 3330 } 3331 dmu_tx_abort(tx); 3332 ZFS_EXIT(zfsvfs); 3333 return (error); 3334 } 3335 3336 if (tzp) /* Attempt to remove the existing target */ 3337 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); 3338 3339 if (error == 0) { 3340 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3341 if (error == 0) { 3342 szp->z_phys->zp_flags |= ZFS_AV_MODIFIED; 3343 3344 error = 
zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3345 ASSERT(error == 0); 3346 3347 zfs_log_rename(zilog, tx, 3348 TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), 3349 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); 3350 3351 /* Update path information for the target vnode */ 3352 vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm)); 3353 } 3354#ifdef FREEBSD_NAMECACHE 3355 if (error == 0) { 3356 cache_purge(sdvp); 3357 cache_purge(tdvp); 3358 } 3359#endif 3360 } 3361 3362 dmu_tx_commit(tx); 3363out: 3364 if (zl != NULL) 3365 zfs_rename_unlock(&zl); 3366 3367 zfs_dirent_unlock(sdl); 3368 zfs_dirent_unlock(tdl); 3369 3370 VN_RELE(ZTOV(szp)); 3371 if (tzp) 3372 VN_RELE(ZTOV(tzp)); 3373 3374 ZFS_EXIT(zfsvfs); 3375 3376 return (error); 3377} 3378 3379/* 3380 * Insert the indicated symbolic reference entry into the directory. 3381 * 3382 * IN: dvp - Directory to contain new symbolic link. 3383 * link - Name for new symlink entry. 3384 * vap - Attributes of new entry. 3385 * target - Target path of new symlink. 3386 * cr - credentials of caller. 
3387 * ct - caller context 3388 * flags - case flags 3389 * 3390 * RETURN: 0 if success 3391 * error code if failure 3392 * 3393 * Timestamps: 3394 * dvp - ctime|mtime updated 3395 */ 3396/*ARGSUSED*/ 3397static int 3398zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, 3399 cred_t *cr, kthread_t *td) 3400{ 3401 znode_t *zp, *dzp = VTOZ(dvp); 3402 zfs_dirlock_t *dl; 3403 dmu_tx_t *tx; 3404 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3405 zilog_t *zilog; 3406 int len = strlen(link); 3407 int error; 3408 int zflg = ZNEW; 3409 zfs_fuid_info_t *fuidp = NULL; 3410 int flags = 0; 3411 3412 ASSERT(vap->va_type == VLNK); 3413 3414 ZFS_ENTER(zfsvfs); 3415 ZFS_VERIFY_ZP(dzp); 3416 zilog = zfsvfs->z_log; 3417 3418 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 3419 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3420 ZFS_EXIT(zfsvfs); 3421 return (EILSEQ); 3422 } 3423 if (flags & FIGNORECASE) 3424 zflg |= ZCILOOK; 3425top: 3426 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 3427 ZFS_EXIT(zfsvfs); 3428 return (error); 3429 } 3430 3431 if (len > MAXPATHLEN) { 3432 ZFS_EXIT(zfsvfs); 3433 return (ENAMETOOLONG); 3434 } 3435 3436 /* 3437 * Attempt to lock directory; fail if entry already exists. 
3438 */ 3439 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 3440 if (error) { 3441 ZFS_EXIT(zfsvfs); 3442 return (error); 3443 } 3444 3445 tx = dmu_tx_create(zfsvfs->z_os); 3446 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 3447 dmu_tx_hold_bonus(tx, dzp->z_id); 3448 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3449 if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) 3450 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); 3451 if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { 3452 if (zfsvfs->z_fuid_obj == 0) { 3453 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 3454 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 3455 FUID_SIZE_ESTIMATE(zfsvfs)); 3456 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); 3457 } else { 3458 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 3459 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 3460 FUID_SIZE_ESTIMATE(zfsvfs)); 3461 } 3462 } 3463 error = dmu_tx_assign(tx, zfsvfs->z_assign); 3464 if (error) { 3465 zfs_dirent_unlock(dl); 3466 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 3467 dmu_tx_wait(tx); 3468 dmu_tx_abort(tx); 3469 goto top; 3470 } 3471 dmu_tx_abort(tx); 3472 ZFS_EXIT(zfsvfs); 3473 return (error); 3474 } 3475 3476 dmu_buf_will_dirty(dzp->z_dbuf, tx); 3477 3478 /* 3479 * Create a new object for the symlink. 3480 * Put the link content into bonus buffer if it will fit; 3481 * otherwise, store it just like any other file data. 3482 */ 3483 if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { 3484 zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp); 3485 if (len != 0) 3486 bcopy(link, zp->z_phys + 1, len); 3487 } else { 3488 dmu_buf_t *dbp; 3489 3490 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp); 3491 /* 3492 * Nothing can access the znode yet so no locking needed 3493 * for growing the znode's blocksize. 
3494 */ 3495 zfs_grow_blocksize(zp, len, tx); 3496 3497 VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, 3498 zp->z_id, 0, FTAG, &dbp)); 3499 dmu_buf_will_dirty(dbp, tx); 3500 3501 ASSERT3U(len, <=, dbp->db_size); 3502 bcopy(link, dbp->db_data, len); 3503 dmu_buf_rele(dbp, FTAG); 3504 } 3505 zp->z_phys->zp_size = len; 3506 3507 /* 3508 * Insert the new object into the directory. 3509 */ 3510 (void) zfs_link_create(dl, zp, tx, ZNEW); 3511out: 3512 if (error == 0) { 3513 uint64_t txtype = TX_SYMLINK; 3514 if (flags & FIGNORECASE) 3515 txtype |= TX_CI; 3516 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 3517 *vpp = ZTOV(zp); 3518 } 3519 if (fuidp) 3520 zfs_fuid_info_free(fuidp); 3521 3522 dmu_tx_commit(tx); 3523 3524 zfs_dirent_unlock(dl); 3525 3526 ZFS_EXIT(zfsvfs); 3527 return (error); 3528} 3529 3530/* 3531 * Return, in the buffer contained in the provided uio structure, 3532 * the symbolic path referred to by vp. 3533 * 3534 * IN: vp - vnode of symbolic link. 3535 * uoip - structure to contain the link path. 3536 * cr - credentials of caller. 3537 * ct - caller context 3538 * 3539 * OUT: uio - structure to contain the link path. 
3540 * 3541 * RETURN: 0 if success 3542 * error code if failure 3543 * 3544 * Timestamps: 3545 * vp - atime updated 3546 */ 3547/* ARGSUSED */ 3548static int 3549zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 3550{ 3551 znode_t *zp = VTOZ(vp); 3552 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3553 size_t bufsz; 3554 int error; 3555 3556 ZFS_ENTER(zfsvfs); 3557 ZFS_VERIFY_ZP(zp); 3558 3559 bufsz = (size_t)zp->z_phys->zp_size; 3560 if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { 3561 error = uiomove(zp->z_phys + 1, 3562 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 3563 } else { 3564 dmu_buf_t *dbp; 3565 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); 3566 if (error) { 3567 ZFS_EXIT(zfsvfs); 3568 return (error); 3569 } 3570 error = uiomove(dbp->db_data, 3571 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 3572 dmu_buf_rele(dbp, FTAG); 3573 } 3574 3575 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 3576 ZFS_EXIT(zfsvfs); 3577 return (error); 3578} 3579 3580/* 3581 * Insert a new entry into directory tdvp referencing svp. 3582 * 3583 * IN: tdvp - Directory to contain new entry. 3584 * svp - vnode of new entry. 3585 * name - name of new entry. 3586 * cr - credentials of caller. 
3587 * ct - caller context 3588 * 3589 * RETURN: 0 if success 3590 * error code if failure 3591 * 3592 * Timestamps: 3593 * tdvp - ctime|mtime updated 3594 * svp - ctime updated 3595 */ 3596/* ARGSUSED */ 3597static int 3598zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 3599 caller_context_t *ct, int flags) 3600{ 3601 znode_t *dzp = VTOZ(tdvp); 3602 znode_t *tzp, *szp; 3603 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3604 zilog_t *zilog; 3605 zfs_dirlock_t *dl; 3606 dmu_tx_t *tx; 3607 vnode_t *realvp; 3608 int error; 3609 int zf = ZNEW; 3610 uid_t owner; 3611 3612 ASSERT(tdvp->v_type == VDIR); 3613 3614 ZFS_ENTER(zfsvfs); 3615 ZFS_VERIFY_ZP(dzp); 3616 zilog = zfsvfs->z_log; 3617 3618 if (VOP_REALVP(svp, &realvp, ct) == 0) 3619 svp = realvp; 3620 3621 if (svp->v_vfsp != tdvp->v_vfsp) { 3622 ZFS_EXIT(zfsvfs); 3623 return (EXDEV); 3624 } 3625 szp = VTOZ(svp); 3626 ZFS_VERIFY_ZP(szp); 3627 3628 if (zfsvfs->z_utf8 && u8_validate(name, 3629 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3630 ZFS_EXIT(zfsvfs); 3631 return (EILSEQ); 3632 } 3633 if (flags & FIGNORECASE) 3634 zf |= ZCILOOK; 3635 3636top: 3637 /* 3638 * We do not support links between attributes and non-attributes 3639 * because of the potential security risk of creating links 3640 * into "normal" file space in order to circumvent restrictions 3641 * imposed in attribute space. 3642 */ 3643 if ((szp->z_phys->zp_flags & ZFS_XATTR) != 3644 (dzp->z_phys->zp_flags & ZFS_XATTR)) { 3645 ZFS_EXIT(zfsvfs); 3646 return (EINVAL); 3647 } 3648 3649 /* 3650 * POSIX dictates that we return EPERM here. 3651 * Better choices include ENOTSUP or EISDIR. 
3652 */ 3653 if (svp->v_type == VDIR) { 3654 ZFS_EXIT(zfsvfs); 3655 return (EPERM); 3656 } 3657 3658 owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); 3659 if (owner != crgetuid(cr) && 3660 secpolicy_basic_link(svp, cr) != 0) { 3661 ZFS_EXIT(zfsvfs); 3662 return (EPERM); 3663 } 3664 3665 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 3666 ZFS_EXIT(zfsvfs); 3667 return (error); 3668 } 3669 3670 /* 3671 * Attempt to lock directory; fail if entry already exists. 3672 */ 3673 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); 3674 if (error) { 3675 ZFS_EXIT(zfsvfs); 3676 return (error); 3677 } 3678 3679 tx = dmu_tx_create(zfsvfs->z_os); 3680 dmu_tx_hold_bonus(tx, szp->z_id); 3681 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3682 error = dmu_tx_assign(tx, zfsvfs->z_assign); 3683 if (error) { 3684 zfs_dirent_unlock(dl); 3685 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 3686 dmu_tx_wait(tx); 3687 dmu_tx_abort(tx); 3688 goto top; 3689 } 3690 dmu_tx_abort(tx); 3691 ZFS_EXIT(zfsvfs); 3692 return (error); 3693 } 3694 3695 error = zfs_link_create(dl, szp, tx, 0); 3696 3697 if (error == 0) { 3698 uint64_t txtype = TX_LINK; 3699 if (flags & FIGNORECASE) 3700 txtype |= TX_CI; 3701 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 3702 } 3703 3704 dmu_tx_commit(tx); 3705 3706 zfs_dirent_unlock(dl); 3707 3708 if (error == 0) { 3709 vnevent_link(svp, ct); 3710 } 3711 3712 ZFS_EXIT(zfsvfs); 3713 return (error); 3714} 3715 3716/*ARGSUSED*/ 3717void 3718zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 3719{ 3720 znode_t *zp = VTOZ(vp); 3721 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3722 int error; 3723 3724 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 3725 if (zp->z_dbuf == NULL) { 3726 /* 3727 * The fs has been unmounted, or we did a 3728 * suspend/resume and this file no longer exists. 
3729 */ 3730 VI_LOCK(vp); 3731 vp->v_count = 0; /* count arrives as 1 */ 3732 VI_UNLOCK(vp); 3733 vrecycle(vp, curthread); 3734 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3735 return; 3736 } 3737 3738 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 3739 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 3740 3741 dmu_tx_hold_bonus(tx, zp->z_id); 3742 error = dmu_tx_assign(tx, TXG_WAIT); 3743 if (error) { 3744 dmu_tx_abort(tx); 3745 } else { 3746 dmu_buf_will_dirty(zp->z_dbuf, tx); 3747 mutex_enter(&zp->z_lock); 3748 zp->z_atime_dirty = 0; 3749 mutex_exit(&zp->z_lock); 3750 dmu_tx_commit(tx); 3751 } 3752 } 3753 3754 zfs_zinactive(zp); 3755 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3756} 3757 3758CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); 3759CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); 3760 3761/*ARGSUSED*/ 3762static int 3763zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 3764{ 3765 znode_t *zp = VTOZ(vp); 3766 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3767 uint32_t gen; 3768 uint64_t object = zp->z_id; 3769 zfid_short_t *zfid; 3770 int size, i; 3771 3772 ZFS_ENTER(zfsvfs); 3773 ZFS_VERIFY_ZP(zp); 3774 gen = (uint32_t)zp->z_gen; 3775 3776 size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; 3777 fidp->fid_len = size; 3778 3779 zfid = (zfid_short_t *)fidp; 3780 3781 zfid->zf_len = size; 3782 3783 for (i = 0; i < sizeof (zfid->zf_object); i++) 3784 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 3785 3786 /* Must have a non-zero generation number to distinguish from .zfs */ 3787 if (gen == 0) 3788 gen = 1; 3789 for (i = 0; i < sizeof (zfid->zf_gen); i++) 3790 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 3791 3792 if (size == LONG_FID_LEN) { 3793 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 3794 zfid_long_t *zlfid; 3795 3796 zlfid = (zfid_long_t *)fidp; 3797 3798 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 3799 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 3800 3801 /* XXX - this should be the generation number for the objset */ 3802 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 3803 zlfid->zf_setgen[i] = 0; 3804 } 3805 3806 ZFS_EXIT(zfsvfs); 3807 return (0); 3808} 3809 3810static int 3811zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 3812 caller_context_t *ct) 3813{ 3814 znode_t *zp, *xzp; 3815 zfsvfs_t *zfsvfs; 3816 zfs_dirlock_t *dl; 3817 int error; 3818 3819 switch (cmd) { 3820 case _PC_LINK_MAX: 3821 *valp = INT_MAX; 3822 return (0); 3823 3824 case _PC_FILESIZEBITS: 3825 *valp = 64; 3826 return (0); 3827 3828#if 0 3829 case _PC_XATTR_EXISTS: 3830 zp = VTOZ(vp); 3831 zfsvfs = zp->z_zfsvfs; 3832 ZFS_ENTER(zfsvfs); 3833 ZFS_VERIFY_ZP(zp); 3834 *valp = 0; 3835 error = zfs_dirent_lock(&dl, zp, "", &xzp, 3836 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); 3837 if (error == 0) { 3838 zfs_dirent_unlock(dl); 3839 if (!zfs_dirempty(xzp)) 3840 *valp = 1; 3841 VN_RELE(ZTOV(xzp)); 3842 } else if (error == ENOENT) { 3843 /* 3844 * If there aren't extended attributes, it's the 3845 * same as having zero of them. 
3846 */ 3847 error = 0; 3848 } 3849 ZFS_EXIT(zfsvfs); 3850 return (error); 3851#endif 3852 3853 case _PC_ACL_EXTENDED: 3854 *valp = 0; 3855 return (0); 3856 3857 case _PC_ACL_NFS4: 3858 *valp = 1; 3859 return (0); 3860 3861 case _PC_ACL_PATH_MAX: 3862 *valp = ACL_MAX_ENTRIES; 3863 return (0); 3864 3865 case _PC_MIN_HOLE_SIZE: 3866 *valp = (int)SPA_MINBLOCKSIZE; 3867 return (0); 3868 3869 default: 3870 return (EOPNOTSUPP); 3871 } 3872} 3873 3874/*ARGSUSED*/ 3875static int 3876zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 3877 caller_context_t *ct) 3878{ 3879 znode_t *zp = VTOZ(vp); 3880 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3881 int error; 3882 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 3883 3884 ZFS_ENTER(zfsvfs); 3885 ZFS_VERIFY_ZP(zp); 3886 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 3887 ZFS_EXIT(zfsvfs); 3888 3889 return (error); 3890} 3891 3892/*ARGSUSED*/ 3893static int 3894zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 3895 caller_context_t *ct) 3896{ 3897 znode_t *zp = VTOZ(vp); 3898 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3899 int error; 3900 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; 3901 3902 ZFS_ENTER(zfsvfs); 3903 ZFS_VERIFY_ZP(zp); 3904 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 3905 ZFS_EXIT(zfsvfs); 3906 return (error); 3907} 3908 3909static int 3910zfs_freebsd_open(ap) 3911 struct vop_open_args /* { 3912 struct vnode *a_vp; 3913 int a_mode; 3914 struct ucred *a_cred; 3915 struct thread *a_td; 3916 } */ *ap; 3917{ 3918 vnode_t *vp = ap->a_vp; 3919 znode_t *zp = VTOZ(vp); 3920 int error; 3921 3922 error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); 3923 if (error == 0) 3924 vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td); 3925 return (error); 3926} 3927 3928static int 3929zfs_freebsd_close(ap) 3930 struct vop_close_args /* { 3931 struct vnode *a_vp; 3932 int a_fflag; 3933 struct ucred *a_cred; 3934 struct thread *a_td; 3935 } */ *ap; 3936{ 3937 3938 return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL)); 3939} 3940 3941static int 3942zfs_freebsd_ioctl(ap) 3943 struct vop_ioctl_args /* { 3944 struct vnode *a_vp; 3945 u_long a_command; 3946 caddr_t a_data; 3947 int a_fflag; 3948 struct ucred *cred; 3949 struct thread *td; 3950 } */ *ap; 3951{ 3952 3953 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, 3954 ap->a_fflag, ap->a_cred, NULL, NULL)); 3955} 3956 3957static int 3958zfs_freebsd_read(ap) 3959 struct vop_read_args /* { 3960 struct vnode *a_vp; 3961 struct uio *a_uio; 3962 int a_ioflag; 3963 struct ucred *a_cred; 3964 } */ *ap; 3965{ 3966 3967 return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); 3968} 3969 3970static int 3971zfs_freebsd_write(ap) 3972 struct vop_write_args /* { 3973 struct vnode *a_vp; 3974 struct uio *a_uio; 3975 int a_ioflag; 3976 struct ucred *a_cred; 3977 } */ *ap; 3978{ 3979 3980 if (vn_rlimit_fsize(ap->a_vp, ap->a_uio, ap->a_uio->uio_td)) 3981 return (EFBIG); 3982 3983 return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); 3984} 3985 3986static int 3987zfs_freebsd_access(ap) 3988 struct vop_access_args /* { 3989 struct 
vnode *a_vp; 3990 accmode_t a_accmode; 3991 struct ucred *a_cred; 3992 struct thread *a_td; 3993 } */ *ap; 3994{ 3995 accmode_t accmode; 3996 int error = 0; 3997 3998 /* 3999 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, 4000 */ 4001 accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); 4002 if (accmode != 0) 4003 error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); 4004 4005 /* 4006 * VADMIN has to be handled by vaccess(). 4007 */ 4008 if (error == 0) { 4009 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); 4010 if (accmode != 0) { 4011 vnode_t *vp = ap->a_vp; 4012 znode_t *zp = VTOZ(vp); 4013 znode_phys_t *zphys = zp->z_phys; 4014 4015 error = vaccess(vp->v_type, zphys->zp_mode, 4016 zphys->zp_uid, zphys->zp_gid, accmode, ap->a_cred, 4017 NULL); 4018 } 4019 } 4020 4021 return (error); 4022} 4023 4024static int 4025zfs_freebsd_lookup(ap) 4026 struct vop_lookup_args /* { 4027 struct vnode *a_dvp; 4028 struct vnode **a_vpp; 4029 struct componentname *a_cnp; 4030 } */ *ap; 4031{ 4032 struct componentname *cnp = ap->a_cnp; 4033 char nm[NAME_MAX + 1]; 4034 4035 ASSERT(cnp->cn_namelen < sizeof(nm)); 4036 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); 4037 4038 return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, 4039 cnp->cn_cred, cnp->cn_thread, 0)); 4040} 4041 4042static int 4043zfs_freebsd_create(ap) 4044 struct vop_create_args /* { 4045 struct vnode *a_dvp; 4046 struct vnode **a_vpp; 4047 struct componentname *a_cnp; 4048 struct vattr *a_vap; 4049 } */ *ap; 4050{ 4051 struct componentname *cnp = ap->a_cnp; 4052 vattr_t *vap = ap->a_vap; 4053 int mode; 4054 4055 ASSERT(cnp->cn_flags & SAVENAME); 4056 4057 vattr_init_mask(vap); 4058 mode = vap->va_mode & ALLPERMS; 4059 4060 return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, 4061 ap->a_vpp, cnp->cn_cred, cnp->cn_thread)); 4062} 4063 4064static int 4065zfs_freebsd_remove(ap) 4066 struct vop_remove_args /* { 4067 struct vnode *a_dvp; 4068 
struct vnode *a_vp; 4069 struct componentname *a_cnp; 4070 } */ *ap; 4071{ 4072 4073 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 4074 4075 return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr, 4076 ap->a_cnp->cn_cred, NULL, 0)); 4077} 4078 4079static int 4080zfs_freebsd_mkdir(ap) 4081 struct vop_mkdir_args /* { 4082 struct vnode *a_dvp; 4083 struct vnode **a_vpp; 4084 struct componentname *a_cnp; 4085 struct vattr *a_vap; 4086 } */ *ap; 4087{ 4088 vattr_t *vap = ap->a_vap; 4089 4090 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 4091 4092 vattr_init_mask(vap); 4093 4094 return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, 4095 ap->a_cnp->cn_cred, NULL, 0, NULL)); 4096} 4097 4098static int 4099zfs_freebsd_rmdir(ap) 4100 struct vop_rmdir_args /* { 4101 struct vnode *a_dvp; 4102 struct vnode *a_vp; 4103 struct componentname *a_cnp; 4104 } */ *ap; 4105{ 4106 struct componentname *cnp = ap->a_cnp; 4107 4108 ASSERT(cnp->cn_flags & SAVENAME); 4109 4110 return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0)); 4111} 4112 4113static int 4114zfs_freebsd_readdir(ap) 4115 struct vop_readdir_args /* { 4116 struct vnode *a_vp; 4117 struct uio *a_uio; 4118 struct ucred *a_cred; 4119 int *a_eofflag; 4120 int *a_ncookies; 4121 u_long **a_cookies; 4122 } */ *ap; 4123{ 4124 4125 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, 4126 ap->a_ncookies, ap->a_cookies)); 4127} 4128 4129static int 4130zfs_freebsd_fsync(ap) 4131 struct vop_fsync_args /* { 4132 struct vnode *a_vp; 4133 int a_waitfor; 4134 struct thread *a_td; 4135 } */ *ap; 4136{ 4137 4138 vop_stdfsync(ap); 4139 return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); 4140} 4141 4142static int 4143zfs_freebsd_getattr(ap) 4144 struct vop_getattr_args /* { 4145 struct vnode *a_vp; 4146 struct vattr *a_vap; 4147 struct ucred *a_cred; 4148 struct thread *a_td; 4149 } */ *ap; 4150{ 4151 vattr_t *vap = ap->a_vap; 4152 xvattr_t xvap; 4153 u_long fflags = 0; 4154 int error; 4155 4156 
xva_init(&xvap); 4157 xvap.xva_vattr = *vap; 4158 xvap.xva_vattr.va_mask |= AT_XVATTR; 4159 4160 /* Convert chflags into ZFS-type flags. */ 4161 /* XXX: what about SF_SETTABLE?. */ 4162 XVA_SET_REQ(&xvap, XAT_IMMUTABLE); 4163 XVA_SET_REQ(&xvap, XAT_APPENDONLY); 4164 XVA_SET_REQ(&xvap, XAT_NOUNLINK); 4165 XVA_SET_REQ(&xvap, XAT_NODUMP); 4166 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); 4167 if (error != 0) 4168 return (error); 4169 4170 /* Convert ZFS xattr into chflags. */ 4171#define FLAG_CHECK(fflag, xflag, xfield) do { \ 4172 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ 4173 fflags |= (fflag); \ 4174} while (0) 4175 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, 4176 xvap.xva_xoptattrs.xoa_immutable); 4177 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, 4178 xvap.xva_xoptattrs.xoa_appendonly); 4179 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, 4180 xvap.xva_xoptattrs.xoa_nounlink); 4181 FLAG_CHECK(UF_NODUMP, XAT_NODUMP, 4182 xvap.xva_xoptattrs.xoa_nodump); 4183#undef FLAG_CHECK 4184 *vap = xvap.xva_vattr; 4185 vap->va_flags = fflags; 4186 return (0); 4187} 4188 4189static int 4190zfs_freebsd_setattr(ap) 4191 struct vop_setattr_args /* { 4192 struct vnode *a_vp; 4193 struct vattr *a_vap; 4194 struct ucred *a_cred; 4195 struct thread *a_td; 4196 } */ *ap; 4197{ 4198 vnode_t *vp = ap->a_vp; 4199 vattr_t *vap = ap->a_vap; 4200 cred_t *cred = ap->a_cred; 4201 xvattr_t xvap; 4202 u_long fflags; 4203 uint64_t zflags; 4204 4205 vattr_init_mask(vap); 4206 vap->va_mask &= ~AT_NOSET; 4207 4208 xva_init(&xvap); 4209 xvap.xva_vattr = *vap; 4210 4211 zflags = VTOZ(vp)->z_phys->zp_flags; 4212 4213 if (vap->va_flags != VNOVAL) { 4214 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; 4215 int error; 4216 4217 if (zfsvfs->z_use_fuids == B_FALSE) 4218 return (EOPNOTSUPP); 4219 4220 fflags = vap->va_flags; 4221 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0) 4222 return (EOPNOTSUPP); 4223 /* 4224 * Unprivileged processes are not permitted to unset system 4225 * 
flags, or modify flags if any system flags are set. 4226 * Privileged non-jail processes may not modify system flags 4227 * if securelevel > 0 and any existing system flags are set. 4228 * Privileged jail processes behave like privileged non-jail 4229 * processes if the security.jail.chflags_allowed sysctl is 4230 * is non-zero; otherwise, they behave like unprivileged 4231 * processes. 4232 */ 4233 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || 4234 priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { 4235 if (zflags & 4236 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 4237 error = securelevel_gt(cred, 0); 4238 if (error != 0) 4239 return (error); 4240 } 4241 } else { 4242 /* 4243 * Callers may only modify the file flags on objects they 4244 * have VADMIN rights for. 4245 */ 4246 if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) 4247 return (error); 4248 if (zflags & 4249 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 4250 return (EPERM); 4251 } 4252 if (fflags & 4253 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { 4254 return (EPERM); 4255 } 4256 } 4257 4258#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ 4259 if (((fflags & (fflag)) && !(zflags & (zflag))) || \ 4260 ((zflags & (zflag)) && !(fflags & (fflag)))) { \ 4261 XVA_SET_REQ(&xvap, (xflag)); \ 4262 (xfield) = ((fflags & (fflag)) != 0); \ 4263 } \ 4264} while (0) 4265 /* Convert chflags into ZFS-type flags. */ 4266 /* XXX: what about SF_SETTABLE?. 
*/ 4267 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, 4268 xvap.xva_xoptattrs.xoa_immutable); 4269 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, 4270 xvap.xva_xoptattrs.xoa_appendonly); 4271 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, 4272 xvap.xva_xoptattrs.xoa_nounlink); 4273 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, 4274 xvap.xva_xoptattrs.xoa_nodump); 4275#undef FLAG_CHANGE 4276 } 4277 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); 4278} 4279 4280static int 4281zfs_freebsd_rename(ap) 4282 struct vop_rename_args /* { 4283 struct vnode *a_fdvp; 4284 struct vnode *a_fvp; 4285 struct componentname *a_fcnp; 4286 struct vnode *a_tdvp; 4287 struct vnode *a_tvp; 4288 struct componentname *a_tcnp; 4289 } */ *ap; 4290{ 4291 vnode_t *fdvp = ap->a_fdvp; 4292 vnode_t *fvp = ap->a_fvp; 4293 vnode_t *tdvp = ap->a_tdvp; 4294 vnode_t *tvp = ap->a_tvp; 4295 int error; 4296 4297 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); 4298 ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); 4299 4300 error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp, 4301 ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0); 4302 4303 if (tdvp == tvp) 4304 VN_RELE(tdvp); 4305 else 4306 VN_URELE(tdvp); 4307 if (tvp) 4308 VN_URELE(tvp); 4309 VN_RELE(fdvp); 4310 VN_RELE(fvp); 4311 4312 return (error); 4313} 4314 4315static int 4316zfs_freebsd_symlink(ap) 4317 struct vop_symlink_args /* { 4318 struct vnode *a_dvp; 4319 struct vnode **a_vpp; 4320 struct componentname *a_cnp; 4321 struct vattr *a_vap; 4322 char *a_target; 4323 } */ *ap; 4324{ 4325 struct componentname *cnp = ap->a_cnp; 4326 vattr_t *vap = ap->a_vap; 4327 4328 ASSERT(cnp->cn_flags & SAVENAME); 4329 4330 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. 
*/ 4331 vattr_init_mask(vap); 4332 4333 return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, 4334 ap->a_target, cnp->cn_cred, cnp->cn_thread)); 4335} 4336 4337static int 4338zfs_freebsd_readlink(ap) 4339 struct vop_readlink_args /* { 4340 struct vnode *a_vp; 4341 struct uio *a_uio; 4342 struct ucred *a_cred; 4343 } */ *ap; 4344{ 4345 4346 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); 4347} 4348 4349static int 4350zfs_freebsd_link(ap) 4351 struct vop_link_args /* { 4352 struct vnode *a_tdvp; 4353 struct vnode *a_vp; 4354 struct componentname *a_cnp; 4355 } */ *ap; 4356{ 4357 struct componentname *cnp = ap->a_cnp; 4358 4359 ASSERT(cnp->cn_flags & SAVENAME); 4360 4361 return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); 4362} 4363 4364static int 4365zfs_freebsd_inactive(ap) 4366 struct vop_inactive_args /* { 4367 struct vnode *a_vp; 4368 struct thread *a_td; 4369 } */ *ap; 4370{ 4371 vnode_t *vp = ap->a_vp; 4372 4373 zfs_inactive(vp, ap->a_td->td_ucred, NULL); 4374 return (0); 4375} 4376 4377static void 4378zfs_reclaim_complete(void *arg, int pending) 4379{ 4380 znode_t *zp = arg; 4381 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4382 4383 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4384 if (zp->z_dbuf != NULL) { 4385 ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); 4386 zfs_znode_dmu_fini(zp); 4387 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); 4388 } 4389 zfs_znode_free(zp); 4390 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4391 /* 4392 * If the file system is being unmounted, there is a process waiting 4393 * for us, wake it up. 
4394 */ 4395 if (zfsvfs->z_unmounted) 4396 wakeup_one(zfsvfs); 4397} 4398 4399static int 4400zfs_freebsd_reclaim(ap) 4401 struct vop_reclaim_args /* { 4402 struct vnode *a_vp; 4403 struct thread *a_td; 4404 } */ *ap; 4405{ 4406 vnode_t *vp = ap->a_vp; 4407 znode_t *zp = VTOZ(vp); 4408 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4409 4410 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4411 4412 ASSERT(zp != NULL); 4413 4414 /* 4415 * Destroy the vm object and flush associated pages. 4416 */ 4417 vnode_destroy_vobject(vp); 4418 4419 mutex_enter(&zp->z_lock); 4420 ASSERT(zp->z_phys != NULL); 4421 zp->z_vnode = NULL; 4422 mutex_exit(&zp->z_lock); 4423 4424 if (zp->z_unlinked) 4425 ; /* Do nothing. */ 4426 else if (zp->z_dbuf == NULL) 4427 zfs_znode_free(zp); 4428 else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ { 4429 int locked; 4430 4431 locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 : 4432 ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id); 4433 if (locked == 0) { 4434 /* 4435 * Lock can't be obtained due to deadlock possibility, 4436 * so defer znode destruction. 
4437 */ 4438 TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp); 4439 taskqueue_enqueue(taskqueue_thread, &zp->z_task); 4440 } else { 4441 zfs_znode_dmu_fini(zp); 4442 if (locked == 1) 4443 ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); 4444 zfs_znode_free(zp); 4445 } 4446 } 4447 VI_LOCK(vp); 4448 vp->v_data = NULL; 4449 ASSERT(vp->v_holdcnt >= 1); 4450 VI_UNLOCK(vp); 4451 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4452 return (0); 4453} 4454 4455static int 4456zfs_freebsd_fid(ap) 4457 struct vop_fid_args /* { 4458 struct vnode *a_vp; 4459 struct fid *a_fid; 4460 } */ *ap; 4461{ 4462 4463 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); 4464} 4465 4466static int 4467zfs_freebsd_pathconf(ap) 4468 struct vop_pathconf_args /* { 4469 struct vnode *a_vp; 4470 int a_name; 4471 register_t *a_retval; 4472 } */ *ap; 4473{ 4474 ulong_t val; 4475 int error; 4476 4477 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); 4478 if (error == 0) 4479 *ap->a_retval = val; 4480 else if (error == EOPNOTSUPP) 4481 error = vop_stdpathconf(ap); 4482 return (error); 4483} 4484 4485static int 4486zfs_freebsd_fifo_pathconf(ap) 4487 struct vop_pathconf_args /* { 4488 struct vnode *a_vp; 4489 int a_name; 4490 register_t *a_retval; 4491 } */ *ap; 4492{ 4493 4494 switch (ap->a_name) { 4495 case _PC_ACL_EXTENDED: 4496 case _PC_ACL_NFS4: 4497 case _PC_ACL_PATH_MAX: 4498 case _PC_MAC_PRESENT: 4499 return (zfs_freebsd_pathconf(ap)); 4500 default: 4501 return (fifo_specops.vop_pathconf(ap)); 4502 } 4503} 4504 4505/* 4506 * FreeBSD's extended attributes namespace defines file name prefix for ZFS' 4507 * extended attribute name: 4508 * 4509 * NAMESPACE PREFIX 4510 * system freebsd:system: 4511 * user (none, can be used to access ZFS fsattr(5) attributes 4512 * created on Solaris) 4513 */ 4514static int 4515zfs_create_attrname(int attrnamespace, const char *name, char *attrname, 4516 size_t size) 4517{ 4518 const char *namespace, *prefix, *suffix; 4519 4520 /* We don't allow '/' 
character in attribute name. */ 4521 if (strchr(name, '/') != NULL) 4522 return (EINVAL); 4523 /* We don't allow attribute names that start with "freebsd:" string. */ 4524 if (strncmp(name, "freebsd:", 8) == 0) 4525 return (EINVAL); 4526 4527 bzero(attrname, size); 4528 4529 switch (attrnamespace) { 4530 case EXTATTR_NAMESPACE_USER: 4531#if 0 4532 prefix = "freebsd:"; 4533 namespace = EXTATTR_NAMESPACE_USER_STRING; 4534 suffix = ":"; 4535#else 4536 /* 4537 * This is the default namespace by which we can access all 4538 * attributes created on Solaris. 4539 */ 4540 prefix = namespace = suffix = ""; 4541#endif 4542 break; 4543 case EXTATTR_NAMESPACE_SYSTEM: 4544 prefix = "freebsd:"; 4545 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; 4546 suffix = ":"; 4547 break; 4548 case EXTATTR_NAMESPACE_EMPTY: 4549 default: 4550 return (EINVAL); 4551 } 4552 if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, 4553 name) >= size) { 4554 return (ENAMETOOLONG); 4555 } 4556 return (0); 4557} 4558 4559/* 4560 * Vnode operating to retrieve a named extended attribute. 
4561 */ 4562static int 4563zfs_getextattr(struct vop_getextattr_args *ap) 4564/* 4565vop_getextattr { 4566 IN struct vnode *a_vp; 4567 IN int a_attrnamespace; 4568 IN const char *a_name; 4569 INOUT struct uio *a_uio; 4570 OUT size_t *a_size; 4571 IN struct ucred *a_cred; 4572 IN struct thread *a_td; 4573}; 4574*/ 4575{ 4576 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 4577 struct thread *td = ap->a_td; 4578 struct nameidata nd; 4579 char attrname[255]; 4580 struct vattr va; 4581 vnode_t *xvp = NULL, *vp; 4582 int error, flags; 4583 4584 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 4585 ap->a_cred, ap->a_td, VREAD); 4586 if (error != 0) 4587 return (error); 4588 4589 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 4590 sizeof(attrname)); 4591 if (error != 0) 4592 return (error); 4593 4594 ZFS_ENTER(zfsvfs); 4595 4596 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 4597 LOOKUP_XATTR); 4598 if (error != 0) { 4599 ZFS_EXIT(zfsvfs); 4600 return (error); 4601 } 4602 4603 flags = FREAD; 4604 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname, 4605 xvp, td); 4606 error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); 4607 vp = nd.ni_vp; 4608 NDFREE(&nd, NDF_ONLY_PNBUF); 4609 if (error != 0) { 4610 ZFS_EXIT(zfsvfs); 4611 if (error == ENOENT) 4612 error = ENOATTR; 4613 return (error); 4614 } 4615 4616 if (ap->a_size != NULL) { 4617 error = VOP_GETATTR(vp, &va, ap->a_cred); 4618 if (error == 0) 4619 *ap->a_size = (size_t)va.va_size; 4620 } else if (ap->a_uio != NULL) 4621 error = VOP_READ(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred); 4622 4623 VOP_UNLOCK(vp, 0); 4624 vn_close(vp, flags, ap->a_cred, td); 4625 ZFS_EXIT(zfsvfs); 4626 4627 return (error); 4628} 4629 4630/* 4631 * Vnode operation to remove a named attribute. 
4632 */ 4633int 4634zfs_deleteextattr(struct vop_deleteextattr_args *ap) 4635/* 4636vop_deleteextattr { 4637 IN struct vnode *a_vp; 4638 IN int a_attrnamespace; 4639 IN const char *a_name; 4640 IN struct ucred *a_cred; 4641 IN struct thread *a_td; 4642}; 4643*/ 4644{ 4645 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 4646 struct thread *td = ap->a_td; 4647 struct nameidata nd; 4648 char attrname[255]; 4649 struct vattr va; 4650 vnode_t *xvp = NULL, *vp; 4651 int error, flags; 4652 4653 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 4654 ap->a_cred, ap->a_td, VWRITE); 4655 if (error != 0) 4656 return (error); 4657 4658 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 4659 sizeof(attrname)); 4660 if (error != 0) 4661 return (error); 4662 4663 ZFS_ENTER(zfsvfs); 4664 4665 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 4666 LOOKUP_XATTR); 4667 if (error != 0) { 4668 ZFS_EXIT(zfsvfs); 4669 return (error); 4670 } 4671 4672 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE, 4673 UIO_SYSSPACE, attrname, xvp, td); 4674 error = namei(&nd); 4675 vp = nd.ni_vp; 4676 NDFREE(&nd, NDF_ONLY_PNBUF); 4677 if (error != 0) { 4678 ZFS_EXIT(zfsvfs); 4679 if (error == ENOENT) 4680 error = ENOATTR; 4681 return (error); 4682 } 4683 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); 4684 4685 vput(nd.ni_dvp); 4686 if (vp == nd.ni_dvp) 4687 vrele(vp); 4688 else 4689 vput(vp); 4690 ZFS_EXIT(zfsvfs); 4691 4692 return (error); 4693} 4694 4695/* 4696 * Vnode operation to set a named attribute. 
4697 */ 4698static int 4699zfs_setextattr(struct vop_setextattr_args *ap) 4700/* 4701vop_setextattr { 4702 IN struct vnode *a_vp; 4703 IN int a_attrnamespace; 4704 IN const char *a_name; 4705 INOUT struct uio *a_uio; 4706 IN struct ucred *a_cred; 4707 IN struct thread *a_td; 4708}; 4709*/ 4710{ 4711 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 4712 struct thread *td = ap->a_td; 4713 struct nameidata nd; 4714 char attrname[255]; 4715 struct vattr va; 4716 vnode_t *xvp = NULL, *vp; 4717 int error, flags; 4718 4719 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 4720 ap->a_cred, ap->a_td, VWRITE); 4721 if (error != 0) 4722 return (error); 4723 4724 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 4725 sizeof(attrname)); 4726 if (error != 0) 4727 return (error); 4728 4729 ZFS_ENTER(zfsvfs); 4730 4731 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 4732 LOOKUP_XATTR | CREATE_XATTR_DIR); 4733 if (error != 0) { 4734 ZFS_EXIT(zfsvfs); 4735 return (error); 4736 } 4737 4738 flags = FFLAGS(O_WRONLY | O_CREAT); 4739 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname, 4740 xvp, td); 4741 error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); 4742 vp = nd.ni_vp; 4743 NDFREE(&nd, NDF_ONLY_PNBUF); 4744 if (error != 0) { 4745 ZFS_EXIT(zfsvfs); 4746 return (error); 4747 } 4748 4749 VATTR_NULL(&va); 4750 va.va_size = 0; 4751 error = VOP_SETATTR(vp, &va, ap->a_cred); 4752 if (error == 0) 4753 VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred); 4754 4755 VOP_UNLOCK(vp, 0); 4756 vn_close(vp, flags, ap->a_cred, td); 4757 ZFS_EXIT(zfsvfs); 4758 4759 return (error); 4760} 4761 4762/* 4763 * Vnode operation to retrieve extended attributes on a vnode. 
4764 */ 4765static int 4766zfs_listextattr(struct vop_listextattr_args *ap) 4767/* 4768vop_listextattr { 4769 IN struct vnode *a_vp; 4770 IN int a_attrnamespace; 4771 INOUT struct uio *a_uio; 4772 OUT size_t *a_size; 4773 IN struct ucred *a_cred; 4774 IN struct thread *a_td; 4775}; 4776*/ 4777{ 4778 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 4779 struct thread *td = ap->a_td; 4780 struct nameidata nd; 4781 char attrprefix[16]; 4782 u_char dirbuf[sizeof(struct dirent)]; 4783 struct dirent *dp; 4784 struct iovec aiov; 4785 struct uio auio, *uio = ap->a_uio; 4786 size_t *sizep = ap->a_size; 4787 size_t plen; 4788 vnode_t *xvp = NULL, *vp; 4789 int done, error, eof, pos; 4790 4791 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 4792 ap->a_cred, ap->a_td, VREAD); 4793 if (error != 0) 4794 return (error); 4795 4796 error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, 4797 sizeof(attrprefix)); 4798 if (error != 0) 4799 return (error); 4800 plen = strlen(attrprefix); 4801 4802 ZFS_ENTER(zfsvfs); 4803 4804 if (sizep != NULL) 4805 *sizep = 0; 4806 4807 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 4808 LOOKUP_XATTR); 4809 if (error != 0) { 4810 ZFS_EXIT(zfsvfs); 4811 /* 4812 * ENOATTR means that the EA directory does not yet exist, 4813 * i.e. there are no extended attributes there. 
4814 */ 4815 if (error == ENOATTR) 4816 error = 0; 4817 return (error); 4818 } 4819 4820 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE, 4821 UIO_SYSSPACE, ".", xvp, td); 4822 error = namei(&nd); 4823 vp = nd.ni_vp; 4824 NDFREE(&nd, NDF_ONLY_PNBUF); 4825 if (error != 0) { 4826 ZFS_EXIT(zfsvfs); 4827 return (error); 4828 } 4829 4830 auio.uio_iov = &aiov; 4831 auio.uio_iovcnt = 1; 4832 auio.uio_segflg = UIO_SYSSPACE; 4833 auio.uio_td = td; 4834 auio.uio_rw = UIO_READ; 4835 auio.uio_offset = 0; 4836 4837 do { 4838 u_char nlen; 4839 4840 aiov.iov_base = (void *)dirbuf; 4841 aiov.iov_len = sizeof(dirbuf); 4842 auio.uio_resid = sizeof(dirbuf); 4843 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); 4844 done = sizeof(dirbuf) - auio.uio_resid; 4845 if (error != 0) 4846 break; 4847 for (pos = 0; pos < done;) { 4848 dp = (struct dirent *)(dirbuf + pos); 4849 pos += dp->d_reclen; 4850 /* 4851 * XXX: Temporarily we also accept DT_UNKNOWN, as this 4852 * is what we get when attribute was created on Solaris. 4853 */ 4854 if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) 4855 continue; 4856 if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) 4857 continue; 4858 else if (strncmp(dp->d_name, attrprefix, plen) != 0) 4859 continue; 4860 nlen = dp->d_namlen - plen; 4861 if (sizep != NULL) 4862 *sizep += 1 + nlen; 4863 else if (uio != NULL) { 4864 /* 4865 * Format of extattr name entry is one byte for 4866 * length and the rest for name. 
4867 */ 4868 error = uiomove(&nlen, 1, uio->uio_rw, uio); 4869 if (error == 0) { 4870 error = uiomove(dp->d_name + plen, nlen, 4871 uio->uio_rw, uio); 4872 } 4873 if (error != 0) 4874 break; 4875 } 4876 } 4877 } while (!eof && error == 0); 4878 4879 vput(vp); 4880 ZFS_EXIT(zfsvfs); 4881 4882 return (error); 4883} 4884 4885int 4886zfs_freebsd_getacl(ap) 4887 struct vop_getacl_args /* { 4888 struct vnode *vp; 4889 acl_type_t type; 4890 struct acl *aclp; 4891 struct ucred *cred; 4892 struct thread *td; 4893 } */ *ap; 4894{ 4895 int error; 4896 vsecattr_t vsecattr; 4897 4898 if (ap->a_type != ACL_TYPE_NFS4) 4899 return (EINVAL); 4900 4901 vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; 4902 if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) 4903 return (error); 4904 4905 error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); 4906 if (vsecattr.vsa_aclentp != NULL) 4907 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); 4908 4909 return (error); 4910} 4911 4912int 4913zfs_freebsd_setacl(ap) 4914 struct vop_setacl_args /* { 4915 struct vnode *vp; 4916 acl_type_t type; 4917 struct acl *aclp; 4918 struct ucred *cred; 4919 struct thread *td; 4920 } */ *ap; 4921{ 4922 int error; 4923 vsecattr_t vsecattr; 4924 int aclbsize; /* size of acl list in bytes */ 4925 aclent_t *aaclp; 4926 4927 if (ap->a_type != ACL_TYPE_NFS4) 4928 return (EINVAL); 4929 4930 if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) 4931 return (EINVAL); 4932 4933 /* 4934 * With NFSv4 ACLs, chmod(2) may need to add additional entries, 4935 * splitting every entry into two and appending "canonical six" 4936 * entries at the end. Don't allow for setting an ACL that would 4937 * cause chmod(2) to run out of ACL entries. 
4938 */ 4939 if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) 4940 return (ENOSPC); 4941 4942 vsecattr.vsa_mask = VSA_ACE; 4943 aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); 4944 vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); 4945 aaclp = vsecattr.vsa_aclentp; 4946 vsecattr.vsa_aclentsz = aclbsize; 4947 4948 aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); 4949 error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); 4950 kmem_free(aaclp, aclbsize); 4951 4952 return (error); 4953} 4954 4955int 4956zfs_freebsd_aclcheck(ap) 4957 struct vop_aclcheck_args /* { 4958 struct vnode *vp; 4959 acl_type_t type; 4960 struct acl *aclp; 4961 struct ucred *cred; 4962 struct thread *td; 4963 } */ *ap; 4964{ 4965 4966 return (EOPNOTSUPP); 4967} 4968 4969struct vop_vector zfs_vnodeops; 4970struct vop_vector zfs_fifoops; 4971 4972struct vop_vector zfs_vnodeops = { 4973 .vop_default = &default_vnodeops, 4974 .vop_inactive = zfs_freebsd_inactive, 4975 .vop_reclaim = zfs_freebsd_reclaim, 4976 .vop_access = zfs_freebsd_access, 4977#ifdef FREEBSD_NAMECACHE 4978 .vop_lookup = vfs_cache_lookup, 4979 .vop_cachedlookup = zfs_freebsd_lookup, 4980#else 4981 .vop_lookup = zfs_freebsd_lookup, 4982#endif 4983 .vop_getattr = zfs_freebsd_getattr, 4984 .vop_setattr = zfs_freebsd_setattr, 4985 .vop_create = zfs_freebsd_create, 4986 .vop_mknod = zfs_freebsd_create, 4987 .vop_mkdir = zfs_freebsd_mkdir, 4988 .vop_readdir = zfs_freebsd_readdir, 4989 .vop_fsync = zfs_freebsd_fsync, 4990 .vop_open = zfs_freebsd_open, 4991 .vop_close = zfs_freebsd_close, 4992 .vop_rmdir = zfs_freebsd_rmdir, 4993 .vop_ioctl = zfs_freebsd_ioctl, 4994 .vop_link = zfs_freebsd_link, 4995 .vop_symlink = zfs_freebsd_symlink, 4996 .vop_readlink = zfs_freebsd_readlink, 4997 .vop_read = zfs_freebsd_read, 4998 .vop_write = zfs_freebsd_write, 4999 .vop_remove = zfs_freebsd_remove, 5000 .vop_rename = zfs_freebsd_rename, 5001 .vop_pathconf = zfs_freebsd_pathconf, 5002 .vop_bmap = VOP_EOPNOTSUPP, 
5003 .vop_fid = zfs_freebsd_fid, 5004 .vop_getextattr = zfs_getextattr, 5005 .vop_deleteextattr = zfs_deleteextattr, 5006 .vop_setextattr = zfs_setextattr, 5007 .vop_listextattr = zfs_listextattr, 5008 .vop_getacl = zfs_freebsd_getacl, 5009 .vop_setacl = zfs_freebsd_setacl, 5010 .vop_aclcheck = zfs_freebsd_aclcheck, 5011}; 5012 5013struct vop_vector zfs_fifoops = { 5014 .vop_default = &fifo_specops, 5015 .vop_fsync = zfs_freebsd_fsync, 5016 .vop_access = zfs_freebsd_access, 5017 .vop_getattr = zfs_freebsd_getattr, 5018 .vop_inactive = zfs_freebsd_inactive, 5019 .vop_read = VOP_PANIC, 5020 .vop_reclaim = zfs_freebsd_reclaim, 5021 .vop_setattr = zfs_freebsd_setattr, 5022 .vop_write = VOP_PANIC, 5023 .vop_pathconf = zfs_freebsd_fifo_pathconf, 5024 .vop_fid = zfs_freebsd_fid, 5025 .vop_getacl = zfs_freebsd_getacl, 5026 .vop_setacl = zfs_freebsd_setacl, 5027 .vop_aclcheck = zfs_freebsd_aclcheck, 5028}; 5029