zfs_vnops.c revision 196299
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26/* Portions Copyright 2007 Jeremy Teo */ 27 28#include <sys/types.h> 29#include <sys/param.h> 30#include <sys/time.h> 31#include <sys/systm.h> 32#include <sys/sysmacros.h> 33#include <sys/resource.h> 34#include <sys/vfs.h> 35#include <sys/vnode.h> 36#include <sys/file.h> 37#include <sys/stat.h> 38#include <sys/kmem.h> 39#include <sys/taskq.h> 40#include <sys/uio.h> 41#include <sys/atomic.h> 42#include <sys/namei.h> 43#include <sys/mman.h> 44#include <sys/cmn_err.h> 45#include <sys/errno.h> 46#include <sys/unistd.h> 47#include <sys/zfs_dir.h> 48#include <sys/zfs_ioctl.h> 49#include <sys/fs/zfs.h> 50#include <sys/dmu.h> 51#include <sys/spa.h> 52#include <sys/txg.h> 53#include <sys/dbuf.h> 54#include <sys/zap.h> 55#include <sys/dirent.h> 56#include <sys/policy.h> 57#include <sys/sunddi.h> 58#include <sys/filio.h> 59#include <sys/zfs_ctldir.h> 60#include <sys/zfs_fuid.h> 61#include <sys/dnlc.h> 62#include <sys/zfs_rlock.h> 63#include <sys/extdirent.h> 64#include <sys/kidmap.h> 65#include <sys/bio.h> 66#include 
<sys/buf.h> 67#include <sys/sf_buf.h> 68#include <sys/sched.h> 69#include <sys/acl.h> 70 71/* 72 * Programming rules. 73 * 74 * Each vnode op performs some logical unit of work. To do this, the ZPL must 75 * properly lock its in-core state, create a DMU transaction, do the work, 76 * record this work in the intent log (ZIL), commit the DMU transaction, 77 * and wait for the intent log to commit if it is a synchronous operation. 78 * Moreover, the vnode ops must work in both normal and log replay context. 79 * The ordering of events is important to avoid deadlocks and references 80 * to freed memory. The example below illustrates the following Big Rules: 81 * 82 * (1) A check must be made in each zfs thread for a mounted file system. 83 * This is done avoiding races using ZFS_ENTER(zfsvfs). 84 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 85 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 86 * can return EIO from the calling function. 87 * 88 * (2) VN_RELE() should always be the last thing except for zil_commit() 89 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 90 * First, if it's the last reference, the vnode/znode 91 * can be freed, so the zp may point to freed memory. Second, the last 92 * reference will call zfs_zinactive(), which may induce a lot of work -- 93 * pushing cached pages (which acquires range locks) and syncing out 94 * cached atime changes. Third, zfs_zinactive() may require a new tx, 95 * which could deadlock the system if you were already holding one. 96 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 97 * 98 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 99 * as they can span dmu_tx_assign() calls. 100 * 101 * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). 102 * In normal operation, this will be TXG_NOWAIT. During ZIL replay, 103 * it will be a specific txg. Either way, dmu_tx_assign() never blocks. 
104 * This is critical because we don't want to block while holding locks. 105 * Note, in particular, that if a lock is sometimes acquired before 106 * the tx assigns, and sometimes after (e.g. z_lock), then failing to 107 * use a non-blocking assign can deadlock the system. The scenario: 108 * 109 * Thread A has grabbed a lock before calling dmu_tx_assign(). 110 * Thread B is in an already-assigned tx, and blocks for this lock. 111 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 112 * forever, because the previous txg can't quiesce until B's tx commits. 113 * 114 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 115 * then drop all locks, call dmu_tx_wait(), and try again. 116 * 117 * (5) If the operation succeeded, generate the intent log entry for it 118 * before dropping locks. This ensures that the ordering of events 119 * in the intent log matches the order in which they actually occurred. 120 * 121 * (6) At the end of each vnode op, the DMU tx must always commit, 122 * regardless of whether there were any errors. 123 * 124 * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) 125 * to ensure that synchronous semantics are provided when necessary. 126 * 127 * In general, this is how things should be ordered in each vnode op: 128 * 129 * ZFS_ENTER(zfsvfs); // exit if unmounted 130 * top: 131 * zfs_dirent_lock(&dl, ...) 
// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, seq, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/*
 * Open a file.  Rejects write-only opens of append-only files, runs the
 * anti-virus scan hook on non-empty regular files when enabled, and counts
 * synchronous (FSYNC/FDSYNC) opens so the ZIL knows a sync writer exists.
 */
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);

	/* Append-only files may only be opened for write in append mode. */
	if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		return (EPERM);
	}

	/* Virus scan non-empty, non-quarantined regular files on open. */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
	    zp->z_phys->zp_size > 0)
		if (fs_vscan(*vpp, cr, 0) != 0)
			return (EACCES);

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	return (0);
}

/*
 * Close a file.  Undoes the synchronous-open count from zfs_open(), releases
 * POSIX record locks and share reservations held by this process, and runs
 * the anti-virus scan hook (close-time pass) when enabled.
 */
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
	    zp->z_phys->zp_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off;	/* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_phys->zp_size;
	if (noff >= file_sz) {
		return (ENXIO);
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* end of file? */
	if ((error == ESRCH) || (noff > file_sz)) {
		/*
		 * Handle the virtual hole at the end of file.
		 */
		if (hole) {
			*off = file_sz;
			return (0);
		}
		return (ENXIO);
	}

	/* Never move the offset backwards; report the DMU's error if any. */
	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	case _FIOGDIO:
	case _FIOSDIO:
		return (0);

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (EFAULT);

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (EFAULT);
		return (0);
	}
	return (ENOTTY);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 */
static int
mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
{
	znode_t	*zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_object_t obj;
	vm_page_t m;
	struct sf_buf *sf;
	int64_t start, off;
	int len = nbytes;
	int error = 0;
	uint64_t dirbytes;	/* run of bytes with no resident page */

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	dirbytes = 0;
	VM_OBJECT_LOCK(obj);
	/* Walk the range one VM page at a time. */
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		uint64_t bytes = MIN(PAGESIZE - off, len);
		uint64_t fsize;

again:
		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
			uint64_t woff;
			caddr_t va;

			if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
				goto again;
			fsize = obj->un_pager.vnp.vnp_size;
			vm_page_busy(m);
			vm_page_lock_queues();
			vm_page_undirty(m);
			vm_page_unlock_queues();
			VM_OBJECT_UNLOCK(obj);
			/* Flush any accumulated non-mapped bytes first. */
			if (dirbytes > 0) {
				error = dmu_write_uio(os, zp->z_id, uio,
				    dirbytes, tx);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				woff = uio->uio_loffset - off;
				error = uiomove(va + off, bytes, UIO_WRITE, uio);
				/*
				 * The uiomove() above could have been partially
				 * successful, that's why we call dmu_write()
				 * below unconditionally. The page was marked
				 * non-dirty above and we would lose the changes
				 * without doing so. If the uiomove() failed
				 * entirely, well, we just write what we got
				 * before one more time.
				 */
				dmu_write(os, zp->z_id, woff,
				    MIN(PAGESIZE, fsize - woff), va, tx);
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
		} else {
			if (__predict_false(obj->cache != NULL)) {
				vm_page_cache_free(obj, OFF_TO_IDX(start),
				    OFF_TO_IDX(start) + 1);
			}
			/* No resident page: defer to a bulk dmu_write_uio(). */
			dirbytes += bytes;
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	VM_OBJECT_UNLOCK(obj);
	if (error == 0 && dirbytes > 0)
		error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
	return (error);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t	*zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_object_t obj;
	vm_page_t m;
	struct sf_buf *sf;
	int64_t start, off;
	caddr_t va;
	int len = nbytes;
	int error = 0;
	uint64_t dirbytes;	/* run of bytes with no resident page */

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	dirbytes = 0;
	VM_OBJECT_LOCK(obj);
	/* Walk the range one VM page at a time. */
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		uint64_t bytes = MIN(PAGESIZE - off, len);

again:
		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
			/* Page is resident and valid: copy from the page. */
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
				goto again;
			vm_page_busy(m);
			VM_OBJECT_UNLOCK(obj);
			/* Flush any accumulated non-mapped bytes first. */
			if (dirbytes > 0) {
				error = dmu_read_uio(os, zp->z_id, uio,
				    dirbytes);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				error = uiomove(va + off, bytes, UIO_READ, uio);
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * The code below is here to make sendfile(2) work
			 * correctly with ZFS. As pointed out by ups@
			 * sendfile(2) should be changed to use VOP_GETPAGES(),
			 * but it pessimize performance of sendfile/UFS, that's
			 * why I handle this special case in ZFS code.
			 */
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
				goto again;
			vm_page_busy(m);
			VM_OBJECT_UNLOCK(obj);
			if (dirbytes > 0) {
				error = dmu_read_uio(os, zp->z_id, uio,
				    dirbytes);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				/* Fill the (invalid) page directly from DMU. */
				error = dmu_read(os, zp->z_id, start + off,
				    bytes, (void *)(va + off));
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
			/* dmu_read() doesn't advance the uio; do it by hand. */
			if (error == 0)
				uio->uio_resid -= bytes;
		} else {
			/* No resident page: defer to a bulk dmu_read_uio(). */
			dirbytes += bytes;
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	VM_OBJECT_UNLOCK(obj);
	if (error == 0 && dirbytes > 0)
		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 * IN:	vp	- vnode of file to be read from.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	ssize_t		n, nbytes;
	int		error;
	rl_t		*rl;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	os = zfsvfs->z_os;

	/* Quarantined files may not be read. */
	if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (EACCES);
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC)
		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_phys->zp_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);

	/* Read in zfs_read_chunk_size chunks, aligned to chunk boundaries. */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		if (vn_has_cached_data(vp))
			error = mappedread(vp, nbytes, uio);
		else
			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = EIO;
			break;
		}

		n -= nbytes;
	}

out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified.
 * Any error will exit this routine as this is only a best
 * attempt to get the pages resident. This is a copy of ufs_trans_touch().
 */
static void
zfs_prefault_write(ssize_t n, struct uio *uio)
{
	struct iovec *iov;
	ulong_t cnt, incr;
	caddr_t p;

	/* Only user-space buffers can fault; kernel buffers are resident. */
	if (uio->uio_segflg != UIO_USERSPACE)
		return;

	iov = uio->uio_iov;

	while (n) {
		cnt = MIN(iov->iov_len, n);
		if (cnt == 0) {
			/* empty iov entry */
			iov++;
			continue;
		}
		n -= cnt;
		/*
		 * touch each page in this segment.
		 */
		p = iov->iov_base;
		while (cnt) {
			if (fubyte(p) == -1)
				return;
			incr = MIN(cnt, PAGESIZE);
			p += incr;
			cnt -= incr;
		}
		/*
		 * touch the last byte in case it straddles a page.
		 */
		p--;
		if (fubyte(p) == -1)
			return;
		iov++;
	}
}

/*
 * Write the bytes to a file.
 *
 * IN:	vp	- vnode of file to be written to.
 *	uio	- structure supplying write location, range info,
 *		  and data buffer.
 *	ioflag	- IO_APPEND flag set if in append mode.
 *	cr	- credentials of caller.
 *	ct	- caller context (NFS/CIFS fem monitor only)
 *
 * OUT:	uio	- updated offset and range.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	rlim64_t limit = MAXOFFSET_T;
	ssize_t	start_resid = uio->uio_resid;
	ssize_t	tx_bytes;
	uint64_t end_size;
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t	*zilog;
	offset_t woff;
	ssize_t	n, nbytes;
	rl_t	*rl;
	int	max_blksz = zfsvfs->z_max_blksz;
	uint64_t pflags;
	int	error;

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If immutable or not appending then return EPERM
	 */
	pflags = zp->z_phys->zp_flags;
	if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_phys->zp_size))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	zilog = zfsvfs->z_log;

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 */
	zfs_prefault_write(n, uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & IO_APPEND) {
		/*
		 * Range lock for a file append:
		 * The value for the start of range will be determined by
		 * zfs_range_lock() (to guarantee append semantics).
		 * If this write will cause the block size to increase,
		 * zfs_range_lock() will lock the entire file, so we must
		 * later reduce the range after we grow the block size.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		if (rl->r_len == UINT64_MAX) {
			/* overlocked, zp_size can't change */
			woff = uio->uio_loffset = zp->z_phys->zp_size;
		} else {
			woff = uio->uio_loffset = rl->r_off;
		}
	} else {
		woff = uio->uio_loffset;
		/*
		 * Validate file offset
		 */
		if (woff < 0) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * If we need to grow the block size then zfs_range_lock()
		 * will lock a wider range than we request here.
		 * Later after growing the block size we reduce the range.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	/* Clip the write so it does not extend past the offset limit. */
	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	end_size = MAX(zp->z_phys->zp_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		/*
		 * Start a transaction.
		 */
		woff = uio->uio_loffset;
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			/* See Big Rule (4): wait outside the tx and retry. */
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				continue;
			}
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (woff + nbytes > zp->z_phys->zp_size)
			vnode_pager_setsize(vp, woff + nbytes);

		rw_enter(&zp->z_map_lock, RW_READER);

		tx_bytes = uio->uio_resid;
		if (vn_has_cached_data(vp)) {
			rw_exit(&zp->z_map_lock);
			error = mappedwrite(vp, nbytes, uio, tx);
		} else {
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
			    uio, nbytes, tx);
			rw_exit(&zp->z_map_lock);
		}
		/* Bytes actually consumed from the uio by this chunk. */
		tx_bytes -= uio->uio_resid;

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(vp, cr,
		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
		    zp->z_phys->zp_uid == 0) != 0) {
			zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
		}
		mutex_exit(&zp->z_acl_lock);

		/*
		 * Update time stamp.  NOTE: This marks the bonus buffer as
		 * dirty, so we don't have to do it again for zp_size.
		 */
		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
			    uio->uio_loffset);
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (FSYNC | FDSYNC))
		zil_commit(zilog, zp->z_last_itx, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * dmu_sync() completion callback: releases the dbuf, range lock, and vnode
 * reference taken by zfs_get_data() for an indirect (EINPROGRESS) write.
 */
void
zfs_get_done(dmu_buf_t *db, void *vzgd)
{
	zgd_t *zgd = (zgd_t *)vzgd;
	rl_t *rl = zgd->zgd_rl;
	vnode_t *vp = ZTOV(rl->r_zp);
	int vfslocked;

	vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
	dmu_buf_rele(db, vzgd);
	zfs_range_unlock(rl);
	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(vp, NULL);
	zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
	kmem_free(zgd, sizeof (zgd_t));
	VFS_UNLOCK_GIANT(vfslocked);
}

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t off = lr->lr_offset;
	dmu_buf_t *db;
	rl_t *rl;
	zgd_t *zgd;
	int dlen = lr->lr_length;		/* length of user data */
	int error = 0;

	ASSERT(zio);
	ASSERT(dlen != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
		return (ENOENT);
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp), NULL);

		return (ENOENT);
	}

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		rl = zfs_range_lock(zp, off, dlen, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
	} else { /* indirect write */
		uint64_t boff; /* block starting offset */

		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and it's checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			if (ISP2(zp->z_blksz)) {
				boff = P2ALIGN_TYPED(off, zp->z_blksz,
				    uint64_t);
			} else {
				boff = 0;
			}
			dlen = zp->z_blksz;
			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
			if (zp->z_blksz == dlen)
				break;
			zfs_range_unlock(rl);
		}
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
		zgd->zgd_rl = rl;
		zgd->zgd_zilog = zfsvfs->z_log;
		zgd->zgd_bp = &lr->lr_blkptr;
		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
		ASSERT(boff == db->db_offset);
		lr->lr_blkoff = off - boff;
		error = dmu_sync(zio, db, &lr->lr_blkptr,
		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
		ASSERT((error && error != EINPROGRESS) ||
		    lr->lr_length <= zp->z_blksz);
		if (error == 0)
			zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
		/*
		 * If we get EINPROGRESS, then we need to wait for a
		 * write IO initiated by dmu_sync() to complete before
		 * we can release this dbuf.  We will finish everything
		 * up in the zfs_get_done() callback; ownership of db,
		 * rl, zgd, and the vnode hold passes to that callback.
		 */
		if (error == EINPROGRESS)
			return (0);
		dmu_buf_rele(db, zgd);
		kmem_free(zgd, sizeof (zgd_t));
	}
out:
	zfs_range_unlock(rl);
	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), NULL);
	return (error);
}

/*
 * Check access on a znode: ACE-mask requests go through zfs_zaccess(),
 * traditional rwx-mode requests through zfs_zaccess_rwx().
 */
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 * IN:	dvp	- vnode of directory to search.
 *	nm	- name of entry to lookup.
 *	pnp	- full pathname to lookup [UNUSED].
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	rdir	- root directory vnode [UNUSED].
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	direntflags - directory lookup flags
 *	realpnp	- returned pathname.
 *
 * OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error;
	int *direntflags = NULL;
	void *realpnp = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}
#endif

		/*
		 * We don't allow recursive attributes.
		 * Maybe someday we will.
		 */
		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (ENOTDIR);
	}

	/*
	 * Check accessibility of directory.
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0) {
		/*
		 * Convert device special files
		 */
		if (IS_DEVVP(*vpp)) {
			vnode_t	*svp;

			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
			VN_RELE(*vpp);
			if (svp == NULL)
				error = ENOSYS;
			else
				*vpp = svp;
		}
	}

	ZFS_EXIT(zfsvfs);

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}
	/* Lock the child vnode for anything but a "." lookup. */
	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
		int ltype = 0;

		/* For "..": drop the parent lock first to avoid deadlock. */
		if (cnp->cn_flags & ISDOTDOT) {
			ltype = VOP_ISLOCKED(dvp);
			VOP_UNLOCK(dvp, 0);
		}
		error = vn_lock(*vpp, cnp->cn_lkflags);
		if (cnp->cn_flags & ISDOTDOT)
			vn_lock(dvp, ltype | LK_RETRY);
		if (error != 0) {
			VN_RELE(*vpp);
			*vpp = NULL;
			return (error);
		}
	}

#ifdef FREEBSD_NAMECACHE
	/*
	 * Insert name into cache (as non-existent) if appropriate.
	 */
	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
		cache_enter(dvp, *vpp, cnp);
	/*
	 * Insert name into cache if appropriate.
	 */
	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}
#endif

	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 * IN:	dvp	- vnode of directory to put new file entry in.
 *	name	- name of new file entry.
 *	vap	- attributes of new file.
 *	excl	- flag indicating exclusive or non-exclusive mode.
 *	mode	- mode to open file with.
 *	cr	- credentials of caller.
 *	flag	- large file flag [UNUSED].
 *	ct	- caller context
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created or trunc'd entry.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	zfs_acl_t	*aclp = NULL;
	zfs_fuid_info_t	*fuidp = NULL;
	/* FreeBSD port: no vsec or case flags are passed down this path. */
	void		*vsecp = NULL;
	int		flag = 0;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/* Reject names that are not valid UTF-8 on utf8only filesystems. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
top:
	*vpp = NULL;

	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (strcmp(name, "..") == 0)
				error = EISDIR;
			ZFS_EXIT(zfsvfs);
			if (aclp)
				zfs_acl_free(aclp);
			return (error);
		}
	}
	if (vsecp && aclp == NULL) {
		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
		if (error) {
			ZFS_EXIT(zfsvfs);
			if (dl)
				zfs_dirent_unlock(dl);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */
		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			error = EINVAL;
			goto out;
		}

		/*
		 * Reserve everything the new znode may touch: its bonus
		 * buffer, possible FUID table updates, the parent's ZAP
		 * entry, and space for a non-trivial ACL.
		 */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
		    IS_EPHEMERAL(crgetgid(cr))) {
			if (zfsvfs->z_fuid_obj == 0) {
				dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
				    FUID_SIZE_ESTIMATE(zfsvfs));
				dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
				    FALSE, NULL);
			} else {
				dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
				dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
				    FUID_SIZE_ESTIMATE(zfsvfs));
			}
		}
		dmu_tx_hold_bonus(tx, dzp->z_id);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, SPA_MAXBLOCKSIZE);
		}
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				/* txg full: wait and retry from the top */
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			if (aclp)
				zfs_acl_free(aclp);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
		(void) zfs_link_create(dl, zp, tx, ZNEW);
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, fuidp, vap);
		if (fuidp)
			zfs_fuid_info_free(fuidp);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = EEXIST;
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = EISDIR;
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				/*
				 * NOTE(review): `ct' is not a parameter of
				 * this FreeBSD signature; this only compiles
				 * because vnevent_create() is presumably a
				 * no-op macro here -- confirm.
				 */
				vnevent_create(ZTOV(zp), ct);
			}
		}
	}
out:
	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		*vpp = ZTOV(zp);
		/*
		 * If vnode is for a device return a specfs vnode instead.
		 */
		if (IS_DEVVP(*vpp)) {
			struct vnode *svp;

			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
			VN_RELE(*vpp);
			if (svp == NULL) {
				error = ENOSYS;
			}
			*vpp = svp;
		}
	}
	if (aclp)
		zfs_acl_free(aclp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 * IN:	dvp	- vnode of directory to remove entry from.
 *	name	- name of entry to remove.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp = NULL;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = EPERM;
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	/*
	 * NOTE(review): the FreeBSD port forces the "delete now" fast
	 * path off here (see also the disabled "if (0 && unlinked)"
	 * below), so removed znodes always go through the unlinked set.
	 */
	may_delete_now = FALSE;

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (may_delete_now) {
		toobig =
		    zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
		/* XXX - do we need this if we are deleting? */
		dmu_tx_hold_bonus(tx, xattr_obj);
	}

	/* are there any additional acls */
	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
	    may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			/* txg full: wait and retry from the top */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	/*
	 * NOTE(review): deliberately dead code on FreeBSD (the "0 &&"
	 * keeps delete_now FALSE); retained from the Solaris original.
	 */
	if (0 && unlinked) {
		VI_LOCK(vp);
		delete_now = may_delete_now && !toobig &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    zp->z_phys->zp_xattr == xattr_obj &&
		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
		VI_UNLOCK(vp);
	}

	if (delete_now) {
		if (zp->z_phys->zp_xattr) {
			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
			ASSERT3U(error, ==, 0);
			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
			dmu_buf_will_dirty(xzp->z_dbuf, tx);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_phys->zp_links = 0;
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);
			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
		}
		mutex_enter(&zp->z_lock);
		VI_LOCK(vp);
		vp->v_count--;
		ASSERT3U(vp->v_count, ==, 0);
		VI_UNLOCK(vp);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
	} else if (unlinked) {
		/* last link went away; defer destruction to the delete queue */
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);

	if (!delete_now) {
		VN_RELE(vp);
	} else if (xzp) {
		/* this rele is delayed to prevent nesting transactions */
		VN_RELE(ZTOV(xzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 * IN:	dvp	- vnode of directory to add subdir to.
 *	dirname	- name of new directory.
 *	vap	- attributes of new directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created directory.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
    caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	zfs_acl_t	*aclp = NULL;
	zfs_fuid_info_t	*fuidp = NULL;
	int		zf = ZNEW;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
	    IS_EPHEMERAL(crgetgid(cr))))
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Subdirectories are not allowed inside an xattr directory. */
	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/* Reject names that are not valid UTF-8 on utf8only filesystems. */
	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	if (vap->va_mask & AT_XVATTR)
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

	/*
	 * First make sure the new directory doesn't exist.
	 */
top:
	*vpp = NULL;

	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (vsecp && aclp == NULL) {
		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
		if (error) {
			zfs_dirent_unlock(dl);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
	/*
	 * Add a new entry to the directory.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
	    IS_EPHEMERAL(crgetgid(cr))) {
		if (zfsvfs->z_fuid_obj == 0) {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
		} else {
			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
		}
	}
	if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, SPA_MAXBLOCKSIZE);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			/* txg full: wait and retry from the top */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		if (aclp)
			zfs_acl_free(aclp);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);

	if (aclp)
		zfs_acl_free(aclp);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap);

	if (fuidp)
		zfs_fuid_info_free(fuidp);
	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 * IN:	dvp	- vnode of directory to remove from.
 *	name	- name of directory to be removed.
 *	cwd	- vnode of current working directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zflg = ZEXISTS;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/* POSIX: cannot remove the current working directory. */
	if (vp == cwd) {
		error = EINVAL;
		goto out;
	}

	vnevent_rmdir(vp, dvp, name, ct);

	/*
	 * Grab a lock on the directory to make sure that noone is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			/* txg full: wait and retry from the top */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef FREEBSD_NAMECACHE
	cache_purge(dvp);
#endif

	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
#ifdef FREEBSD_NAMECACHE
	cache_purge(vp);
#endif
out:
	zfs_dirent_unlock(dl);

	VN_RELE(vp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as
 * many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure.
 *
 * IN:	vp	- vnode of directory to read.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *	eofp	- set to true if end-of-file detected.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;
	uint8_t		type;
	/* FreeBSD NFS-style cookie support */
	int		ncooks;
	u_long		*cooks = NULL;
	int		flags = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		/* user buffer or multiple iovecs: stage through a bounce buffer */
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 */
		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
		*cookies = cooks;
		*ncookies = ncooks;
	}
	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
#ifdef TODO
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);
#else
	check_sysattrs = 0;
#endif

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_phys->zp_parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				/* ENOENT from the cursor means end of directory */
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = ENXIO;
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);

			if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
#else
				panic("%s:%u: TODO", __func__, __LINE__);
#endif
			}
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = EINVAL;
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			odp->d_namlen = strlen(zap.za_name);
			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
			odp->d_type = type;
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0);

		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		/* we filled the caller's buffer in place; just advance it */
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	/* On failure, release any cookie array we handed to the caller. */
	if (error != 0 && cookies != NULL) {
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}

ulong_t zfs_fsync_sync_cnt = 4;

/*
 * Flush this file's pending intent-log records to stable storage by
 * committing the ZIL up to the file's last itx.
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (0);
}


/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 * IN:	vp	- vnode of file.
 *	vap	- va_mask identifies requested attributes.
 *		  If AT_XVATTR set, then optional attrs are requested
 *	flags	- ATTR_NOACLCHECK (CIFS server context)
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vap	- attribute values.
 *
 * RETURN:	0 (always succeeds)
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_phys_t *pzp;
	int	error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t links;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	pzp = zp->z_phys;

	mutex_enter(&zp->z_lock);

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
	    (pzp->zp_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			mutex_exit(&zp->z_lock);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	vap->va_type = IFTOVT(pzp->zp_mode);
	vap->va_mode = pzp->zp_mode & ~S_IFMT;
	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
//	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
	vap->va_nodeid = zp->z_id;
	/* The root directory's .zfs entry counts as an extra link. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = pzp->zp_links + 1;
	else
		links = pzp->zp_links;
	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
	vap->va_size = pzp->zp_size;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((pzp->zp_flags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((pzp->zp_flags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((pzp->zp_flags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((pzp->zp_flags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((pzp->zp_flags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG &&
		    (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			/*
			 * Only VREG files have anti-virus scanstamps, so we
			 * won't conflict with symlinks in the bonus buffer.
			 */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			if (len <= doi.doi_bonus_size) {
				/*
				 * pzp points to the start of the
				 * znode_phys_t. pzp + 1 points to the
				 * first byte after the znode_phys_t.
				 */
				(void) memcpy(xoap->xoa_av_scanstamp,
				    pzp + 1,
				    sizeof (xoap->xoa_av_scanstamp));
				XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
			ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
			XVA_SET_RTN(xvap, XAT_CREATETIME);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);

	mutex_exit(&zp->z_lock);

	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 * IN:	vp	- vnode of file to be modified.
 *	vap	- new attribute values.
 *		  If AT_XVATTR set, then optional attrs are being set
 *	flags	- ATTR_UTIME set if non-default time values provided.
 *		- ATTR_NOACLCHECK (CIFS context only).
 *	cr	- credentials of caller.
 *		ct	- caller context
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime updated, mtime updated if size changed.
 */
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	znode_phys_t	*pzp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	vattr_t		oldva;
	uint_t		mask = vap->va_mask;
	uint_t		saved_mask;
	int		trim_mask = 0;
	uint64_t	new_mode;
	znode_t		*attrzp;
	int		need_policy = FALSE;
	int		err;
	zfs_fuid_info_t	*fuidp = NULL;
	xvattr_t	*xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t	*xoap;
	zfs_acl_t	*aclp = NULL;
	boolean_t	skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	if (mask == 0)
		return (0);

	if (mask & AT_NOSET)
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	pzp = zp->z_phys;
	zilog = zfsvfs->z_log;

	/*
	 * Make sure that if we have ephemeral uid/gid or xvattr specified
	 * that file system is at proper version level
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
	    (mask & AT_XVATTR))) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	if (mask & AT_SIZE && vp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (EISDIR);
	}

	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * If this is an xvattr_t, then get a pointer to the structure of
	 * optional attributes.  If this is NULL, then we have a vattr_t.
	 */
	xoap = xva_getxoptattr(xvap);

	/*
	 * Immutable files can only alter immutable bit and atime
	 */
	if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	/*
	 * Verify timestamps doesn't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32bit syscalls can't
	 * handle times greater than 2039.  This check should be removed
	 * once large timestamps are fully supported.
	 */
	if (mask & (AT_ATIME | AT_MTIME)) {
		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
			ZFS_EXIT(zfsvfs);
			return (EOVERFLOW);
		}
	}

	/*
	 * Re-entry point after a DMU transaction restart (dmu_tx_assign()
	 * returning ERESTART under TXG_NOWAIT, below).  Everything from
	 * here on must be safe to repeat.
	 */
top:
	attrzp = NULL;

	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (EROFS);
	}

	/*
	 * First validate permissions
	 */

	if (mask & AT_SIZE) {
		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		/*
		 * XXX - Note, we are not providing any open
		 * mode flags here (like FNDELAY), so we may
		 * block if there are locks present... this
		 * should be addressed in openat().
		 */
		/* XXX - would it be OK to generate a log record here? */
		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	if (mask & (AT_ATIME|AT_MTIME) ||
	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
	    XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
		    skipaclchk, cr);

	if (mask & (AT_UID|AT_GID)) {
		int	idmask = (mask & (AT_UID|AT_GID));
		int	take_owner;
		int	take_group;

		/*
		 * NOTE: even if a new mode is being set,
		 * we may clear S_ISUID/S_ISGID bits.
		 */

		if (!(mask & AT_MODE))
			vap->va_mode = pzp->zp_mode;

		/*
		 * Take ownership or chgrp to group we are a member of
		 */

		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
		take_group = (mask & AT_GID) &&
		    zfs_groupmember(zfsvfs, vap->va_gid, cr);

		/*
		 * If both AT_UID and AT_GID are set then take_owner and
		 * take_group must both be set in order to allow taking
		 * ownership.
		 *
		 * Otherwise, send the check through secpolicy_vnode_setattr()
		 *
		 */

		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
		    ((idmask == AT_UID) && take_owner) ||
		    ((idmask == AT_GID) && take_group)) {
			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
			    skipaclchk, cr) == 0) {
				/*
				 * Remove setuid/setgid for non-privileged users
				 */
				secpolicy_setid_clear(vap, vp, cr);
				trim_mask = (mask & (AT_UID|AT_GID));
			} else {
				need_policy = TRUE;
			}
		} else {
			need_policy = TRUE;
		}
	}

	/*
	 * Snapshot the current mode and mapped uid/gid under z_lock so the
	 * policy checks below compare against a consistent "old" state.
	 */
	mutex_enter(&zp->z_lock);
	oldva.va_mode = pzp->zp_mode;
	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
	if (mask & AT_XVATTR) {
		/*
		 * Any optional attribute that is actually being changed
		 * (requested value differs from the current flag state)
		 * forces the request through the policy check.
		 */
		if ((need_policy == FALSE) &&
		    (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) &&
		    xoap->xoa_appendonly !=
		    ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) &&
		    xoap->xoa_nounlink !=
		    ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) &&
		    xoap->xoa_immutable !=
		    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_NODUMP) &&
		    xoap->xoa_nodump !=
		    ((pzp->zp_flags & ZFS_NODUMP) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) &&
		    xoap->xoa_av_modified !=
		    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) ||
		    ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
		    ((vp->v_type != VREG && xoap->xoa_av_quarantined) ||
		    xoap->xoa_av_quarantined !=
		    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) ||
		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
		    (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
			need_policy = TRUE;
		}
	}

	mutex_exit(&zp->z_lock);

	if (mask & AT_MODE) {
		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
			err = secpolicy_setid_setsticky_clear(vp, vap,
			    &oldva, cr);
			if (err) {
				ZFS_EXIT(zfsvfs);
				return (err);
			}
			trim_mask |= AT_MODE;
		} else {
			need_policy = TRUE;
		}
	}

	if (need_policy) {
		/*
		 * If trim_mask is set then take ownership
		 * has been granted or write_acl is present and user
		 * has the ability to modify mode.  In that case remove
		 * UID|GID and or MODE from mask so that
		 * secpolicy_vnode_setattr() doesn't revoke it.
		 */

		if (trim_mask) {
			saved_mask = vap->va_mask;
			vap->va_mask &= ~trim_mask;
		}
		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}

		if (trim_mask)
			vap->va_mask |= saved_mask;
	}

	/*
	 * secpolicy_vnode_setattr, or take ownership may have
	 * changed va_mask
	 */
	mask = vap->va_mask;

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) {
		/* Ephemeral ids need the FUID table; reserve space for it. */
		if (zfsvfs->z_fuid_obj == 0) {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
		} else {
			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
		}
	}

	if (mask & AT_MODE) {
		uint64_t pmode = pzp->zp_mode;

		/* Keep the file-type bits; replace only the permission bits. */
		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) {
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		if (pzp->zp_acl.z_acl_extern_obj) {
			/* Are we upgrading ACL from old V0 format to new V1 */
			if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
			    pzp->zp_acl.z_acl_version ==
			    ZFS_ACL_VERSION_INITIAL) {
				dmu_tx_hold_free(tx,
				    pzp->zp_acl.z_acl_extern_obj, 0,
				    DMU_OBJECT_END);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
				    0, aclp->z_acl_bytes);
			} else {
				dmu_tx_hold_write(tx,
				    pzp->zp_acl.z_acl_extern_obj, 0,
				    aclp->z_acl_bytes);
			}
		} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, aclp->z_acl_bytes);
		}
	}

	if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) {
		/* Ownership changes also apply to the extended-attr dir. */
		err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
		if (err) {
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			if (aclp)
				zfs_acl_free(aclp);
			return (err);
		}
		dmu_tx_hold_bonus(tx, attrzp->z_id);
	}

	err = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (err) {
		/* Drop everything acquired this pass before retrying/failing. */
		if (attrzp)
			VN_RELE(ZTOV(attrzp));

		if (aclp) {
			zfs_acl_free(aclp);
			aclp = NULL;
		}

		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (err);
	}

	dmu_buf_will_dirty(zp->z_dbuf, tx);

	/*
	 * Set each attribute requested.
	 * We group settings according to the locks they need to acquire.
	 *
	 * Note: you cannot set ctime directly, although it will be
	 * updated as a side-effect of calling this function.
	 */

	mutex_enter(&zp->z_lock);

	if (mask & AT_MODE) {
		mutex_enter(&zp->z_acl_lock);
		zp->z_phys->zp_mode = new_mode;
		err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
		ASSERT3U(err, ==, 0);
		mutex_exit(&zp->z_acl_lock);
	}

	if (attrzp)
		mutex_enter(&attrzp->z_lock);

	if (mask & AT_UID) {
		pzp->zp_uid = zfs_fuid_create(zfsvfs,
		    vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
		if (attrzp) {
			attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs,
			    vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
		}
	}

	if (mask & AT_GID) {
		pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid,
		    cr, ZFS_GROUP, tx, &fuidp);
		if (attrzp)
			attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs,
			    vap->va_gid, cr, ZFS_GROUP, tx, &fuidp);
	}

	if (aclp)
		zfs_acl_free(aclp);

	if (attrzp)
		mutex_exit(&attrzp->z_lock);

	if (mask & AT_ATIME)
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);

	if (mask & AT_MTIME)
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);

	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
	if (mask & AT_SIZE)
		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
	else if (mask != 0)
		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
	/*
	 * Do this after setting timestamps to prevent timestamp
	 * update from toggling bit
	 */

	if (xoap && (mask & AT_XVATTR)) {
		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			ASSERT(vp->v_type == VREG);

			/* Grow the bonus buffer if necessary. */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			if (len > doi.doi_bonus_size)
				VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
		}
		zfs_xvattr_set(zp, xvap);
	}

	if (mask != 0)
		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

	if (fuidp)
		zfs_fuid_info_free(fuidp);
	mutex_exit(&zp->z_lock);

	if (attrzp)
		VN_RELE(ZTOV(attrzp));

	dmu_tx_commit(tx);

	ZFS_EXIT(zfsvfs);
	return (err);
}

/*
 * One element of the chain of parent-directory locks accumulated by
 * zfs_rename_lock() and released by zfs_rename_unlock().
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;

/*
 * Drop locks and release vnodes that were held by zfs_rename_lock().
 */
static void
zfs_rename_unlock(zfs_zlock_t **zlpp)
{
	zfs_zlock_t *zl;

	while ((zl = *zlpp) != NULL) {
		if (zl->zl_znode != NULL)
			VN_RELE(ZTOV(zl->zl_znode));
		rw_exit(zl->zl_rwlock);
		*zlpp = zl->zl_next;
		kmem_free(zl, sizeof (*zl));
	}
}

/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	*oidp = &zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = &zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/*
		 * Record the lock (and, later, the znode hold) so
		 * zfs_rename_unlock() can release them in LIFO order.
		 */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (*oidp == szp->z_id)		/* We're a descendant of szp */
			return (EINVAL);

		if (*oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
			if (error)
				return (error);
			zl->zl_znode = zp;
		}
		oidp = &zp->z_phys->zp_parent;
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 * IN:	sdvp	- Source directory containing the "old entry".
 *	snm	- Old entry name.
 *	tdvp	- Target directory to contain the "new entry".
 *	tnm	- New entry name.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr;
	int		error = 0;
	int		zflg = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(sdzp);
	zilog = zfsvfs->z_log;

	/*
	 * Make sure we have the real vp for the target directory.
	 */
	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
		tdvp = realvp;

	if (tdvp->v_vfsp != sdvp->v_vfsp) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}

	tdzp = VTOZ(tdvp);
	ZFS_VERIFY_ZP(tdzp);
	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

	/*
	 * Re-entry point after a DMU transaction restart; all directory
	 * locks and vnode holds have been dropped by then.
	 */
top:
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		/*
		 * First compare the two name arguments without
		 * considering any case folding.
		 */
		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
		ASSERT(error == 0 || !zfsvfs->z_utf8);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
		/*
		 * If the file system is case-folding, then we may
		 * have some more checking to do.  A case-folding file
		 * system is either supporting mixed case sensitivity
		 * access or is completely case-insensitive.  Note
		 * that the file system is always case preserving.
		 *
		 * In mixed sensitivity mode case sensitive behavior
		 * is the default.  FIGNORECASE must be used to
		 * explicitly request case insensitive behavior.
		 *
		 * If the source and target names provided differ only
		 * by case (e.g., a request to rename 'tim' to 'Tim'),
		 * we will treat this as a special case in the
		 * case-insensitive mode: as long as the source name
		 * is an exact match, we will allow this to proceed as
		 * a name-change request.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
		    flags & FIGNORECASE)) &&
		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
		    &error) == 0) {
			/*
			 * case preserving rename request, require exact
			 * name matches
			 */
			zflg |= ZCIEXACT;
			zflg &= ~ZCILOOK;
		}
	}

	/* Acquire the two dirent locks in the order chosen above. */
	if (cmp < 0) {
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
		    ZEXISTS | zflg, NULL, NULL);
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
	} else {
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, zflg, NULL, NULL);
		serr = zfs_dirent_lock(&sdl,
		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
		    NULL, NULL);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}
		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
			serr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));
		if (strcmp(tnm, "..") == 0)
			terr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = ENOTDIR;
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
	if (tzp)
		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp)
		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
	if (tzp)
		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/* Drop every lock/hold before the retry or failure return. */
		if (zl != NULL)
			zfs_rename_unlock(&zl);
		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);
		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

	if (error == 0) {
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;

			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			ASSERT(error == 0);

			zfs_log_rename(zilog, tx,
			    TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
			    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);

			/* Update path information for the target vnode */
			vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
		}
#ifdef FREEBSD_NAMECACHE
		if (error == 0) {
			cache_purge(sdvp);
			cache_purge(tdvp);
		}
#endif
	}

	dmu_tx_commit(tx);
out:
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 * IN:	dvp	- Directory to contain new symbolic link.
 *	link	- Name for new symlink entry.
 *	vap	- Attributes of new entry.
 *	target	- Target path of new symlink.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
    cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	int		len = strlen(link);
	int		error;
	int		zflg = ZNEW;
	zfs_fuid_info_t	*fuidp = NULL;
	int		flags = 0;	/* FreeBSD: no case flags passed in */

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
/* Re-entry point after a DMU transaction restart. */
top:
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (ENAMETOOLONG);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_bonus(tx, dzp->z_id);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
	if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
		/* Ephemeral ids need the FUID table; reserve space for it. */
		if (zfsvfs->z_fuid_obj == 0) {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
		} else {
			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
		}
	}
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	dmu_buf_will_dirty(dzp->z_dbuf, tx);

	/*
	 * Create a new object for the symlink.
	 * Put the link content into bonus buffer if it will fit;
	 * otherwise, store it just like any other file data.
	 */
	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
		/* Link text fits in the bonus buffer after the znode_phys_t. */
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
		if (len != 0)
			bcopy(link, zp->z_phys + 1, len);
	} else {
		dmu_buf_t *dbp;

		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
		/*
		 * Nothing can access the znode yet so no locking needed
		 * for growing the znode's blocksize.
		 */
		zfs_grow_blocksize(zp, len, tx);

		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
		    zp->z_id, 0, FTAG, &dbp));
		dmu_buf_will_dirty(dbp, tx);

		ASSERT3U(len, <=, dbp->db_size);
		bcopy(link, dbp->db_data, len);
		dmu_buf_rele(dbp, FTAG);
	}
	zp->z_phys->zp_size = len;

	/*
	 * Insert the new object into the directory.
	 * NOTE(review): return value ignored -- presumably cannot fail for
	 * a ZNEW entry while the dirent lock is still held; confirm.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);
/*
 * NOTE(review): no goto targets this label in the visible code; it is
 * reached only by fall-through, with error == 0 at this point.
 */
out:
	if (error == 0) {
		uint64_t txtype = TX_SYMLINK;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
		*vpp = ZTOV(zp);
	}
	if (fuidp)
		zfs_fuid_info_free(fuidp);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 * IN:	vp	- vnode of symbolic link.
 *	uio	- structure to contain the link path.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	uio	- structure to contain the link path.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	size_t		bufsz;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	bufsz = (size_t)zp->z_phys->zp_size;
	/*
	 * Short link text lives in the bonus buffer right after the
	 * znode_phys_t (mirrors the layout chosen by zfs_symlink());
	 * longer link text is stored in the object's first data block.
	 */
	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
		error = uiomove(zp->z_phys + 1,
		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
	} else {
		dmu_buf_t *dbp;
		error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		error = uiomove(dbp->db_data,
		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
		dmu_buf_rele(dbp, FTAG);
	}

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 * IN:	tdvp	- Directory to contain new entry.
 *	svp	- vnode of new entry.
 *	name	- name of new entry.
 *	cr	- credentials of caller.
 *		ct	- caller context
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	tdvp - ctime|mtime updated
 *	 svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	vnode_t		*realvp;
	int		error;
	int		zf = ZNEW;
	uid_t		owner;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (VOP_REALVP(svp, &realvp, ct) == 0)
		svp = realvp;

	/* Hard links may not cross file system boundaries. */
	if (svp->v_vfsp != tdvp->v_vfsp) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}
	szp = VTOZ(svp);
	ZFS_VERIFY_ZP(szp);

	/* Reject names that are not valid UTF-8 when the fs requires it. */
	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

top:
	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) &&
	    secpolicy_basic_link(svp, cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, szp->z_id);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			/* Transaction group was full; wait and retry. */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dl, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
	}

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_dbuf == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		VI_LOCK(vp);
		vp->v_count = 0; /* count arrives as 1 */
		VI_UNLOCK(vp);
		vrecycle(vp, curthread);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		return;
	}

	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		/* Push the deferred atime update out to stable storage. */
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_bonus(tx, zp->z_id);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			dmu_buf_will_dirty(zp->z_dbuf, tx);
			mutex_enter(&zp->z_lock);
			zp->z_atime_dirty = 0;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}

	zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}

/* Both ZFS fid formats must fit within the generic struct fid. */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));

/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	uint32_t	gen;
	uint64_t	object = zp->z_id;
	zfid_short_t	*zfid;
	int		size, i;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	gen = (uint32_t)zp->z_gen;

	/*
	 * A filesystem other than the parent needs the long form, which
	 * additionally embeds the objset id (filled in below).
	 */
	size = (zfsvfs->z_parent != zfsvfs) ?
	    LONG_FID_LEN : SHORT_FID_LEN;
	fidp->fid_len = size;

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	/* Encode the object number least-significant byte first. */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t	*zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Report configurable pathname limits/options for vp (pathconf(2)).
 * Returns EOPNOTSUPP for names handled by the generic layer.
 */
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp, *xzp;
	zfsvfs_t *zfsvfs;
	zfs_dirlock_t *dl;
	int error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = INT_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		*valp = 64;
		return (0);

#if 0
	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		error = zfs_dirent_lock(&dl, zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
		if (error == 0) {
			zfs_dirent_unlock(dl);
			if (!zfs_dirempty(xzp))
				*valp = 1;
			VN_RELE(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);
#endif

	case _PC_ACL_EXTENDED:
		*valp = 0;	/* TODO */
		return (0);

	case _PC_MIN_HOLE_SIZE:
		*valp = (int)SPA_MINBLOCKSIZE;
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}

/*
 * Retrieve this znode's ACL; thin locking wrapper around zfs_getacl().
 */
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Set this znode's ACL; thin locking wrapper around zfs_setacl().
 */
/*ARGSUSED*/
static int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);
	return (error);
}

static int
zfs_freebsd_open(ap)
	struct vop_open_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t	*zp = VTOZ(vp);
	int error;

	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
	if (error == 0)
		/* Back the vnode with a VM object sized to the file. */
		vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
	return (error);
}

static int
zfs_freebsd_close(ap)
	struct vop_close_args /* {
		struct vnode *a_vp;
		int  a_fflag;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
}

static int
zfs_freebsd_ioctl(ap)
	struct vop_ioctl_args /* {
		struct vnode *a_vp;
		u_long a_command;
		caddr_t a_data;
		int a_fflag;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{

	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
	    ap->a_fflag, ap->a_cred, NULL, NULL));
}

static int
zfs_freebsd_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
}

static int
zfs_freebsd_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
}

static int
zfs_freebsd_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		accmode_t a_accmode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	/*
	 * ZFS itself only knows about VREAD, VWRITE and VEXEC, the rest
	 * we have to handle by calling vaccess().
	 */
	if ((ap->a_accmode & ~(VREAD|VWRITE|VEXEC)) != 0) {
		vnode_t *vp = ap->a_vp;
		znode_t *zp = VTOZ(vp);
		znode_phys_t *zphys = zp->z_phys;

		return (vaccess(vp->v_type, zphys->zp_mode, zphys->zp_uid,
		    zphys->zp_gid, ap->a_accmode, ap->a_cred, NULL));
	}

	return (zfs_access(ap->a_vp, ap->a_accmode, 0, ap->a_cred, NULL));
}

static int
zfs_freebsd_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	char nm[NAME_MAX + 1];

	ASSERT(cnp->cn_namelen < sizeof(nm));
	/* Copy the component into a NUL-terminated buffer for zfs_lookup(). */
	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));

	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
	    cnp->cn_cred, cnp->cn_thread, 0));
}

static int
zfs_freebsd_create(ap)
	struct vop_create_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;
	int mode;

	ASSERT(cnp->cn_flags & SAVENAME);

	vattr_init_mask(vap);
	mode = vap->va_mode & ALLPERMS;

	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
}

static int
zfs_freebsd_remove(ap)
	struct vop_remove_args /* {
		struct vnode *a_dvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
	    ap->a_cnp->cn_cred, NULL, 0));
}

static int
zfs_freebsd_mkdir(ap)
	struct vop_mkdir_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	vattr_init_mask(vap);

	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
	    ap->a_cnp->cn_cred, NULL, 0, NULL));
}

static int
zfs_freebsd_rmdir(ap)
	struct vop_rmdir_args /* {
		struct vnode *a_dvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;

	ASSERT(cnp->cn_flags & SAVENAME);

	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
}

static int
zfs_freebsd_readdir(ap)
	struct vop_readdir_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
		int *a_eofflag;
		int *a_ncookies;
		u_long **a_cookies;
	} */ *ap;
{

	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
	    ap->a_ncookies, ap->a_cookies));
}

static int
zfs_freebsd_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		int a_waitfor;
		struct thread *a_td;
	} */ *ap;
{

	/* Flush dirty vnode pages first, then sync the znode itself. */
	vop_stdfsync(ap);
	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
}

static int
zfs_freebsd_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;
	xvattr_t xvap;
	u_long fflags = 0;
	int error;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;
	xvap.xva_vattr.va_mask |= AT_XVATTR;

	/* Convert chflags into ZFS-type flags. */
	/* XXX: what about SF_SETTABLE?.
	 */
	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
	XVA_SET_REQ(&xvap, XAT_NODUMP);
	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
	if (error != 0)
		return (error);

	/* Convert ZFS xattr into chflags. */
#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
		fflags |= (fflag);					\
} while (0)
	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
	    xvap.xva_xoptattrs.xoa_immutable);
	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
	    xvap.xva_xoptattrs.xoa_appendonly);
	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
	    xvap.xva_xoptattrs.xoa_nounlink);
	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
	    xvap.xva_xoptattrs.xoa_nodump);
#undef	FLAG_CHECK
	*vap = xvap.xva_vattr;
	vap->va_flags = fflags;
	return (0);
}

static int
zfs_freebsd_setattr(ap)
	struct vop_setattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	vattr_t *vap = ap->a_vap;
	cred_t *cred = ap->a_cred;
	xvattr_t xvap;
	u_long fflags;
	uint64_t zflags;

	vattr_init_mask(vap);
	vap->va_mask &= ~AT_NOSET;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;

	zflags = VTOZ(vp)->z_phys->zp_flags;

	if (vap->va_flags != VNOVAL) {
		int error;

		fflags = vap->va_flags;
		/* Only these four chflags(2) bits have ZFS equivalents. */
		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
			return (EOPNOTSUPP);
		/*
		 * Callers may only modify the file flags on objects they
		 * have VADMIN rights for.
		 */
		if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
			return (error);
		/*
		 * Unprivileged processes are not permitted to unset system
		 * flags, or modify flags if any system flags are set.
		 * Privileged non-jail processes may not modify system flags
		 * if securelevel > 0 and any existing system flags are set.
		 * Privileged jail processes behave like privileged non-jail
		 * processes if the security.jail.chflags_allowed sysctl is
		 * is non-zero; otherwise, they behave like unprivileged
		 * processes.
		 */
		if (priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
			if (zflags &
			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
				error = securelevel_gt(cred, 0);
				if (error)
					return (error);
			}
		} else {
			if (zflags &
			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
				return (EPERM);
			}
			if (fflags &
			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
				return (EPERM);
			}
		}

/* Request an xattr update only for flags that actually change state. */
#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
		XVA_SET_REQ(&xvap, (xflag));				\
		(xfield) = ((fflags & (fflag)) != 0);			\
	}								\
} while (0)
		/* Convert chflags into ZFS-type flags. */
		/* XXX: what about SF_SETTABLE?.
		 */
		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
		    xvap.xva_xoptattrs.xoa_immutable);
		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
		    xvap.xva_xoptattrs.xoa_appendonly);
		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
		    xvap.xva_xoptattrs.xoa_nounlink);
		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
		    xvap.xva_xoptattrs.xoa_nodump);
#undef	FLAG_CHANGE
	}
	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
}

static int
zfs_freebsd_rename(ap)
	struct vop_rename_args /* {
		struct vnode *a_fdvp;
		struct vnode *a_fvp;
		struct componentname *a_fcnp;
		struct vnode *a_tdvp;
		struct vnode *a_tvp;
		struct componentname *a_tcnp;
	} */ *ap;
{
	vnode_t *fdvp = ap->a_fdvp;
	vnode_t *fvp = ap->a_fvp;
	vnode_t *tdvp = ap->a_tdvp;
	vnode_t *tvp = ap->a_tvp;
	int error;

	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));

	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);

	/* VOP_RENAME contract: drop the references on all four vnodes. */
	if (tdvp == tvp)
		VN_RELE(tdvp);
	else
		VN_URELE(tdvp);
	if (tvp)
		VN_URELE(tvp);
	VN_RELE(fdvp);
	VN_RELE(fvp);

	return (error);
}

static int
zfs_freebsd_symlink(ap)
	struct vop_symlink_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
		char *a_target;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;

	ASSERT(cnp->cn_flags & SAVENAME);

	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode.
				 */
	vattr_init_mask(vap);

	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
}

static int
zfs_freebsd_readlink(ap)
	struct vop_readlink_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
}

static int
zfs_freebsd_link(ap)
	struct vop_link_args /* {
		struct vnode *a_tdvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;

	ASSERT(cnp->cn_flags & SAVENAME);

	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
}

static int
zfs_freebsd_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;

	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
	return (0);
}

/*
 * Deferred znode teardown, run from a taskqueue when zfs_freebsd_reclaim()
 * could not take the object mutex without risking deadlock.
 */
static void
zfs_reclaim_complete(void *arg, int pending)
{
	znode_t	*zp = arg;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_LOG(1, "zp=%p", zp);
	ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
	zfs_znode_free(zp);
}

static int
zfs_freebsd_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs;

	ASSERT(zp != NULL);

	/*
	 * Destroy the vm object and flush associated pages.
	 */
	vnode_destroy_vobject(vp);

	mutex_enter(&zp->z_lock);
	ASSERT(zp->z_phys);
	ZTOV(zp) = NULL;
	if (!zp->z_unlinked) {
		int locked;

		zfsvfs = zp->z_zfsvfs;
		mutex_exit(&zp->z_lock);
		/* locked: 2 = mutex already held by us, 1 = acquired here, 0 = busy */
		locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ?
		    2 : ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
		if (locked == 0) {
			/*
			 * Lock can't be obtained due to deadlock possibility,
			 * so defer znode destruction.
			 */
			TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
			taskqueue_enqueue(taskqueue_thread, &zp->z_task);
		} else {
			zfs_znode_dmu_fini(zp);
			/* Only drop the mutex if we acquired it ourselves. */
			if (locked == 1)
				ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
			zfs_znode_free(zp);
		}
	} else {
		mutex_exit(&zp->z_lock);
	}
	VI_LOCK(vp);
	vp->v_data = NULL;
	ASSERT(vp->v_holdcnt >= 1);
	VI_UNLOCK(vp);
	return (0);
}

static int
zfs_freebsd_fid(ap)
	struct vop_fid_args /* {
		struct vnode *a_vp;
		struct fid *a_fid;
	} */ *ap;
{

	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
}

static int
zfs_freebsd_pathconf(ap)
	struct vop_pathconf_args /* {
		struct vnode *a_vp;
		int a_name;
		register_t *a_retval;
	} */ *ap;
{
	ulong_t val;
	int error;

	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
	if (error == 0)
		*ap->a_retval = val;
	else if (error == EOPNOTSUPP)
		/* Fall back to the generic handler for names ZFS ignores. */
		error = vop_stdpathconf(ap);
	return (error);
}

/*
 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
 * extended attribute name:
 *
 * NAMESPACE	PREFIX
 * system	freebsd:system:
 * user		(none, can be used to access ZFS fsattr(5) attributes
 *		created on Solaris)
 */
static int
zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
    size_t size)
{
	const char *namespace, *prefix, *suffix;

	/* We don't allow '/' character in attribute name. */
	if (strchr(name, '/') != NULL)
		return (EINVAL);
	/* We don't allow attribute names that start with "freebsd:" string. */
	if (strncmp(name, "freebsd:", 8) == 0)
		return (EINVAL);

	bzero(attrname, size);

	switch (attrnamespace) {
	case EXTATTR_NAMESPACE_USER:
#if 0
		prefix = "freebsd:";
		namespace = EXTATTR_NAMESPACE_USER_STRING;
		suffix = ":";
#else
		/*
		 * This is the default namespace by which we can access all
		 * attributes created on Solaris.
		 */
		prefix = namespace = suffix = "";
#endif
		break;
	case EXTATTR_NAMESPACE_SYSTEM:
		prefix = "freebsd:";
		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
		suffix = ":";
		break;
	case EXTATTR_NAMESPACE_EMPTY:
	default:
		return (EINVAL);
	}
	/* snprintf() reports truncation via its would-be length. */
	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
	    name) >= size) {
		return (ENAMETOOLONG);
	}
	return (0);
}

/*
 * Vnode operating to retrieve a named extended attribute.
 */
static int
zfs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrname[255];
	struct vattr va;
	vnode_t *xvp = NULL, *vp;
	int error, flags;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);

	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	    sizeof(attrname));
	if (error != 0)
		return (error);

	ZFS_ENTER(zfsvfs);

	/* Attributes are stored as files in a hidden xattr directory. */
	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	flags = FREAD;
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
	    xvp, td);
	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		if (error == ENOENT)
			error = ENOATTR;
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ap->a_size != NULL) {
		error = VOP_GETATTR(vp, &va, ap->a_cred);
		if (error == 0)
			*ap->a_size = (size_t)va.va_size;
	} else if (ap->a_uio != NULL)
		error = VOP_READ(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);

	VOP_UNLOCK(vp, 0);
	vn_close(vp, flags, ap->a_cred, td);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Vnode operation to remove a named attribute.
 */
int
zfs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrname[255];
	struct vattr va;
	vnode_t *xvp = NULL, *vp;
	int error, flags;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error != 0)
		return (error);

	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	    sizeof(attrname));
	if (error != 0)
		return (error);

	ZFS_ENTER(zfsvfs);

	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE,
	    UIO_SYSSPACE, attrname, xvp, td);
	error = namei(&nd);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		if (error == ENOENT)
			error = ENOATTR;
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);

	vput(nd.ni_dvp);
	if (vp == nd.ni_dvp)
		vrele(vp);
	else
		vput(vp);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
zfs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrname[255];
	struct vattr va;
	vnode_t *xvp = NULL, *vp;
	int error, flags;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error != 0)
		return (error);

	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	    sizeof(attrname));
	if (error != 0)
		return (error);

	ZFS_ENTER(zfsvfs);

	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR | CREATE_XATTR_DIR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	flags = FFLAGS(O_WRONLY | O_CREAT);
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
	    xvp, td);
	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Truncate any existing value, then write the new one. */
	VATTR_NULL(&va);
	va.va_size = 0;
	error = VOP_SETATTR(vp, &va, ap->a_cred);
	if (error == 0)
		VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);

	VOP_UNLOCK(vp, 0);
	vn_close(vp, flags, ap->a_cred, td);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Vnode operation to retrieve extended
 * attributes on a vnode.
 */
static int
zfs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrprefix[16];
	u_char dirbuf[sizeof(struct dirent)];
	struct dirent *dp;
	struct iovec aiov;
	struct uio auio, *uio = ap->a_uio;
	size_t *sizep = ap->a_size;
	size_t plen;
	vnode_t *xvp = NULL, *vp;
	int done, error, eof, pos;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
	    sizeof(attrprefix));
	if (error != 0)
		return (error);
	plen = strlen(attrprefix);

	ZFS_ENTER(zfsvfs);

	if (sizep != NULL)
		*sizep = 0;

	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		/*
		 * ENOATTR means that the EA directory does not yet exist,
		 * i.e. there are no extended attributes there.
		 */
		if (error == ENOATTR)
			error = 0;
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
	    UIO_SYSSPACE, ".", xvp, td);
	error = namei(&nd);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;

	/* Walk the xattr directory, emitting one entry per matching name. */
	do {
		u_char nlen;

		aiov.iov_base = (void *)dirbuf;
		aiov.iov_len = sizeof(dirbuf);
		auio.uio_resid = sizeof(dirbuf);
		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
		done = sizeof(dirbuf) - auio.uio_resid;
		if (error != 0)
			break;
		for (pos = 0; pos < done;) {
			dp = (struct dirent *)(dirbuf + pos);
			pos += dp->d_reclen;
			/*
			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
			 * is what we get when attribute was created on Solaris.
			 */
			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
				continue;
			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
				continue;
			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
				continue;
			nlen = dp->d_namlen - plen;
			if (sizep != NULL)
				*sizep += 1 + nlen;
			else if (uio != NULL) {
				/*
				 * Format of extattr name entry is one byte for
				 * length and the rest for name.
				 */
				error = uiomove(&nlen, 1, uio->uio_rw, uio);
				if (error == 0) {
					error = uiomove(dp->d_name + plen, nlen,
					    uio->uio_rw, uio);
				}
				if (error != 0)
					break;
			}
		}
	} while (!eof && error == 0);

	vput(vp);
	ZFS_EXIT(zfsvfs);

	return (error);
}

int
zfs_freebsd_getacl(ap)
	struct vop_getacl_args /* {
		struct vnode *vp;
		acl_type_t type;
		struct acl *aclp;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{
	int		error;
	vsecattr_t	vsecattr;

	if (ap->a_type != ACL_TYPE_NFS4)
		return (EOPNOTSUPP);

	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
		return (error);

	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
	/* zfs_getsecattr() allocated the ace list; free it after conversion. */
	if (vsecattr.vsa_aclentp != NULL)
		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);

	return (error);
}

int
zfs_freebsd_setacl(ap)
	struct vop_setacl_args /* {
		struct vnode *vp;
		acl_type_t type;
		struct acl *aclp;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{
	int		error;
	vsecattr_t	vsecattr;
	int		aclbsize;	/* size of acl list in bytes */
	aclent_t	*aaclp;

	if (ap->a_type != ACL_TYPE_NFS4)
		return (EOPNOTSUPP);

	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
		return (EINVAL);

	/*
	 * With NFS4 ACLs, chmod(2) may need to add additional entries,
	 * splitting every entry into two and appending "canonical six"
	 * entries at the end.  Don't allow for setting an ACL that would
	 * cause chmod(2) to run out of ACL entries.
	 *
	 * NOTE(review): acl_cnt is bounded by MAX_ACL_ENTRIES above but
	 * checked against ACL_MAX_ENTRIES here -- confirm the two
	 * constants are consistent.
	 */
	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
		return (ENOSPC);

	vsecattr.vsa_mask = VSA_ACE;
	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
	aaclp = vsecattr.vsa_aclentp;
	vsecattr.vsa_aclentsz = aclbsize;

	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
	kmem_free(aaclp, aclbsize);

	return (error);
}

int
zfs_freebsd_aclcheck(ap)
	struct vop_aclcheck_args /* {
		struct vnode *vp;
		acl_type_t type;
		struct acl *aclp;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{

	return (EOPNOTSUPP);
}

struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;

struct vop_vector zfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_access =		zfs_freebsd_access,
#ifdef FREEBSD_NAMECACHE
	.vop_lookup =		vfs_cache_lookup,
	.vop_cachedlookup =	zfs_freebsd_lookup,
#else
	.vop_lookup =		zfs_freebsd_lookup,
#endif
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_create =		zfs_freebsd_create,
	.vop_mknod =		zfs_freebsd_create,
	.vop_mkdir =		zfs_freebsd_mkdir,
	.vop_readdir =		zfs_freebsd_readdir,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_open =		zfs_freebsd_open,
	.vop_close =		zfs_freebsd_close,
	.vop_rmdir =		zfs_freebsd_rmdir,
	.vop_ioctl =		zfs_freebsd_ioctl,
	.vop_link =		zfs_freebsd_link,
	.vop_symlink =		zfs_freebsd_symlink,
	.vop_readlink =		zfs_freebsd_readlink,
	.vop_read =		zfs_freebsd_read,
	.vop_write =		zfs_freebsd_write,
	.vop_remove =		zfs_freebsd_remove,
	.vop_rename =		zfs_freebsd_rename,
	.vop_pathconf =		zfs_freebsd_pathconf,
	.vop_bmap =		VOP_EOPNOTSUPP,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getextattr =	zfs_getextattr,
	.vop_deleteextattr =	zfs_deleteextattr,
	.vop_setextattr =	zfs_setextattr,
	.vop_listextattr =	zfs_listextattr,
#ifdef notyet
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
#endif
};

/* FIFOs route through the fifo layer; data ops must never be reached. */
struct vop_vector zfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		VOP_PANIC,
	.vop_access =		zfs_freebsd_access,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_write =		VOP_PANIC,
	.vop_fid =		zfs_freebsd_fid,
#ifdef notyet
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
#endif
};