zfs_vnops.c revision 185029
128019Sjoerg/* 228021Sjoerg * CDDL HEADER START 328021Sjoerg * 428021Sjoerg * The contents of this file are subject to the terms of the 528021Sjoerg * Common Development and Distribution License (the "License"). 628021Sjoerg * You may not use this file except in compliance with the License. 728021Sjoerg * 828021Sjoerg * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 928021Sjoerg * or http://www.opensolaris.org/os/licensing. 1028021Sjoerg * See the License for the specific language governing permissions 1128021Sjoerg * and limitations under the License. 1228021Sjoerg * 1328021Sjoerg * When distributing Covered Code, include this CDDL HEADER in each 1428021Sjoerg * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 1528021Sjoerg * If applicable, add the following below this CDDL HEADER, with the 1628021Sjoerg * fields enclosed by brackets "[]" replaced with your own identifying 1728021Sjoerg * information: Portions Copyright [yyyy] [name of copyright owner] 1828021Sjoerg * 1928021Sjoerg * CDDL HEADER END 2028021Sjoerg */ 2128021Sjoerg/* 2228021Sjoerg * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 2328019Sjoerg * Use is subject to license terms. 
2428019Sjoerg */ 2528021Sjoerg 2628019Sjoerg/* Portions Copyright 2007 Jeremy Teo */ 2728019Sjoerg 2828019Sjoerg#include <sys/types.h> 2928019Sjoerg#include <sys/param.h> 3028019Sjoerg#include <sys/time.h> 3128019Sjoerg#include <sys/systm.h> 3228019Sjoerg#include <sys/sysmacros.h> 3328019Sjoerg#include <sys/resource.h> 3428019Sjoerg#include <sys/vfs.h> 3528019Sjoerg#include <sys/vnode.h> 3628019Sjoerg#include <sys/file.h> 3728019Sjoerg#include <sys/stat.h> 3828019Sjoerg#include <sys/kmem.h> 3928019Sjoerg#include <sys/taskq.h> 4028019Sjoerg#include <sys/uio.h> 4128019Sjoerg#include <sys/atomic.h> 4228019Sjoerg#include <sys/namei.h> 4328019Sjoerg#include <sys/mman.h> 4428019Sjoerg#include <sys/cmn_err.h> 4528019Sjoerg#include <sys/errno.h> 4628019Sjoerg#include <sys/unistd.h> 4728019Sjoerg#include <sys/zfs_dir.h> 4828019Sjoerg#include <sys/zfs_acl.h> 4928019Sjoerg#include <sys/zfs_ioctl.h> 5028019Sjoerg#include <sys/fs/zfs.h> 5128019Sjoerg#include <sys/dmu.h> 5228019Sjoerg#include <sys/spa.h> 5328019Sjoerg#include <sys/txg.h> 5428021Sjoerg#include <sys/dbuf.h> 5528021Sjoerg#include <sys/zap.h> 5650476Speter#include <sys/dirent.h> 5728021Sjoerg#include <sys/policy.h> 5828021Sjoerg#include <sys/sunddi.h> 5928019Sjoerg#include <sys/filio.h> 6028021Sjoerg#include <sys/zfs_ctldir.h> 6128019Sjoerg#include <sys/zfs_fuid.h> 6228019Sjoerg#include <sys/dnlc.h> 6328021Sjoerg#include <sys/zfs_rlock.h> 6428021Sjoerg#include <sys/extdirent.h> 6528019Sjoerg#include <sys/kidmap.h> 6628019Sjoerg#include <sys/bio.h> 6728019Sjoerg#include <sys/buf.h> 6828019Sjoerg#include <sys/sf_buf.h> 6928019Sjoerg#include <sys/sched.h> 7048614Sobrien 7148614Sobrien/* 7248614Sobrien * Programming rules. 7348614Sobrien * 7428021Sjoerg * Each vnode op performs some logical unit of work. 
To do this, the ZPL must 7528019Sjoerg * properly lock its in-core state, create a DMU transaction, do the work, 7648614Sobrien * record this work in the intent log (ZIL), commit the DMU transaction, 7748614Sobrien * and wait for the intent log to commit if it is a synchronous operation. 7848614Sobrien * Moreover, the vnode ops must work in both normal and log replay context. 7948614Sobrien * The ordering of events is important to avoid deadlocks and references 8048614Sobrien * to freed memory. The example below illustrates the following Big Rules: 8148614Sobrien * 8248614Sobrien * (1) A check must be made in each zfs thread for a mounted file system. 8348614Sobrien * This is done avoiding races using ZFS_ENTER(zfsvfs). 8428021Sjoerg * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 8528019Sjoerg * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 8648614Sobrien * can return EIO from the calling function. 8748614Sobrien * 8828019Sjoerg * (2) VN_RELE() should always be the last thing except for zil_commit() 8928021Sjoerg * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 9028021Sjoerg * First, if it's the last reference, the vnode/znode 9128021Sjoerg * can be freed, so the zp may point to freed memory. Second, the last 9228021Sjoerg * reference will call zfs_zinactive(), which may induce a lot of work -- 9328019Sjoerg * pushing cached pages (which acquires range locks) and syncing out 9428021Sjoerg * cached atime changes. Third, zfs_zinactive() may require a new tx, 9528021Sjoerg * which could deadlock the system if you were already holding one. 9628021Sjoerg * 9728021Sjoerg * (3) All range locks must be grabbed before calling dmu_tx_assign(), 9828019Sjoerg * as they can span dmu_tx_assign() calls. 9928021Sjoerg * 10028019Sjoerg * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). 10128021Sjoerg * In normal operation, this will be TXG_NOWAIT. During ZIL replay, 10228164Sache * it will be a specific txg. 
Either way, dmu_tx_assign() never blocks. 10328164Sache * This is critical because we don't want to block while holding locks. 10428021Sjoerg * Note, in particular, that if a lock is sometimes acquired before 10528021Sjoerg * the tx assigns, and sometimes after (e.g. z_lock), then failing to 10628021Sjoerg * use a non-blocking assign can deadlock the system. The scenario: 10728021Sjoerg * 10828021Sjoerg * Thread A has grabbed a lock before calling dmu_tx_assign(). 10928019Sjoerg * Thread B is in an already-assigned tx, and blocks for this lock. 11028021Sjoerg * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 11128021Sjoerg * forever, because the previous txg can't quiesce until B's tx commits. 11228021Sjoerg * 11328021Sjoerg * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 11428021Sjoerg * then drop all locks, call dmu_tx_wait(), and try again. 11528021Sjoerg * 11628021Sjoerg * (5) If the operation succeeded, generate the intent log entry for it 11728019Sjoerg * before dropping locks. This ensures that the ordering of events 11828021Sjoerg * in the intent log matches the order in which they actually occurred. 11948614Sobrien * 12028021Sjoerg * (6) At the end of each vnode op, the DMU tx must always commit, 12128021Sjoerg * regardless of whether there were any errors. 12228021Sjoerg * 12328019Sjoerg * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) 12428021Sjoerg * to ensure that synchronous semantics are provided when necessary. 12548614Sobrien * 12628021Sjoerg * In general, this is how things should be ordered in each vnode op: 12728021Sjoerg * 12828021Sjoerg * ZFS_ENTER(zfsvfs); // exit if unmounted 12928019Sjoerg * top: 13028021Sjoerg * zfs_dirent_lock(&dl, ...) 
 *	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, seq, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/*
 * Open a file.  Rejects a write-open of an append-only file unless
 * FAPPEND is set, and runs the fs_vscan() hook on regular,
 * non-quarantined, non-empty files before granting access.
 */
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);

	/* Honor ZFS_APPENDONLY: writes must be append-mode writes. */
	if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		return (EPERM);
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
	    zp->z_phys->zp_size > 0)
		if (fs_vscan(*vpp, cr, 0) != 0)
			return (EACCES);

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	return (0);
}

/*
 * Close a file.  Undoes the synchronous-open accounting done by
 * zfs_open(), releases file/share locks held by the closing process,
 * and re-runs the fs_vscan() hook on close.
 */
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
	    zp->z_phys->zp_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_phys->zp_size;
	if (noff >= file_sz) {
		return (ENXIO);
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* end of file? */
	if ((error == ESRCH) || (noff > file_sz)) {
		/*
		 * Handle the virtual hole at the end of file.
		 */
		if (hole) {
			*off = file_sz;
			return (0);
		}
		return (ENXIO);
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/*
 * ioctl entry point.  Supports _FIO_SEEK_DATA/_FIO_SEEK_HOLE via
 * zfs_holey(); _FIOFFS/_FIOGDIO/_FIOSDIO are accepted as no-ops.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	case _FIOGDIO:
	case _FIOSDIO:
		return (0);

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (EFAULT);

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (EFAULT);
		return (0);
	}
	return (ENOTTY);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 */
static int
mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
{
	znode_t	*zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_object_t obj;
	vm_page_t m;
	struct sf_buf *sf;
	int64_t start, off;
	int len = nbytes;
	int error = 0;
	uint64_t dirbytes;	/* run of bytes with no resident page */

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	dirbytes = 0;
	VM_OBJECT_LOCK(obj);
	/* Walk the range one page at a time. */
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		uint64_t bytes = MIN(PAGESIZE - off, len);
		uint64_t fsize;

again:
		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
			uint64_t woff;
			caddr_t va;

			if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
				goto again;
			fsize = obj->un_pager.vnp.vnp_size;
			vm_page_busy(m);
			vm_page_lock_queues();
			vm_page_undirty(m);
			vm_page_unlock_queues();
			VM_OBJECT_UNLOCK(obj);
			/* Flush any accumulated non-mapped bytes first. */
			if (dirbytes > 0) {
				error = dmu_write_uio(os, zp->z_id, uio,
				    dirbytes, tx);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				woff = uio->uio_loffset - off;
				error = uiomove(va + off, bytes, UIO_WRITE, uio);
				/*
				 * The uiomove() above could have been partially
				 * successful, that's why we call dmu_write()
				 * below unconditionally.  The page was marked
				 * non-dirty above and we would lose the changes
				 * without doing so.  If the uiomove() failed
				 * entirely, well, we just write what we got
				 * before one more time.
				 */
				dmu_write(os, zp->z_id, woff,
				    MIN(PAGESIZE, fsize - woff), va, tx);
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
		} else {
			/*
			 * No resident page; discard any stale cached page
			 * for this index and let the DMU write cover it.
			 */
			if (__predict_false(obj->cache != NULL)) {
				vm_page_cache_free(obj, OFF_TO_IDX(start),
				    OFF_TO_IDX(start) + 1);
			}
			dirbytes += bytes;
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	VM_OBJECT_UNLOCK(obj);
	/* Flush the trailing run of non-mapped bytes, if any. */
	if (error == 0 && dirbytes > 0)
		error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
	return (error);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_object_t obj;
	vm_page_t m;
	struct sf_buf *sf;
	int64_t start, off;
	caddr_t va;
	int len = nbytes;
	int error = 0;
	uint64_t dirbytes;	/* run of bytes with no resident page */

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	dirbytes = 0;
	VM_OBJECT_LOCK(obj);
	/* Walk the range one page at a time. */
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		uint64_t bytes = MIN(PAGESIZE - off, len);

again:
		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
				goto again;
			vm_page_busy(m);
			VM_OBJECT_UNLOCK(obj);
			/* Satisfy any accumulated non-mapped bytes first. */
			if (dirbytes > 0) {
				error = dmu_read_uio(os, zp->z_id, uio,
				    dirbytes);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				error = uiomove(va + off, bytes, UIO_READ, uio);
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * The code below is here to make sendfile(2) work
			 * correctly with ZFS.  As pointed out by ups@
			 * sendfile(2) should be changed to use VOP_GETPAGES(),
			 * but it pessimizes performance of sendfile/UFS, that's
			 * why I handle this special case in ZFS code.
			 */
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
				goto again;
			vm_page_busy(m);
			VM_OBJECT_UNLOCK(obj);
			if (dirbytes > 0) {
				error = dmu_read_uio(os, zp->z_id, uio,
				    dirbytes);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				/* Fill the page itself from the DMU. */
				error = dmu_read(os, zp->z_id, start + off,
				    bytes, (void *)(va + off));
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
			/* UIO_NOCOPY: advance resid manually (no uiomove). */
			if (error == 0)
				uio->uio_resid -= bytes;
		} else {
			dirbytes += bytes;
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	VM_OBJECT_UNLOCK(obj);
	/* Satisfy the trailing run of non-mapped bytes, if any. */
	if (error == 0 && dirbytes > 0)
		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	ssize_t		n, nbytes;
	int		error;
	rl_t		*rl;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	os = zfsvfs->z_os;

	if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (EACCES);
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC)
		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_phys->zp_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);

	/*
	 * Read in zfs_read_chunk_size-aligned chunks, preferring mapped
	 * pages (mappedread) when the vnode has cached data.
	 */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		if (vn_has_cached_data(vp))
			error = mappedread(vp, nbytes, uio);
		else
			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = EIO;
			break;
		}

		n -= nbytes;
	}

out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified.
 * Any error will exit this routine as this is only a best
 * attempt to get the pages resident.  This is a copy of ufs_trans_touch().
 */
static void
zfs_prefault_write(ssize_t n, struct uio *uio)
{
	struct iovec *iov;
	ulong_t cnt, incr;
	caddr_t p;

	/* Only user-space buffers can fault; kernel buffers are resident. */
	if (uio->uio_segflg != UIO_USERSPACE)
		return;

	iov = uio->uio_iov;

	while (n) {
		cnt = MIN(iov->iov_len, n);
		if (cnt == 0) {
			/* empty iov entry */
			iov++;
			continue;
		}
		n -= cnt;
		/*
		 * touch each page in this segment.
		 */
		p = iov->iov_base;
		while (cnt) {
			if (fubyte(p) == -1)
				return;
			incr = MIN(cnt, PAGESIZE);
			p += incr;
			cnt -= incr;
		}
		/*
		 * touch the last byte in case it straddles a page.
		 */
		p--;
		if (fubyte(p) == -1)
			return;
		iov++;
	}
}

/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- IO_APPEND flag set if in append mode.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = MAXOFFSET_T;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	uint64_t	pflags;
	int		error;

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If immutable or not appending then return EPERM
	 */
	pflags = zp->z_phys->zp_flags;
	if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_phys->zp_size))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	zilog = zfsvfs->z_log;

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 */
	zfs_prefault_write(n, uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & IO_APPEND) {
		/*
		 * Range lock for a file append:
		 * The value for the start of range will be determined by
		 * zfs_range_lock() (to guarantee append semantics).
		 * If this write will cause the block size to increase,
		 * zfs_range_lock() will lock the entire file, so we must
		 * later reduce the range after we grow the block size.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		if (rl->r_len == UINT64_MAX) {
			/* overlocked, zp_size can't change */
			woff = uio->uio_loffset = zp->z_phys->zp_size;
		} else {
			woff = uio->uio_loffset = rl->r_off;
		}
	} else {
		woff = uio->uio_loffset;
		/*
		 * Validate file offset
		 */
		if (woff < 0) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * If we need to grow the block size then zfs_range_lock()
		 * will lock a wider range than we request here.
		 * Later after growing the block size we reduce the range.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	/* Clamp the write so it does not cross the resource limit. */
	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	end_size = MAX(zp->z_phys->zp_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		/*
		 * Start a transaction.
		 */
		woff = uio->uio_loffset;
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			/* See Big Rule (4): wait and retry, never block. */
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				continue;
			}
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (woff + nbytes > zp->z_phys->zp_size)
			vnode_pager_setsize(vp, woff + nbytes);

		rw_enter(&zp->z_map_lock, RW_READER);

		tx_bytes = uio->uio_resid;
		if (vn_has_cached_data(vp)) {
			rw_exit(&zp->z_map_lock);
			error = mappedwrite(vp, nbytes, uio, tx);
		} else {
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
			    uio, nbytes, tx);
			rw_exit(&zp->z_map_lock);
		}
		tx_bytes -= uio->uio_resid;

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(vp, cr,
		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
		    zp->z_phys->zp_uid == 0) != 0) {
			zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
		}
		mutex_exit(&zp->z_acl_lock);

		/*
		 * Update time stamp.  NOTE: This marks the bonus buffer as
		 * dirty, so we don't have to do it again for zp_size.
		 */
		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
			    uio->uio_loffset);
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (FSYNC | FDSYNC))
		zil_commit(zilog, zp->z_last_itx, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * dmu_sync() completion callback: releases the dbuf, range lock and
 * vnode hold taken by zfs_get_data(), and records the block in the ZIL.
 */
void
zfs_get_done(dmu_buf_t *db, void *vzgd)
{
	zgd_t *zgd = (zgd_t *)vzgd;
	rl_t *rl = zgd->zgd_rl;
	vnode_t *vp = ZTOV(rl->r_zp);
	int vfslocked;

	vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
	dmu_buf_rele(db, vzgd);
	zfs_range_unlock(rl);
	VN_RELE(vp);
	zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
	kmem_free(zgd, sizeof (zgd_t));
	VFS_UNLOCK_GIANT(vfslocked);
}

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t off = lr->lr_offset;
	dmu_buf_t *db;
	rl_t *rl;
	zgd_t *zgd;
	int dlen = lr->lr_length;		/* length of user data */
	int error = 0;

	ASSERT(zio);
	ASSERT(dlen != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
		return (ENOENT);
	if (zp->z_unlinked) {
		VN_RELE(ZTOV(zp));
		return (ENOENT);
	}

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		rl = zfs_range_lock(zp, off, dlen, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
	} else { /* indirect write */
		uint64_t boff; /* block starting offset */

		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data.  We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			if (ISP2(zp->z_blksz)) {
				boff = P2ALIGN_TYPED(off, zp->z_blksz,
				    uint64_t);
			} else {
				boff = 0;
			}
			dlen = zp->z_blksz;
			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
			if (zp->z_blksz == dlen)
				break;
			/* Blocksize changed under us; retry with new size. */
			zfs_range_unlock(rl);
		}
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
		zgd->zgd_rl = rl;
		zgd->zgd_zilog = zfsvfs->z_log;
		zgd->zgd_bp = &lr->lr_blkptr;
		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
		ASSERT(boff == db->db_offset);
		lr->lr_blkoff = off - boff;
		error = dmu_sync(zio, db, &lr->lr_blkptr,
		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
		ASSERT((error && error != EINPROGRESS) ||
		    lr->lr_length <= zp->z_blksz);
		if (error == 0)
			zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
		/*
		 * If we get EINPROGRESS, then we need to wait for a
		 * write IO initiated by dmu_sync() to complete before
		 * we can release this dbuf.  We will finish everything
		 * up in the zfs_get_done() callback.
		 */
		if (error == EINPROGRESS)
			return (0);
		dmu_buf_rele(db, zgd);
		kmem_free(zgd, sizeof (zgd_t));
	}
out:
	zfs_range_unlock(rl);
	VN_RELE(ZTOV(zp));
	return (error);
}

/*
 * Check access permissions for a vnode, dispatching to the ACE-mask
 * or rwx flavor of the access check as requested by "flag".
 */
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
1061 * If it exists, return a held vnode reference for it. 1062 * 1063 * IN: dvp - vnode of directory to search. 1064 * nm - name of entry to lookup. 1065 * pnp - full pathname to lookup [UNUSED]. 1066 * flags - LOOKUP_XATTR set if looking for an attribute. 1067 * rdir - root directory vnode [UNUSED]. 1068 * cr - credentials of caller. 1069 * ct - caller context 1070 * direntflags - directory lookup flags 1071 * realpnp - returned pathname. 1072 * 1073 * OUT: vpp - vnode of located entry, NULL if not found. 1074 * 1075 * RETURN: 0 if success 1076 * error code if failure 1077 * 1078 * Timestamps: 1079 * NA 1080 */ 1081/* ARGSUSED */ 1082static int 1083zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, 1084 int nameiop, cred_t *cr, kthread_t *td, int flags) 1085{ 1086 znode_t *zdp = VTOZ(dvp); 1087 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1088 int error; 1089 int *direntflags = NULL; 1090 void *realpnp = NULL; 1091 1092 ZFS_ENTER(zfsvfs); 1093 ZFS_VERIFY_ZP(zdp); 1094 1095 *vpp = NULL; 1096 1097 if (flags & LOOKUP_XATTR) { 1098#ifdef TODO 1099 /* 1100 * If the xattr property is off, refuse the lookup request. 1101 */ 1102 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1103 ZFS_EXIT(zfsvfs); 1104 return (EINVAL); 1105 } 1106#endif 1107 1108 /* 1109 * We don't allow recursive attributes.. 1110 * Maybe someday we will. 1111 */ 1112 if (zdp->z_phys->zp_flags & ZFS_XATTR) { 1113 ZFS_EXIT(zfsvfs); 1114 return (EINVAL); 1115 } 1116 1117 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1118 ZFS_EXIT(zfsvfs); 1119 return (error); 1120 } 1121 1122 /* 1123 * Do we have permission to get into attribute directory? 1124 */ 1125 1126 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1127 B_FALSE, cr)) { 1128 VN_RELE(*vpp); 1129 *vpp = NULL; 1130 } 1131 1132 ZFS_EXIT(zfsvfs); 1133 return (error); 1134 } 1135 1136 if (dvp->v_type != VDIR) { 1137 ZFS_EXIT(zfsvfs); 1138 return (ENOTDIR); 1139 } 1140 1141 /* 1142 * Check accessibility of directory. 
1143 */ 1144 1145 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1146 ZFS_EXIT(zfsvfs); 1147 return (error); 1148 } 1149 1150 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1151 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1152 ZFS_EXIT(zfsvfs); 1153 return (EILSEQ); 1154 } 1155 1156 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); 1157 if (error == 0) { 1158 /* 1159 * Convert device special files 1160 */ 1161 if (IS_DEVVP(*vpp)) { 1162 vnode_t *svp; 1163 1164 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1165 VN_RELE(*vpp); 1166 if (svp == NULL) 1167 error = ENOSYS; 1168 else 1169 *vpp = svp; 1170 } 1171 } 1172 1173 ZFS_EXIT(zfsvfs); 1174 1175 /* Translate errors and add SAVENAME when needed. */ 1176 if (cnp->cn_flags & ISLASTCN) { 1177 switch (nameiop) { 1178 case CREATE: 1179 case RENAME: 1180 if (error == ENOENT) { 1181 error = EJUSTRETURN; 1182 cnp->cn_flags |= SAVENAME; 1183 break; 1184 } 1185 /* FALLTHROUGH */ 1186 case DELETE: 1187 if (error == 0) 1188 cnp->cn_flags |= SAVENAME; 1189 break; 1190 } 1191 } 1192 if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) { 1193 int ltype = 0; 1194 1195 if (cnp->cn_flags & ISDOTDOT) { 1196 ltype = VOP_ISLOCKED(dvp); 1197 VOP_UNLOCK(dvp, 0); 1198 } 1199 error = vn_lock(*vpp, cnp->cn_lkflags); 1200 if (cnp->cn_flags & ISDOTDOT) 1201 vn_lock(dvp, ltype | LK_RETRY); 1202 if (error != 0) { 1203 VN_RELE(*vpp); 1204 *vpp = NULL; 1205 return (error); 1206 } 1207 } 1208 1209#ifdef FREEBSD_NAMECACHE 1210 /* 1211 * Insert name into cache (as non-existent) if appropriate. 1212 */ 1213 if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) 1214 cache_enter(dvp, *vpp, cnp); 1215 /* 1216 * Insert name into cache if appropriate. 
1217 */ 1218 if (error == 0 && (cnp->cn_flags & MAKEENTRY)) { 1219 if (!(cnp->cn_flags & ISLASTCN) || 1220 (nameiop != DELETE && nameiop != RENAME)) { 1221 cache_enter(dvp, *vpp, cnp); 1222 } 1223 } 1224#endif 1225 1226 return (error); 1227} 1228 1229/* 1230 * Attempt to create a new entry in a directory. If the entry 1231 * already exists, truncate the file if permissible, else return 1232 * an error. Return the vp of the created or trunc'd file. 1233 * 1234 * IN: dvp - vnode of directory to put new file entry in. 1235 * name - name of new file entry. 1236 * vap - attributes of new file. 1237 * excl - flag indicating exclusive or non-exclusive mode. 1238 * mode - mode to open file with. 1239 * cr - credentials of caller. 1240 * flag - large file flag [UNUSED]. 1241 * ct - caller context 1242 * vsecp - ACL to be set 1243 * 1244 * OUT: vpp - vnode of created or trunc'd entry. 1245 * 1246 * RETURN: 0 if success 1247 * error code if failure 1248 * 1249 * Timestamps: 1250 * dvp - ctime|mtime updated if new entry created 1251 * vp - ctime|mtime always, atime if new 1252 */ 1253 1254/* ARGSUSED */ 1255static int 1256zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, 1257 vnode_t **vpp, cred_t *cr, kthread_t *td) 1258{ 1259 znode_t *zp, *dzp = VTOZ(dvp); 1260 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1261 zilog_t *zilog; 1262 objset_t *os; 1263 zfs_dirlock_t *dl; 1264 dmu_tx_t *tx; 1265 int error; 1266 zfs_acl_t *aclp = NULL; 1267 zfs_fuid_info_t *fuidp = NULL; 1268 void *vsecp = NULL; 1269 int flag = 0; 1270 1271 /* 1272 * If we have an ephemeral id, ACL, or XVATTR then 1273 * make sure file system is at proper version 1274 */ 1275 1276 if (zfsvfs->z_use_fuids == B_FALSE && 1277 (vsecp || (vap->va_mask & AT_XVATTR) || 1278 IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr)))) 1279 return (EINVAL); 1280 1281 ZFS_ENTER(zfsvfs); 1282 ZFS_VERIFY_ZP(dzp); 1283 os = zfsvfs->z_os; 1284 zilog = zfsvfs->z_log; 1285 1286 if (zfsvfs->z_utf8 && u8_validate(name, 
strlen(name), 1287 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1288 ZFS_EXIT(zfsvfs); 1289 return (EILSEQ); 1290 } 1291 1292 if (vap->va_mask & AT_XVATTR) { 1293 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1294 crgetuid(cr), cr, vap->va_type)) != 0) { 1295 ZFS_EXIT(zfsvfs); 1296 return (error); 1297 } 1298 } 1299top: 1300 *vpp = NULL; 1301 1302 if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) 1303 vap->va_mode &= ~S_ISVTX; 1304 1305 if (*name == '\0') { 1306 /* 1307 * Null component name refers to the directory itself. 1308 */ 1309 VN_HOLD(dvp); 1310 zp = dzp; 1311 dl = NULL; 1312 error = 0; 1313 } else { 1314 /* possible VN_HOLD(zp) */ 1315 int zflg = 0; 1316 1317 if (flag & FIGNORECASE) 1318 zflg |= ZCILOOK; 1319 1320 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1321 NULL, NULL); 1322 if (error) { 1323 if (strcmp(name, "..") == 0) 1324 error = EISDIR; 1325 ZFS_EXIT(zfsvfs); 1326 if (aclp) 1327 zfs_acl_free(aclp); 1328 return (error); 1329 } 1330 } 1331 if (vsecp && aclp == NULL) { 1332 error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); 1333 if (error) { 1334 ZFS_EXIT(zfsvfs); 1335 if (dl) 1336 zfs_dirent_unlock(dl); 1337 return (error); 1338 } 1339 } 1340 1341 if (zp == NULL) { 1342 uint64_t txtype; 1343 1344 /* 1345 * Create a new file object and update the directory 1346 * to reference it. 1347 */ 1348 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 1349 goto out; 1350 } 1351 1352 /* 1353 * We only support the creation of regular files in 1354 * extended attribute directories. 
1355 */ 1356 if ((dzp->z_phys->zp_flags & ZFS_XATTR) && 1357 (vap->va_type != VREG)) { 1358 error = EINVAL; 1359 goto out; 1360 } 1361 1362 tx = dmu_tx_create(os); 1363 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1364 if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || 1365 IS_EPHEMERAL(crgetgid(cr))) { 1366 if (zfsvfs->z_fuid_obj == 0) { 1367 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1368 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1369 FUID_SIZE_ESTIMATE(zfsvfs)); 1370 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 1371 FALSE, NULL); 1372 } else { 1373 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 1374 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 1375 FUID_SIZE_ESTIMATE(zfsvfs)); 1376 } 1377 } 1378 dmu_tx_hold_bonus(tx, dzp->z_id); 1379 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 1380 if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { 1381 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1382 0, SPA_MAXBLOCKSIZE); 1383 } 1384 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1385 if (error) { 1386 zfs_dirent_unlock(dl); 1387 if (error == ERESTART && 1388 zfsvfs->z_assign == TXG_NOWAIT) { 1389 dmu_tx_wait(tx); 1390 dmu_tx_abort(tx); 1391 goto top; 1392 } 1393 dmu_tx_abort(tx); 1394 ZFS_EXIT(zfsvfs); 1395 if (aclp) 1396 zfs_acl_free(aclp); 1397 return (error); 1398 } 1399 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); 1400 (void) zfs_link_create(dl, zp, tx, ZNEW); 1401 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 1402 if (flag & FIGNORECASE) 1403 txtype |= TX_CI; 1404 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 1405 vsecp, fuidp, vap); 1406 if (fuidp) 1407 zfs_fuid_info_free(fuidp); 1408 dmu_tx_commit(tx); 1409 } else { 1410 int aflags = (flag & FAPPEND) ? V_APPEND : 0; 1411 1412 /* 1413 * A directory entry already exists for this name. 1414 */ 1415 /* 1416 * Can't truncate an existing file if in exclusive mode. 1417 */ 1418 if (excl == EXCL) { 1419 error = EEXIST; 1420 goto out; 1421 } 1422 /* 1423 * Can't open a directory for writing. 
1424 */ 1425 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { 1426 error = EISDIR; 1427 goto out; 1428 } 1429 /* 1430 * Verify requested access to file. 1431 */ 1432 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { 1433 goto out; 1434 } 1435 1436 mutex_enter(&dzp->z_lock); 1437 dzp->z_seq++; 1438 mutex_exit(&dzp->z_lock); 1439 1440 /* 1441 * Truncate regular files if requested. 1442 */ 1443 if ((ZTOV(zp)->v_type == VREG) && 1444 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { 1445 /* we can't hold any locks when calling zfs_freesp() */ 1446 zfs_dirent_unlock(dl); 1447 dl = NULL; 1448 error = zfs_freesp(zp, 0, 0, mode, TRUE); 1449 if (error == 0) { 1450 vnevent_create(ZTOV(zp), ct); 1451 } 1452 } 1453 } 1454out: 1455 if (dl) 1456 zfs_dirent_unlock(dl); 1457 1458 if (error) { 1459 if (zp) 1460 VN_RELE(ZTOV(zp)); 1461 } else { 1462 *vpp = ZTOV(zp); 1463 /* 1464 * If vnode is for a device return a specfs vnode instead. 1465 */ 1466 if (IS_DEVVP(*vpp)) { 1467 struct vnode *svp; 1468 1469 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1470 VN_RELE(*vpp); 1471 if (svp == NULL) { 1472 error = ENOSYS; 1473 } 1474 *vpp = svp; 1475 } 1476 } 1477 if (aclp) 1478 zfs_acl_free(aclp); 1479 1480 ZFS_EXIT(zfsvfs); 1481 return (error); 1482} 1483 1484/* 1485 * Remove an entry from a directory. 1486 * 1487 * IN: dvp - vnode of directory to remove entry from. 1488 * name - name of entry to remove. 1489 * cr - credentials of caller. 
1490 * ct - caller context 1491 * flags - case flags 1492 * 1493 * RETURN: 0 if success 1494 * error code if failure 1495 * 1496 * Timestamps: 1497 * dvp - ctime|mtime 1498 * vp - ctime (if nlink > 0) 1499 */ 1500/*ARGSUSED*/ 1501static int 1502zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, 1503 int flags) 1504{ 1505 znode_t *zp, *dzp = VTOZ(dvp); 1506 znode_t *xzp = NULL; 1507 vnode_t *vp; 1508 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1509 zilog_t *zilog; 1510 uint64_t acl_obj, xattr_obj; 1511 zfs_dirlock_t *dl; 1512 dmu_tx_t *tx; 1513 boolean_t may_delete_now, delete_now = FALSE; 1514 boolean_t unlinked, toobig = FALSE; 1515 uint64_t txtype; 1516 pathname_t *realnmp = NULL; 1517 pathname_t realnm; 1518 int error; 1519 int zflg = ZEXISTS; 1520 1521 ZFS_ENTER(zfsvfs); 1522 ZFS_VERIFY_ZP(dzp); 1523 zilog = zfsvfs->z_log; 1524 1525 if (flags & FIGNORECASE) { 1526 zflg |= ZCILOOK; 1527 pn_alloc(&realnm); 1528 realnmp = &realnm; 1529 } 1530 1531top: 1532 /* 1533 * Attempt to lock directory; fail if entry doesn't exist. 1534 */ 1535 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1536 NULL, realnmp)) { 1537 if (realnmp) 1538 pn_free(realnmp); 1539 ZFS_EXIT(zfsvfs); 1540 return (error); 1541 } 1542 1543 vp = ZTOV(zp); 1544 1545 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1546 goto out; 1547 } 1548 1549 /* 1550 * Need to use rmdir for removing directories. 1551 */ 1552 if (vp->v_type == VDIR) { 1553 error = EPERM; 1554 goto out; 1555 } 1556 1557 vnevent_remove(vp, dvp, name, ct); 1558 1559 if (realnmp) 1560 dnlc_remove(dvp, realnmp->pn_buf); 1561 else 1562 dnlc_remove(dvp, name); 1563 1564 may_delete_now = FALSE; 1565 1566 /* 1567 * We may delete the znode now, or we may put it in the unlinked set; 1568 * it depends on whether we're the last link, and on whether there are 1569 * other holds on the vnode. So we dmu_tx_hold() the right things to 1570 * allow for either case. 
1571 */ 1572 tx = dmu_tx_create(zfsvfs->z_os); 1573 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1574 dmu_tx_hold_bonus(tx, zp->z_id); 1575 if (may_delete_now) { 1576 toobig = 1577 zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; 1578 /* if the file is too big, only hold_free a token amount */ 1579 dmu_tx_hold_free(tx, zp->z_id, 0, 1580 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); 1581 } 1582 1583 /* are there any extended attributes? */ 1584 if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { 1585 /* XXX - do we need this if we are deleting? */ 1586 dmu_tx_hold_bonus(tx, xattr_obj); 1587 } 1588 1589 /* are there any additional acls */ 1590 if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && 1591 may_delete_now) 1592 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 1593 1594 /* charge as an update -- would be nice not to charge at all */ 1595 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1596 1597 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1598 if (error) { 1599 zfs_dirent_unlock(dl); 1600 VN_RELE(vp); 1601 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1602 dmu_tx_wait(tx); 1603 dmu_tx_abort(tx); 1604 goto top; 1605 } 1606 if (realnmp) 1607 pn_free(realnmp); 1608 dmu_tx_abort(tx); 1609 ZFS_EXIT(zfsvfs); 1610 return (error); 1611 } 1612 1613 /* 1614 * Remove the directory entry. 
1615 */ 1616 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); 1617 1618 if (error) { 1619 dmu_tx_commit(tx); 1620 goto out; 1621 } 1622 1623 if (0 && unlinked) { 1624 VI_LOCK(vp); 1625 delete_now = may_delete_now && !toobig && 1626 vp->v_count == 1 && !vn_has_cached_data(vp) && 1627 zp->z_phys->zp_xattr == xattr_obj && 1628 zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; 1629 VI_UNLOCK(vp); 1630 } 1631 1632 if (delete_now) { 1633 if (zp->z_phys->zp_xattr) { 1634 error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); 1635 ASSERT3U(error, ==, 0); 1636 ASSERT3U(xzp->z_phys->zp_links, ==, 2); 1637 dmu_buf_will_dirty(xzp->z_dbuf, tx); 1638 mutex_enter(&xzp->z_lock); 1639 xzp->z_unlinked = 1; 1640 xzp->z_phys->zp_links = 0; 1641 mutex_exit(&xzp->z_lock); 1642 zfs_unlinked_add(xzp, tx); 1643 zp->z_phys->zp_xattr = 0; /* probably unnecessary */ 1644 } 1645 mutex_enter(&zp->z_lock); 1646 VI_LOCK(vp); 1647 vp->v_count--; 1648 ASSERT3U(vp->v_count, ==, 0); 1649 VI_UNLOCK(vp); 1650 mutex_exit(&zp->z_lock); 1651 zfs_znode_delete(zp, tx); 1652 } else if (unlinked) { 1653 zfs_unlinked_add(zp, tx); 1654 } 1655 1656 txtype = TX_REMOVE; 1657 if (flags & FIGNORECASE) 1658 txtype |= TX_CI; 1659 zfs_log_remove(zilog, tx, txtype, dzp, name); 1660 1661 dmu_tx_commit(tx); 1662out: 1663 if (realnmp) 1664 pn_free(realnmp); 1665 1666 zfs_dirent_unlock(dl); 1667 1668 if (!delete_now) { 1669 VN_RELE(vp); 1670 } else if (xzp) { 1671 /* this rele is delayed to prevent nesting transactions */ 1672 VN_RELE(ZTOV(xzp)); 1673 } 1674 1675 ZFS_EXIT(zfsvfs); 1676 return (error); 1677} 1678 1679/* 1680 * Create a new directory and insert it into dvp using the name 1681 * provided. Return a pointer to the inserted directory. 1682 * 1683 * IN: dvp - vnode of directory to add subdir to. 1684 * dirname - name of new directory. 1685 * vap - attributes of new directory. 1686 * cr - credentials of caller. 
1687 * ct - caller context 1688 * vsecp - ACL to be set 1689 * 1690 * OUT: vpp - vnode of created directory. 1691 * 1692 * RETURN: 0 if success 1693 * error code if failure 1694 * 1695 * Timestamps: 1696 * dvp - ctime|mtime updated 1697 * vp - ctime|mtime|atime updated 1698 */ 1699/*ARGSUSED*/ 1700static int 1701zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, 1702 caller_context_t *ct, int flags, vsecattr_t *vsecp) 1703{ 1704 znode_t *zp, *dzp = VTOZ(dvp); 1705 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1706 zilog_t *zilog; 1707 zfs_dirlock_t *dl; 1708 uint64_t txtype; 1709 dmu_tx_t *tx; 1710 int error; 1711 zfs_acl_t *aclp = NULL; 1712 zfs_fuid_info_t *fuidp = NULL; 1713 int zf = ZNEW; 1714 1715 ASSERT(vap->va_type == VDIR); 1716 1717 /* 1718 * If we have an ephemeral id, ACL, or XVATTR then 1719 * make sure file system is at proper version 1720 */ 1721 1722 if (zfsvfs->z_use_fuids == B_FALSE && 1723 (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))|| 1724 IS_EPHEMERAL(crgetgid(cr)))) 1725 return (EINVAL); 1726 1727 ZFS_ENTER(zfsvfs); 1728 ZFS_VERIFY_ZP(dzp); 1729 zilog = zfsvfs->z_log; 1730 1731 if (dzp->z_phys->zp_flags & ZFS_XATTR) { 1732 ZFS_EXIT(zfsvfs); 1733 return (EINVAL); 1734 } 1735 1736 if (zfsvfs->z_utf8 && u8_validate(dirname, 1737 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1738 ZFS_EXIT(zfsvfs); 1739 return (EILSEQ); 1740 } 1741 if (flags & FIGNORECASE) 1742 zf |= ZCILOOK; 1743 1744 if (vap->va_mask & AT_XVATTR) 1745 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1746 crgetuid(cr), cr, vap->va_type)) != 0) { 1747 ZFS_EXIT(zfsvfs); 1748 return (error); 1749 } 1750 1751 /* 1752 * First make sure the new directory doesn't exist. 
1753 */ 1754top: 1755 *vpp = NULL; 1756 1757 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 1758 NULL, NULL)) { 1759 ZFS_EXIT(zfsvfs); 1760 return (error); 1761 } 1762 1763 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { 1764 zfs_dirent_unlock(dl); 1765 ZFS_EXIT(zfsvfs); 1766 return (error); 1767 } 1768 1769 if (vsecp && aclp == NULL) { 1770 error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); 1771 if (error) { 1772 zfs_dirent_unlock(dl); 1773 ZFS_EXIT(zfsvfs); 1774 return (error); 1775 } 1776 } 1777 /* 1778 * Add a new entry to the directory. 1779 */ 1780 tx = dmu_tx_create(zfsvfs->z_os); 1781 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1782 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1783 if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || 1784 IS_EPHEMERAL(crgetgid(cr))) { 1785 if (zfsvfs->z_fuid_obj == 0) { 1786 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1787 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1788 FUID_SIZE_ESTIMATE(zfsvfs)); 1789 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); 1790 } else { 1791 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 1792 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 1793 FUID_SIZE_ESTIMATE(zfsvfs)); 1794 } 1795 } 1796 if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) 1797 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1798 0, SPA_MAXBLOCKSIZE); 1799 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1800 if (error) { 1801 zfs_dirent_unlock(dl); 1802 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1803 dmu_tx_wait(tx); 1804 dmu_tx_abort(tx); 1805 goto top; 1806 } 1807 dmu_tx_abort(tx); 1808 ZFS_EXIT(zfsvfs); 1809 if (aclp) 1810 zfs_acl_free(aclp); 1811 return (error); 1812 } 1813 1814 /* 1815 * Create new node. 1816 */ 1817 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); 1818 1819 if (aclp) 1820 zfs_acl_free(aclp); 1821 1822 /* 1823 * Now put new name in parent dir. 
1824 */ 1825 (void) zfs_link_create(dl, zp, tx, ZNEW); 1826 1827 *vpp = ZTOV(zp); 1828 1829 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 1830 if (flags & FIGNORECASE) 1831 txtype |= TX_CI; 1832 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); 1833 1834 if (fuidp) 1835 zfs_fuid_info_free(fuidp); 1836 dmu_tx_commit(tx); 1837 1838 zfs_dirent_unlock(dl); 1839 1840 ZFS_EXIT(zfsvfs); 1841 return (0); 1842} 1843 1844/* 1845 * Remove a directory subdir entry. If the current working 1846 * directory is the same as the subdir to be removed, the 1847 * remove will fail. 1848 * 1849 * IN: dvp - vnode of directory to remove from. 1850 * name - name of directory to be removed. 1851 * cwd - vnode of current working directory. 1852 * cr - credentials of caller. 1853 * ct - caller context 1854 * flags - case flags 1855 * 1856 * RETURN: 0 if success 1857 * error code if failure 1858 * 1859 * Timestamps: 1860 * dvp - ctime|mtime updated 1861 */ 1862/*ARGSUSED*/ 1863static int 1864zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, 1865 caller_context_t *ct, int flags) 1866{ 1867 znode_t *dzp = VTOZ(dvp); 1868 znode_t *zp; 1869 vnode_t *vp; 1870 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1871 zilog_t *zilog; 1872 zfs_dirlock_t *dl; 1873 dmu_tx_t *tx; 1874 int error; 1875 int zflg = ZEXISTS; 1876 1877 ZFS_ENTER(zfsvfs); 1878 ZFS_VERIFY_ZP(dzp); 1879 zilog = zfsvfs->z_log; 1880 1881 if (flags & FIGNORECASE) 1882 zflg |= ZCILOOK; 1883top: 1884 zp = NULL; 1885 1886 /* 1887 * Attempt to lock directory; fail if entry doesn't exist. 
1888 */ 1889 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1890 NULL, NULL)) { 1891 ZFS_EXIT(zfsvfs); 1892 return (error); 1893 } 1894 1895 vp = ZTOV(zp); 1896 1897 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1898 goto out; 1899 } 1900 1901 if (vp->v_type != VDIR) { 1902 error = ENOTDIR; 1903 goto out; 1904 } 1905 1906 if (vp == cwd) { 1907 error = EINVAL; 1908 goto out; 1909 } 1910 1911 vnevent_rmdir(vp, dvp, name, ct); 1912 1913 /* 1914 * Grab a lock on the directory to make sure that noone is 1915 * trying to add (or lookup) entries while we are removing it. 1916 */ 1917 rw_enter(&zp->z_name_lock, RW_WRITER); 1918 1919 /* 1920 * Grab a lock on the parent pointer to make sure we play well 1921 * with the treewalk and directory rename code. 1922 */ 1923 rw_enter(&zp->z_parent_lock, RW_WRITER); 1924 1925 tx = dmu_tx_create(zfsvfs->z_os); 1926 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1927 dmu_tx_hold_bonus(tx, zp->z_id); 1928 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1929 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1930 if (error) { 1931 rw_exit(&zp->z_parent_lock); 1932 rw_exit(&zp->z_name_lock); 1933 zfs_dirent_unlock(dl); 1934 VN_RELE(vp); 1935 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1936 dmu_tx_wait(tx); 1937 dmu_tx_abort(tx); 1938 goto top; 1939 } 1940 dmu_tx_abort(tx); 1941 ZFS_EXIT(zfsvfs); 1942 return (error); 1943 } 1944 1945#ifdef FREEBSD_NAMECACHE 1946 cache_purge(dvp); 1947#endif 1948 1949 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 1950 1951 if (error == 0) { 1952 uint64_t txtype = TX_RMDIR; 1953 if (flags & FIGNORECASE) 1954 txtype |= TX_CI; 1955 zfs_log_remove(zilog, tx, txtype, dzp, name); 1956 } 1957 1958 dmu_tx_commit(tx); 1959 1960 rw_exit(&zp->z_parent_lock); 1961 rw_exit(&zp->z_name_lock); 1962#ifdef FREEBSD_NAMECACHE 1963 cache_purge(vp); 1964#endif 1965out: 1966 zfs_dirent_unlock(dl); 1967 1968 VN_RELE(vp); 1969 1970 ZFS_EXIT(zfsvfs); 1971 return (error); 1972} 1973 1974/* 1975 * Read as 
many directory entries as will fit into the provided 1976 * buffer from the given directory cursor position (specified in 1977 * the uio structure. 1978 * 1979 * IN: vp - vnode of directory to read. 1980 * uio - structure supplying read location, range info, 1981 * and return buffer. 1982 * cr - credentials of caller. 1983 * ct - caller context 1984 * flags - case flags 1985 * 1986 * OUT: uio - updated offset and range, buffer filled. 1987 * eofp - set to true if end-of-file detected. 1988 * 1989 * RETURN: 0 if success 1990 * error code if failure 1991 * 1992 * Timestamps: 1993 * vp - atime updated 1994 * 1995 * Note that the low 4 bits of the cookie returned by zap is always zero. 1996 * This allows us to use the low range for "special" directory entries: 1997 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 1998 * we use the offset 2 for the '.zfs' directory. 1999 */ 2000/* ARGSUSED */ 2001static int 2002zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) 2003{ 2004 znode_t *zp = VTOZ(vp); 2005 iovec_t *iovp; 2006 edirent_t *eodp; 2007 dirent64_t *odp; 2008 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2009 objset_t *os; 2010 caddr_t outbuf; 2011 size_t bufsize; 2012 zap_cursor_t zc; 2013 zap_attribute_t zap; 2014 uint_t bytes_wanted; 2015 uint64_t offset; /* must be unsigned; checks for < 1 */ 2016 int local_eof; 2017 int outcount; 2018 int error; 2019 uint8_t prefetch; 2020 boolean_t check_sysattrs; 2021 uint8_t type; 2022 int ncooks; 2023 u_long *cooks = NULL; 2024 int flags = 0; 2025 2026 ZFS_ENTER(zfsvfs); 2027 ZFS_VERIFY_ZP(zp); 2028 2029 /* 2030 * If we are not given an eof variable, 2031 * use a local one. 2032 */ 2033 if (eofp == NULL) 2034 eofp = &local_eof; 2035 2036 /* 2037 * Check for valid iov_len. 
2038 */ 2039 if (uio->uio_iov->iov_len <= 0) { 2040 ZFS_EXIT(zfsvfs); 2041 return (EINVAL); 2042 } 2043 2044 /* 2045 * Quit if directory has been removed (posix) 2046 */ 2047 if ((*eofp = zp->z_unlinked) != 0) { 2048 ZFS_EXIT(zfsvfs); 2049 return (0); 2050 } 2051 2052 error = 0; 2053 os = zfsvfs->z_os; 2054 offset = uio->uio_loffset; 2055 prefetch = zp->z_zn_prefetch; 2056 2057 /* 2058 * Initialize the iterator cursor. 2059 */ 2060 if (offset <= 3) { 2061 /* 2062 * Start iteration from the beginning of the directory. 2063 */ 2064 zap_cursor_init(&zc, os, zp->z_id); 2065 } else { 2066 /* 2067 * The offset is a serialized cursor. 2068 */ 2069 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 2070 } 2071 2072 /* 2073 * Get space to change directory entries into fs independent format. 2074 */ 2075 iovp = uio->uio_iov; 2076 bytes_wanted = iovp->iov_len; 2077 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 2078 bufsize = bytes_wanted; 2079 outbuf = kmem_alloc(bufsize, KM_SLEEP); 2080 odp = (struct dirent64 *)outbuf; 2081 } else { 2082 bufsize = bytes_wanted; 2083 odp = (struct dirent64 *)iovp->iov_base; 2084 } 2085 eodp = (struct edirent *)odp; 2086 2087 if (ncookies != NULL) { 2088 /* 2089 * Minimum entry size is dirent size and 1 byte for a file name. 2090 */ 2091 ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); 2092 cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); 2093 *cookies = cooks; 2094 *ncookies = ncooks; 2095 } 2096 /* 2097 * If this VFS supports the system attribute view interface; and 2098 * we're looking at an extended attribute directory; and we care 2099 * about normalization conflicts on this vfs; then we must check 2100 * for normalization conflicts with the sysattr name space. 
2101 */ 2102#ifdef TODO 2103 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 2104 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 2105 (flags & V_RDDIR_ENTFLAGS); 2106#else 2107 check_sysattrs = 0; 2108#endif 2109 2110 /* 2111 * Transform to file-system independent format 2112 */ 2113 outcount = 0; 2114 while (outcount < bytes_wanted) { 2115 ino64_t objnum; 2116 ushort_t reclen; 2117 off64_t *next; 2118 2119 /* 2120 * Special case `.', `..', and `.zfs'. 2121 */ 2122 if (offset == 0) { 2123 (void) strcpy(zap.za_name, "."); 2124 zap.za_normalization_conflict = 0; 2125 objnum = zp->z_id; 2126 type = DT_DIR; 2127 } else if (offset == 1) { 2128 (void) strcpy(zap.za_name, ".."); 2129 zap.za_normalization_conflict = 0; 2130 objnum = zp->z_phys->zp_parent; 2131 type = DT_DIR; 2132 } else if (offset == 2 && zfs_show_ctldir(zp)) { 2133 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 2134 zap.za_normalization_conflict = 0; 2135 objnum = ZFSCTL_INO_ROOT; 2136 type = DT_DIR; 2137 } else { 2138 /* 2139 * Grab next entry. 
2140 */ 2141 if (error = zap_cursor_retrieve(&zc, &zap)) { 2142 if ((*eofp = (error == ENOENT)) != 0) 2143 break; 2144 else 2145 goto update; 2146 } 2147 2148 if (zap.za_integer_length != 8 || 2149 zap.za_num_integers != 1) { 2150 cmn_err(CE_WARN, "zap_readdir: bad directory " 2151 "entry, obj = %lld, offset = %lld\n", 2152 (u_longlong_t)zp->z_id, 2153 (u_longlong_t)offset); 2154 error = ENXIO; 2155 goto update; 2156 } 2157 2158 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 2159 /* 2160 * MacOS X can extract the object type here such as: 2161 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2162 */ 2163 type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2164 2165 if (check_sysattrs && !zap.za_normalization_conflict) { 2166#ifdef TODO 2167 zap.za_normalization_conflict = 2168 xattr_sysattr_casechk(zap.za_name); 2169#else 2170 panic("%s:%u: TODO", __func__, __LINE__); 2171#endif 2172 } 2173 } 2174 2175 if (flags & V_RDDIR_ENTFLAGS) 2176 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 2177 else 2178 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 2179 2180 /* 2181 * Will this entry fit in the buffer? 2182 */ 2183 if (outcount + reclen > bufsize) { 2184 /* 2185 * Did we manage to fit anything in the buffer? 2186 */ 2187 if (!outcount) { 2188 error = EINVAL; 2189 goto update; 2190 } 2191 break; 2192 } 2193 if (flags & V_RDDIR_ENTFLAGS) { 2194 /* 2195 * Add extended flag entry: 2196 */ 2197 eodp->ed_ino = objnum; 2198 eodp->ed_reclen = reclen; 2199 /* NOTE: ed_off is the offset for the *next* entry */ 2200 next = &(eodp->ed_off); 2201 eodp->ed_eflags = zap.za_normalization_conflict ? 
2202 ED_CASE_CONFLICT : 0; 2203 (void) strncpy(eodp->ed_name, zap.za_name, 2204 EDIRENT_NAMELEN(reclen)); 2205 eodp = (edirent_t *)((intptr_t)eodp + reclen); 2206 } else { 2207 /* 2208 * Add normal entry: 2209 */ 2210 odp->d_ino = objnum; 2211 odp->d_reclen = reclen; 2212 odp->d_namlen = strlen(zap.za_name); 2213 (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); 2214 odp->d_type = type; 2215 odp = (dirent64_t *)((intptr_t)odp + reclen); 2216 } 2217 outcount += reclen; 2218 2219 ASSERT(outcount <= bufsize); 2220 2221 /* Prefetch znode */ 2222 if (prefetch) 2223 dmu_prefetch(os, objnum, 0, 0); 2224 2225 /* 2226 * Move to the next entry, fill in the previous offset. 2227 */ 2228 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2229 zap_cursor_advance(&zc); 2230 offset = zap_cursor_serialize(&zc); 2231 } else { 2232 offset += 1; 2233 } 2234 2235 if (cooks != NULL) { 2236 *cooks++ = offset; 2237 ncooks--; 2238 KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); 2239 } 2240 } 2241 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2242 2243 /* Subtract unused cookies */ 2244 if (ncookies != NULL) 2245 *ncookies -= ncooks; 2246 2247 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 2248 iovp->iov_base += outcount; 2249 iovp->iov_len -= outcount; 2250 uio->uio_resid -= outcount; 2251 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 2252 /* 2253 * Reset the pointer. 
	 */
		offset = uio->uio_loffset;
	}

	/* Common exit path for zfs_readdir() (loop body starts above this chunk). */
update:
	zap_cursor_fini(&zc);
	/* outbuf was only allocated for the non-SYSSPACE / multi-iov case. */
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	/* ENOENT from the ZAP cursor just means "end of directory". */
	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	/* On failure, hand no cookies back to the VFS layer. */
	if (error != 0 && cookies != NULL) {
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}

/* Per-thread hint passed to the ZIL writer via zfs_fsyncer_key (see tsd_set below). */
ulong_t zfs_fsync_sync_cnt = 4;

/*
 * Flush all dirty data for the given file to stable storage by
 * committing its outstanding intent-log records.
 *
 * IN:	vp	- vnode of file to sync.
 *	syncflag - sync flags (unused here).
 *	cr	- credentials of caller (unused here).
 *	ct	- caller context (unused here).
 *
 * RETURN:	0 (always succeeds; zil_commit blocks until done)
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Stash the fsync frequency hint in thread-specific data;
	 * presumably consumed by the ZIL to tune commit behavior for
	 * threads that fsync repeatedly — TODO confirm consumer.
	 */
	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (0);
}


/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 * IN:	vp	- vnode of file.
 *	vap	- va_mask identifies requested attributes.
 *		  If AT_XVATTR set, then optional attrs are requested
 *	flags	- ATTR_NOACLCHECK (CIFS server context)
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vap	- attribute values.
 *
 * RETURN:	0 (always succeeds)
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_phys_t *pzp;
	int	error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t links;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	pzp = zp->z_phys;

	/* z_lock serializes access to the in-core znode attributes. */
	mutex_enter(&zp->z_lock);

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
	    (pzp->zp_uid != crgetuid(cr))) {
		/* Old-style assignment-in-condition; value tested is the error. */
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			mutex_exit(&zp->z_lock);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	vap->va_type = IFTOVT(pzp->zp_mode);
	vap->va_mode = pzp->zp_mode & ~S_IFMT;
	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
//	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
	vap->va_nodeid = zp->z_id;
	/* Root of the dataset gets an extra link for the .zfs control dir. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = pzp->zp_links + 1;
	else
		links = pzp->zp_links;
	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
	vap->va_size = pzp->zp_size;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((pzp->zp_flags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((pzp->zp_flags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((pzp->zp_flags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((pzp->zp_flags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((pzp->zp_flags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG &&
		    (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			/*
			 * Only VREG files have anti-virus scanstamps, so we
			 * won't conflict with symlinks in the bonus buffer.
			 */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			if (len <= doi.doi_bonus_size) {
				/*
				 * pzp points to the start of the
				 * znode_phys_t. pzp + 1 points to the
				 * first byte after the znode_phys_t.
				 */
				(void) memcpy(xoap->xoa_av_scanstamp,
				    pzp + 1,
				    sizeof (xoap->xoa_av_scanstamp));
				XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
			ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
			XVA_SET_RTN(xvap, XAT_CREATETIME);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);

	mutex_exit(&zp->z_lock);

	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 * IN:	vp	- vnode of file to be modified.
 *	vap	- new attribute values.
 *		  If AT_XVATTR set, then optional attrs are being set
 *	flags	- ATTR_UTIME set if non-default time values provided.
 *		- ATTR_NOACLCHECK (CIFS context only).
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime updated, mtime updated if size changed.
 */
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	znode_phys_t	*pzp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	vattr_t		oldva;
	uint_t		mask = vap->va_mask;
	uint_t		saved_mask;
	int		trim_mask = 0;
	uint64_t	new_mode;
	znode_t		*attrzp;
	int		need_policy = FALSE;
	int		err;
	zfs_fuid_info_t	*fuidp = NULL;
	xvattr_t	*xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t	*xoap;
	zfs_acl_t	*aclp = NULL;
	boolean_t	skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	if (mask == 0)
		return (0);

	if (mask & AT_NOSET)
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	pzp = zp->z_phys;
	zilog = zfsvfs->z_log;

	/*
	 * Make sure that if we have ephemeral uid/gid or xvattr specified
	 * that file system is at proper version level
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
	    (mask & AT_XVATTR))) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	if (mask & AT_SIZE && vp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (EISDIR);
	}

	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * If this is an xvattr_t, then get a pointer to the structure of
	 * optional attributes.  If this is NULL, then we have a vattr_t.
	 */
	xoap = xva_getxoptattr(xvap);

	/*
	 * Immutable files can only alter immutable bit and atime
	 */
	if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	/*
	 * Verify timestamps doesn't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32bit syscalls can't
	 * handle times greater than 2039.  This check should be removed
	 * once large timestamps are fully supported.
	 */
	if (mask & (AT_ATIME | AT_MTIME)) {
		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
			ZFS_EXIT(zfsvfs);
			return (EOVERFLOW);
		}
	}

	/*
	 * Retry point: we come back here when dmu_tx_assign() returns
	 * ERESTART under TXG_NOWAIT, after waiting for the next txg.
	 */
top:
	attrzp = NULL;

	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (EROFS);
	}

	/*
	 * First validate permissions
	 */

	if (mask & AT_SIZE) {
		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		/*
		 * XXX - Note, we are not providing any open
		 * mode flags here (like FNDELAY), so we may
		 * block if there are locks present... this
		 * should be addressed in openat().
		 */
		/* XXX - would it be OK to generate a log record here? */
		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	if (mask & (AT_ATIME|AT_MTIME) ||
	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
	    XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
		    skipaclchk, cr);

	if (mask & (AT_UID|AT_GID)) {
		int	idmask = (mask & (AT_UID|AT_GID));
		int	take_owner;
		int	take_group;

		/*
		 * NOTE: even if a new mode is being set,
		 * we may clear S_ISUID/S_ISGID bits.
		 */

		if (!(mask & AT_MODE))
			vap->va_mode = pzp->zp_mode;

		/*
		 * Take ownership or chgrp to group we are a member of
		 */

		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
		take_group = (mask & AT_GID) &&
		    zfs_groupmember(zfsvfs, vap->va_gid, cr);

		/*
		 * If both AT_UID and AT_GID are set then take_owner and
		 * take_group must both be set in order to allow taking
		 * ownership.
		 *
		 * Otherwise, send the check through secpolicy_vnode_setattr()
		 *
		 */

		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
		    ((idmask == AT_UID) && take_owner) ||
		    ((idmask == AT_GID) && take_group)) {
			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
			    skipaclchk, cr) == 0) {
				/*
				 * Remove setuid/setgid for non-privileged users
				 */
				secpolicy_setid_clear(vap, vp, cr);
				trim_mask = (mask & (AT_UID|AT_GID));
			} else {
				need_policy = TRUE;
			}
		} else {
			need_policy = TRUE;
		}
	}

	mutex_enter(&zp->z_lock);
	oldva.va_mode = pzp->zp_mode;
	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
	if (mask & AT_XVATTR) {
		/*
		 * NOTE(review): due to &&/|| precedence the
		 * (need_policy == FALSE) guard binds only to the first
		 * (XAT_APPENDONLY) clause below.  This is benign — if
		 * need_policy is already TRUE, re-setting it changes
		 * nothing — but the expression reads misleadingly.
		 */
		if ((need_policy == FALSE) &&
		    (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) &&
		    xoap->xoa_appendonly !=
		    ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) &&
		    xoap->xoa_nounlink !=
		    ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) &&
		    xoap->xoa_immutable !=
		    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_NODUMP) &&
		    xoap->xoa_nodump !=
		    ((pzp->zp_flags & ZFS_NODUMP) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) &&
		    xoap->xoa_av_modified !=
		    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) ||
		    ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
		    ((vp->v_type != VREG && xoap->xoa_av_quarantined) ||
		    xoap->xoa_av_quarantined !=
		    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) ||
		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
		    (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
			need_policy = TRUE;
		}
	}

	mutex_exit(&zp->z_lock);

	if (mask & AT_MODE) {
		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
			err = secpolicy_setid_setsticky_clear(vp, vap,
			    &oldva, cr);
			if (err) {
				ZFS_EXIT(zfsvfs);
				return (err);
			}
			trim_mask |= AT_MODE;
		} else {
			need_policy = TRUE;
		}
	}

	if (need_policy) {
		/*
		 * If trim_mask is set then take ownership
		 * has been granted or write_acl is present and user
		 * has the ability to modify mode.  In that case remove
		 * UID|GID and or MODE from mask so that
		 * secpolicy_vnode_setattr() doesn't revoke it.
		 */

		if (trim_mask) {
			saved_mask = vap->va_mask;
			vap->va_mask &= ~trim_mask;
		}
		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}

		if (trim_mask)
			vap->va_mask |= saved_mask;
	}

	/*
	 * secpolicy_vnode_setattr, or take ownership may have
	 * changed va_mask
	 */
	mask = vap->va_mask;

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	/* Ephemeral (Windows SID) ids may need to grow the FUID table. */
	if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) {
		if (zfsvfs->z_fuid_obj == 0) {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
		} else {
			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
		}
	}

	if (mask & AT_MODE) {
		uint64_t pmode = pzp->zp_mode;

		/* Preserve the file-type bits; only the permission bits change. */
		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) {
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		if (pzp->zp_acl.z_acl_extern_obj) {
			/* Are we upgrading ACL from old V0 format to new V1 */
			if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
			    pzp->zp_acl.z_acl_version ==
			    ZFS_ACL_VERSION_INITIAL) {
				dmu_tx_hold_free(tx,
				    pzp->zp_acl.z_acl_extern_obj, 0,
				    DMU_OBJECT_END);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
				    0, aclp->z_acl_bytes);
			} else {
				dmu_tx_hold_write(tx,
				    pzp->zp_acl.z_acl_extern_obj, 0,
				    aclp->z_acl_bytes);
			}
		} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, aclp->z_acl_bytes);
		}
	}

	/* Chown/chgrp must also update the hidden extended-attribute dir. */
	if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) {
		err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
		if (err) {
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			if (aclp)
				zfs_acl_free(aclp);
			return (err);
		}
		dmu_tx_hold_bonus(tx, attrzp->z_id);
	}

	err = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (err) {
		if (attrzp)
			VN_RELE(ZTOV(attrzp));

		if (aclp) {
			zfs_acl_free(aclp);
			aclp = NULL;
		}

		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (err);
	}

	dmu_buf_will_dirty(zp->z_dbuf, tx);

	/*
	 * Set each attribute requested.
	 * We group settings according to the locks they need to acquire.
	 *
	 * Note: you cannot set ctime directly, although it will be
	 * updated as a side-effect of calling this function.
	 */

	mutex_enter(&zp->z_lock);

	if (mask & AT_MODE) {
		mutex_enter(&zp->z_acl_lock);
		zp->z_phys->zp_mode = new_mode;
		err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
		ASSERT3U(err, ==, 0);
		mutex_exit(&zp->z_acl_lock);
	}

	if (attrzp)
		mutex_enter(&attrzp->z_lock);

	if (mask & AT_UID) {
		pzp->zp_uid = zfs_fuid_create(zfsvfs,
		    vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
		if (attrzp) {
			attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs,
			    vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
		}
	}

	if (mask & AT_GID) {
		pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid,
		    cr, ZFS_GROUP, tx, &fuidp);
		if (attrzp)
			attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs,
			    vap->va_gid, cr, ZFS_GROUP, tx, &fuidp);
	}

	if (aclp)
		zfs_acl_free(aclp);

	if (attrzp)
		mutex_exit(&attrzp->z_lock);

	if (mask & AT_ATIME)
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);

	if (mask & AT_MTIME)
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);

	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
	if (mask & AT_SIZE)
		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
	else if (mask != 0)
		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
	/*
	 * Do this after setting timestamps to prevent timestamp
	 * update from toggling bit
	 */

	if (xoap && (mask & AT_XVATTR)) {
		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			ASSERT(vp->v_type == VREG);

			/* Grow the bonus buffer if necessary. */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			if (len > doi.doi_bonus_size)
				VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
		}
		zfs_xvattr_set(zp, xvap);
	}

	if (mask != 0)
		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

	if (fuidp)
		zfs_fuid_info_free(fuidp);
	mutex_exit(&zp->z_lock);

	if (attrzp)
		VN_RELE(ZTOV(attrzp));

	dmu_tx_commit(tx);

	ZFS_EXIT(zfsvfs);
	return (err);
}

/* One element of the lock chain built by zfs_rename_lock(). */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;

/*
 * Drop locks and release vnodes that were held by zfs_rename_lock().
 */
static void
zfs_rename_unlock(zfs_zlock_t **zlpp)
{
	zfs_zlock_t *zl;

	while ((zl = *zlpp) != NULL) {
		if (zl->zl_znode != NULL)
			VN_RELE(ZTOV(zl->zl_znode));
		rw_exit(zl->zl_rwlock);
		*zlpp = zl->zl_next;
		kmem_free(zl, sizeof (*zl));
	}
}

/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	*oidp = &zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 * (deadlock avoidance: back off in favor
				 * of the other rename in progress).
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = &zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (*oidp == szp->z_id)		/* We're a descendant of szp */
			return (EINVAL);

		if (*oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
			if (error)
				return (error);
			zl->zl_znode = zp;
		}
		oidp = &zp->z_phys->zp_parent;
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 * IN:	sdvp	- Source directory containing the "old entry".
 *	snm	- Old entry name.
 *	tdvp	- Target directory to contain the "new entry".
 *	tnm	- New entry name.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr;
	int		error = 0;
	int		zflg = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(sdzp);
	zilog = zfsvfs->z_log;

	/*
	 * Make sure we have the real vp for the target directory.
	 */
	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
		tdvp = realvp;

	/* Cross-filesystem renames are not supported. */
	if (tdvp->v_vfsp != sdvp->v_vfsp) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}

	tdzp = VTOZ(tdvp);
	ZFS_VERIFY_ZP(tdzp);
	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

	/*
	 * Retry point: we come back here when dmu_tx_assign() returns
	 * ERESTART under TXG_NOWAIT, after waiting for the next txg.
	 */
top:
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/out of an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		/*
		 * First compare the two name arguments without
		 * considering any case folding.
		 */
		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
		ASSERT(error == 0 || !zfsvfs->z_utf8);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
		/*
		 * If the file system is case-folding, then we may
		 * have some more checking to do.  A case-folding file
		 * system is either supporting mixed case sensitivity
		 * access or is completely case-insensitive.  Note
		 * that the file system is always case preserving.
		 *
		 * In mixed sensitivity mode case sensitive behavior
		 * is the default.  FIGNORECASE must be used to
		 * explicitly request case insensitive behavior.
		 *
		 * If the source and target names provided differ only
		 * by case (e.g., a request to rename 'tim' to 'Tim'),
		 * we will treat this as a special case in the
		 * case-insensitive mode: as long as the source name
		 * is an exact match, we will allow this to proceed as
		 * a name-change request.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
		    flags & FIGNORECASE)) &&
		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
		    &error) == 0) {
			/*
			 * case preserving rename request, require exact
			 * name matches
			 */
			zflg |= ZCIEXACT;
			zflg &= ~ZCILOOK;
		}
	}

	/* Acquire the two dirent locks in the order established above. */
	if (cmp < 0) {
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
		    ZEXISTS | zflg, NULL, NULL);
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
	} else {
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, zflg, NULL, NULL);
		serr = zfs_dirent_lock(&sdl,
		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
		    NULL, NULL);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}
		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
			serr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));
		if (strcmp(tnm, "..") == 0)
			terr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = ENOTDIR;
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
	if (tzp)
		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp)
		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
	if (tzp)
		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		if (zl != NULL)
			zfs_rename_unlock(&zl);
		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);
		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

	if (error == 0) {
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;

			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			ASSERT(error == 0);

			zfs_log_rename(zilog, tx,
			    TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
			    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);

			/* Update path information for the target vnode */
			vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
		}
#ifdef FREEBSD_NAMECACHE
		if (error == 0) {
			cache_purge(sdvp);
			cache_purge(tdvp);
		}
#endif
	}

	dmu_tx_commit(tx);
out:
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 * IN:	dvp	- Directory to contain new symbolic link.
 *	link	- Name for new symlink entry.
 *	vap	- Attributes of new entry.
 *	target	- Target path of new symlink.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*
 * NOTE(review): the header comment above documents the upstream Solaris
 * signature; this FreeBSD variant takes (dvp, vpp, name, vap, link, cr, td)
 * and hard-codes flags = 0 locally, so the FIGNORECASE branches below are
 * currently dead code.
 */
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
    cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	int		len = strlen(link);
	int		error;
	int		zflg = ZNEW;
	zfs_fuid_info_t	*fuidp = NULL;
	int		flags = 0;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

	/*
	 * Retry point: we come back here when dmu_tx_assign() returns
	 * ERESTART under TXG_NOWAIT, after waiting for the next txg.
	 */
top:
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (ENAMETOOLONG);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_bonus(tx, dzp->z_id);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
	/* Ephemeral (Windows SID) ids may need to grow the FUID table. */
	if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
		if (zfsvfs->z_fuid_obj == 0) {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
		} else {
			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
		}
	}
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	dmu_buf_will_dirty(dzp->z_dbuf, tx);

	/*
	 * Create a new object for the symlink.
	 * Put the link content into bonus buffer if it will fit;
	 * otherwise, store it just like any other file data.
	 */
	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
		if (len != 0)
			bcopy(link, zp->z_phys + 1, len);
	} else {
		dmu_buf_t *dbp;

		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
		/*
		 * Nothing can access the znode yet so no locking needed
		 * for growing the znode's blocksize.
		 */
		zfs_grow_blocksize(zp, len, tx);

		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
		    zp->z_id, 0, FTAG, &dbp));
		dmu_buf_will_dirty(dbp, tx);

		ASSERT3U(len, <=, dbp->db_size);
		bcopy(link, dbp->db_data, len);
		dmu_buf_rele(dbp, FTAG);
	}
	zp->z_phys->zp_size = len;

	/*
	 * Insert the new object into the directory.
	 * NOTE(review): the return value of zfs_link_create() is discarded
	 * here, and the "out:" label below has no corresponding goto in this
	 * function (all earlier error paths return directly) — at this point
	 * error is necessarily 0, so the label is effectively dead code.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);
out:
	if (error == 0) {
		uint64_t txtype = TX_SYMLINK;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
		*vpp = ZTOV(zp);
	}
	if (fuidp)
		zfs_fuid_info_free(fuidp);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 * IN:	vp	- vnode of symbolic link.
 *	uio	- structure to contain the link path.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	uio	- structure to contain the link path.
3508 * 3509 * RETURN: 0 if success 3510 * error code if failure 3511 * 3512 * Timestamps: 3513 * vp - atime updated 3514 */ 3515/* ARGSUSED */ 3516static int 3517zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 3518{ 3519 znode_t *zp = VTOZ(vp); 3520 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3521 size_t bufsz; 3522 int error; 3523 3524 ZFS_ENTER(zfsvfs); 3525 ZFS_VERIFY_ZP(zp); 3526 3527 bufsz = (size_t)zp->z_phys->zp_size; 3528 if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { 3529 error = uiomove(zp->z_phys + 1, 3530 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 3531 } else { 3532 dmu_buf_t *dbp; 3533 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); 3534 if (error) { 3535 ZFS_EXIT(zfsvfs); 3536 return (error); 3537 } 3538 error = uiomove(dbp->db_data, 3539 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 3540 dmu_buf_rele(dbp, FTAG); 3541 } 3542 3543 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 3544 ZFS_EXIT(zfsvfs); 3545 return (error); 3546} 3547 3548/* 3549 * Insert a new entry into directory tdvp referencing svp. 3550 * 3551 * IN: tdvp - Directory to contain new entry. 3552 * svp - vnode of new entry. 3553 * name - name of new entry. 3554 * cr - credentials of caller. 
 *		ct	- caller context
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	tdvp - ctime|mtime updated
 *	 svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	vnode_t		*realvp;
	int		error;
	int		zf = ZNEW;
	uid_t		owner;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Strip any stacked filesystem vnode to get the real source. */
	if (VOP_REALVP(svp, &realvp, ct) == 0)
		svp = realvp;

	/* Hard links may not cross filesystem boundaries. */
	if (svp->v_vfsp != tdvp->v_vfsp) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}
	szp = VTOZ(svp);
	ZFS_VERIFY_ZP(szp);

	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

top:
	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	/* Only the owner (or suitably privileged caller) may link. */
	owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) &&
	    secpolicy_basic_link(svp, cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, szp->z_id);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			/* Transient txg contention: wait and retry. */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dl, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
	}

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Release the last reference to a vnode: push any dirty atime and
 * hand the znode back to zfs_zinactive().
 */
/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_dbuf == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		mutex_enter(&zp->z_lock);
		VI_LOCK(vp);
		vp->v_count = 0; /* count arrives as 1 */
		mutex_exit(&zp->z_lock);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		zfs_znode_free(zp);
		return;
	}

	/* Flush a dirty atime unless the file is already unlinked. */
	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_bonus(tx, zp->z_id);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			dmu_buf_will_dirty(zp->z_dbuf, tx);
			mutex_enter(&zp->z_lock);
			zp->z_atime_dirty = 0;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}

	zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}

/* ZFS file IDs must fit inside the generic struct fid. */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));

/*
 * Build an NFS-style file identifier for vp: object id + generation,
 * plus the objset id when this is not the parent (root) filesystem.
 */
/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	uint32_t	gen;
	uint64_t	object = zp->z_id;
	zfid_short_t	*zfid;
	int		size, i;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	gen = (uint32_t)zp->z_gen;

	size = (zfsvfs->z_parent != zfsvfs) ?
	    LONG_FID_LEN : SHORT_FID_LEN;
	fidp->fid_len = size;

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	/* Store the object number little-endian, byte by byte. */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t	*zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Answer pathconf(2) queries for the values ZFS controls; anything
 * else is punted to the caller (vop_stdpathconf via EOPNOTSUPP).
 */
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp, *xzp;
	zfsvfs_t	*zfsvfs;
	zfs_dirlock_t	*dl;
	int		error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = INT_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		*valp = 64;
		return (0);

#if 0
	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		error = zfs_dirent_lock(&dl, zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
		if (error == 0) {
			zfs_dirent_unlock(dl);
			if (!zfs_dirempty(xzp))
				*valp = 1;
			VN_RELE(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);
#endif

	case _PC_ACL_EXTENDED:
		*valp = 0;	/* TODO */
		return (0);

	case _PC_MIN_HOLE_SIZE:
		*valp = (int)SPA_MINBLOCKSIZE;
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}

#ifdef TODO
/* Retrieve the ACL of vp (not yet wired up on FreeBSD). */
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);

	return (error);
}
#endif	/* TODO */

#ifdef TODO
/* Set the ACL of vp (not yet wired up on FreeBSD). */
/*ARGSUSED*/
static int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ?
	    B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);
	return (error);
}
#endif	/* TODO */

/*
 * The functions below are thin FreeBSD VOP wrappers that translate
 * struct vop_*_args into calls to the Solaris-style zfs_*() entry
 * points above.
 */

static int
zfs_freebsd_open(ap)
	struct vop_open_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t	*zp = VTOZ(vp);
	int error;

	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
	if (error == 0)
		/* Back the vnode with a VM object sized to the file. */
		vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
	return (error);
}

static int
zfs_freebsd_close(ap)
	struct vop_close_args /* {
		struct vnode *a_vp;
		int a_fflag;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
}

static int
zfs_freebsd_ioctl(ap)
	struct vop_ioctl_args /* {
		struct vnode *a_vp;
		u_long a_command;
		caddr_t a_data;
		int a_fflag;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
	    ap->a_fflag, ap->a_cred, NULL, NULL));
}

static int
zfs_freebsd_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
}

static int
zfs_freebsd_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
}

static int
zfs_freebsd_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		int a_accmode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	return (zfs_access(ap->a_vp, ap->a_accmode, 0, ap->a_cred, NULL));
}

static int
zfs_freebsd_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	char nm[NAME_MAX + 1];

	/* cn_nameptr is not NUL-terminated; copy to a bounded buffer. */
	ASSERT(cnp->cn_namelen < sizeof(nm));
	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));

	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
	    cnp->cn_cred, cnp->cn_thread, 0));
}

static int
zfs_freebsd_create(ap)
	struct vop_create_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;
	int mode;

	ASSERT(cnp->cn_flags & SAVENAME);

	vattr_init_mask(vap);
	mode = vap->va_mode & ALLPERMS;

	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
}

static int
zfs_freebsd_remove(ap)
	struct vop_remove_args /* {
		struct vnode *a_dvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
	    ap->a_cnp->cn_cred, NULL, 0));
}

static int
zfs_freebsd_mkdir(ap)
	struct vop_mkdir_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	vattr_init_mask(vap);

	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
	    ap->a_cnp->cn_cred, NULL, 0, NULL));
}

static int
zfs_freebsd_rmdir(ap)
	struct vop_rmdir_args /* {
		struct vnode *a_dvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;

	ASSERT(cnp->cn_flags & SAVENAME);

	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
}

static int
zfs_freebsd_readdir(ap)
	struct vop_readdir_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
		int *a_eofflag;
		int *a_ncookies;
		u_long **a_cookies;
	} */ *ap;
{

	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
	    ap->a_ncookies, ap->a_cookies));
}

static int
zfs_freebsd_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		int a_waitfor;
		struct thread *a_td;
	} */ *ap;
{

	/* Flush buffered/mapped pages first, then sync the ZIL. */
	vop_stdfsync(ap);
	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
}

static int
zfs_freebsd_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;
	xvattr_t xvap;
	u_long fflags = 0;
	int error;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;
	xvap.xva_vattr.va_mask |= AT_XVATTR;

	/* Convert chflags into ZFS-type flags. */
	/* XXX: what about SF_SETTABLE?. */
	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
	XVA_SET_REQ(&xvap, XAT_NODUMP);
	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
	if (error != 0)
		return (error);

	/* Convert ZFS xattr into chflags.
*/ 4108#define FLAG_CHECK(fflag, xflag, xfield) do { \ 4109 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ 4110 fflags |= (fflag); \ 4111} while (0) 4112 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, 4113 xvap.xva_xoptattrs.xoa_immutable); 4114 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, 4115 xvap.xva_xoptattrs.xoa_appendonly); 4116 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, 4117 xvap.xva_xoptattrs.xoa_nounlink); 4118 FLAG_CHECK(UF_NODUMP, XAT_NODUMP, 4119 xvap.xva_xoptattrs.xoa_nodump); 4120#undef FLAG_CHECK 4121 *vap = xvap.xva_vattr; 4122 vap->va_flags = fflags; 4123 return (0); 4124} 4125 4126static int 4127zfs_freebsd_setattr(ap) 4128 struct vop_setattr_args /* { 4129 struct vnode *a_vp; 4130 struct vattr *a_vap; 4131 struct ucred *a_cred; 4132 struct thread *a_td; 4133 } */ *ap; 4134{ 4135 vattr_t *vap = ap->a_vap; 4136 xvattr_t xvap; 4137 u_long fflags; 4138 uint64_t zflags; 4139 4140 vattr_init_mask(vap); 4141 vap->va_mask &= ~AT_NOSET; 4142 4143 xva_init(&xvap); 4144 xvap.xva_vattr = *vap; 4145 4146 if (vap->va_flags != VNOVAL) { 4147 fflags = vap->va_flags; 4148 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0) 4149 return (EOPNOTSUPP); 4150 zflags = VTOZ(ap->a_vp)->z_phys->zp_flags; 4151 4152#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ 4153 if (((fflags & (fflag)) && !(zflags & (zflag))) || \ 4154 ((zflags & (zflag)) && !(fflags & (fflag)))) { \ 4155 XVA_SET_REQ(&xvap, (xflag)); \ 4156 (xfield) = ((fflags & (fflag)) != 0); \ 4157 } \ 4158} while (0) 4159 /* Convert chflags into ZFS-type flags. */ 4160 /* XXX: what about SF_SETTABLE?. 
*/ 4161 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, 4162 xvap.xva_xoptattrs.xoa_immutable); 4163 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, 4164 xvap.xva_xoptattrs.xoa_appendonly); 4165 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, 4166 xvap.xva_xoptattrs.xoa_nounlink); 4167 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, 4168 xvap.xva_xoptattrs.xoa_nounlink); 4169#undef FLAG_CHANGE 4170 } 4171 return (zfs_setattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL)); 4172} 4173 4174static int 4175zfs_freebsd_rename(ap) 4176 struct vop_rename_args /* { 4177 struct vnode *a_fdvp; 4178 struct vnode *a_fvp; 4179 struct componentname *a_fcnp; 4180 struct vnode *a_tdvp; 4181 struct vnode *a_tvp; 4182 struct componentname *a_tcnp; 4183 } */ *ap; 4184{ 4185 vnode_t *fdvp = ap->a_fdvp; 4186 vnode_t *fvp = ap->a_fvp; 4187 vnode_t *tdvp = ap->a_tdvp; 4188 vnode_t *tvp = ap->a_tvp; 4189 int error; 4190 4191 ASSERT(ap->a_fcnp->cn_flags & SAVENAME); 4192 ASSERT(ap->a_tcnp->cn_flags & SAVENAME); 4193 4194 error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp, 4195 ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0); 4196 4197 if (tdvp == tvp) 4198 VN_RELE(tdvp); 4199 else 4200 VN_URELE(tdvp); 4201 if (tvp) 4202 VN_URELE(tvp); 4203 VN_RELE(fdvp); 4204 VN_RELE(fvp); 4205 4206 return (error); 4207} 4208 4209static int 4210zfs_freebsd_symlink(ap) 4211 struct vop_symlink_args /* { 4212 struct vnode *a_dvp; 4213 struct vnode **a_vpp; 4214 struct componentname *a_cnp; 4215 struct vattr *a_vap; 4216 char *a_target; 4217 } */ *ap; 4218{ 4219 struct componentname *cnp = ap->a_cnp; 4220 vattr_t *vap = ap->a_vap; 4221 4222 ASSERT(cnp->cn_flags & SAVENAME); 4223 4224 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. 
 */
	vattr_init_mask(vap);

	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
}

static int
zfs_freebsd_readlink(ap)
	struct vop_readlink_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
}

static int
zfs_freebsd_link(ap)
	struct vop_link_args /* {
		struct vnode *a_tdvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;

	ASSERT(cnp->cn_flags & SAVENAME);

	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
}

static int
zfs_freebsd_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;

	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
	return (0);
}

/*
 * Taskqueue callback: finish destroying a znode whose reclaim had to
 * be deferred because the object mutex could not be taken safely.
 */
static void
zfs_reclaim_complete(void *arg, int pending)
{
	znode_t	*zp = arg;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_LOG(1, "zp=%p", zp);
	ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
	zfs_znode_free(zp);
}

static int
zfs_freebsd_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs;

	ASSERT(zp != NULL);

	/*
	 * Destroy the vm object and flush associated pages.
	 */
	vnode_destroy_vobject(vp);

	mutex_enter(&zp->z_lock);
	ASSERT(zp->z_phys);
	ZTOV(zp) = NULL;
	if (!zp->z_unlinked) {
		/*
		 * locked: 2 = we already held the object mutex,
		 *         1 = we acquired it here, 0 = try failed.
		 */
		int locked;

		zfsvfs = zp->z_zfsvfs;
		mutex_exit(&zp->z_lock);
		locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
		    ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
		if (locked == 0) {
			/*
			 * Lock can't be obtained due to deadlock possibility,
			 * so defer znode destruction.
			 */
			TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
			taskqueue_enqueue(taskqueue_thread, &zp->z_task);
		} else {
			zfs_znode_dmu_fini(zp);
			if (locked == 1)
				ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
			zfs_znode_free(zp);
		}
	} else {
		mutex_exit(&zp->z_lock);
	}
	VI_LOCK(vp);
	vp->v_data = NULL;
	ASSERT(vp->v_holdcnt >= 1);
	VI_UNLOCK(vp);
	return (0);
}

static int
zfs_freebsd_fid(ap)
	struct vop_fid_args /* {
		struct vnode *a_vp;
		struct fid *a_fid;
	} */ *ap;
{

	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
}

static int
zfs_freebsd_pathconf(ap)
	struct vop_pathconf_args /* {
		struct vnode *a_vp;
		int a_name;
		register_t *a_retval;
	} */ *ap;
{
	ulong_t val;
	int error;

	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
	if (error == 0)
		*ap->a_retval = val;
	else if (error == EOPNOTSUPP)
		/* Fall back to the generic defaults for names ZFS ignores. */
		error = vop_stdpathconf(ap);
	return (error);
}

/*
 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
 * extended attribute name:
 *
 * NAMESPACE	PREFIX	
 * system	freebsd:system:
 * user	(none, can be used to access ZFS fsattr(5) attributes
 *	created on Solaris)
 */
static int
zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
    size_t size)
{
	const char *namespace, *prefix, *suffix;

	/* We don't allow '/' character in attribute name. */
	if (strchr(name, '/') != NULL)
		return (EINVAL);
	/* We don't allow attribute names that start with "freebsd:" string.
 */
	if (strncmp(name, "freebsd:", 8) == 0)
		return (EINVAL);

	bzero(attrname, size);

	switch (attrnamespace) {
	case EXTATTR_NAMESPACE_USER:
#if 0
		prefix = "freebsd:";
		namespace = EXTATTR_NAMESPACE_USER_STRING;
		suffix = ":";
#else
		/*
		 * This is the default namespace by which we can access all
		 * attributes created on Solaris.
		 */
		prefix = namespace = suffix = "";
#endif
		break;
	case EXTATTR_NAMESPACE_SYSTEM:
		prefix = "freebsd:";
		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
		suffix = ":";
		break;
	case EXTATTR_NAMESPACE_EMPTY:
	default:
		return (EINVAL);
	}
	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
	    name) >= size) {
		return (ENAMETOOLONG);
	}
	return (0);
}

/*
 * Vnode operating to retrieve a named extended attribute.
 */
static int
zfs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrname[255];
	struct vattr va;
	vnode_t *xvp = NULL, *vp;
	int error, flags;

	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	    sizeof(attrname));
	if (error != 0)
		return (error);

	ZFS_ENTER(zfsvfs);

	/* Attributes are files inside the hidden xattr directory. */
	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	flags = FREAD;
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
	    xvp, td);
	error = vn_open_cred(&nd, &flags, 0, ap->a_cred, NULL);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Caller wants either the attribute size or its contents. */
	if (ap->a_size != NULL) {
		error = VOP_GETATTR(vp, &va, ap->a_cred);
		if (error == 0)
			*ap->a_size = (size_t)va.va_size;
	} else if (ap->a_uio != NULL)
		error = VOP_READ(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);

	VOP_UNLOCK(vp, 0);
	vn_close(vp, flags, ap->a_cred, td);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Vnode operation to remove a named attribute.
 */
int
zfs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrname[255];
	struct vattr va;	/* NOTE(review): unused in this function */
	vnode_t *xvp = NULL, *vp;
	int error, flags;	/* NOTE(review): flags is unused here */

	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	    sizeof(attrname));
	if (error != 0)
		return (error);

	ZFS_ENTER(zfsvfs);

	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE,
	    UIO_SYSSPACE, attrname, xvp, td);
	error = namei(&nd);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	VOP_LEASE(nd.ni_dvp, td, ap->a_cred, LEASE_WRITE);
	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);

	vput(nd.ni_dvp);
	if (vp == nd.ni_dvp)
		vrele(vp);
	else
		vput(vp);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
zfs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrname[255];
	struct vattr va;
	vnode_t *xvp = NULL, *vp;
	int error, flags;

	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	    sizeof(attrname));
	if (error != 0)
		return (error);

	ZFS_ENTER(zfsvfs);

	/* Attributes are files inside the hidden xattr directory. */
	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	flags = FFLAGS(O_WRONLY | O_CREAT);
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
	    xvp, td);
	error = vn_open_cred(&nd, &flags, 0600, ap->a_cred, NULL);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Truncate any previous contents, then write the new value. */
	VOP_LEASE(vp, td, ap->a_cred, LEASE_WRITE);
	VATTR_NULL(&va);
	va.va_size = 0;
	error = VOP_SETATTR(vp, &va, ap->a_cred);
	if (error == 0)
		VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);

	VOP_UNLOCK(vp, 0);
	vn_close(vp, flags, ap->a_cred, td);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
zfs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrprefix[16];
	u_char dirbuf[sizeof(struct dirent)];
	struct dirent *dp;
	struct iovec aiov;
	struct uio auio, *uio = ap->a_uio;
	size_t *sizep = ap->a_size;
	size_t plen;
	vnode_t *xvp = NULL, *vp;
	int done, error, eof, pos;

	/* Build the namespace prefix we must match against entry names. */
	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
	    sizeof(attrprefix));
	if (error != 0)
		return (error);
	plen = strlen(attrprefix);

	ZFS_ENTER(zfsvfs);

	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE, UIO_SYSSPACE,
	    ".", xvp, td);
	error = namei(&nd);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;

	if (sizep != NULL)
		*sizep = 0;

	/* Scan the xattr directory one dirent buffer at a time. */
	do {
		u_char nlen;

		aiov.iov_base = (void *)dirbuf;
		aiov.iov_len = sizeof(dirbuf);
		auio.uio_resid = sizeof(dirbuf);
		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
		done = sizeof(dirbuf) - auio.uio_resid;
		if (error != 0)
			break;
		for (pos = 0; pos < done;) {
			dp = (struct dirent *)(dirbuf + pos);
			pos += dp->d_reclen;
			/*
			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
			 * is
			 * what we get when attribute was created on Solaris.
			 */
			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
				continue;
			/* Skip names outside the requested namespace. */
			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
				continue;
			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
				continue;
			nlen = dp->d_namlen - plen;
			if (sizep != NULL)
				*sizep += 1 + nlen;
			else if (uio != NULL) {
				/*
				 * Format of extattr name entry is one byte for
				 * length and the rest for name.
				 */
				error = uiomove(&nlen, 1, uio->uio_rw, uio);
				if (error == 0) {
					error = uiomove(dp->d_name + plen, nlen,
					    uio->uio_rw, uio);
				}
				if (error != 0)
					break;
			}
		}
	} while (!eof && error == 0);

	vput(vp);
	ZFS_EXIT(zfsvfs);

	return (error);
}

struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;

/* Vnode operations for regular ZFS vnodes. */
struct vop_vector zfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_access =		zfs_freebsd_access,
#ifdef FREEBSD_NAMECACHE
	.vop_lookup =		vfs_cache_lookup,
	.vop_cachedlookup =	zfs_freebsd_lookup,
#else
	.vop_lookup =		zfs_freebsd_lookup,
#endif
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_create =		zfs_freebsd_create,
	.vop_mknod =		zfs_freebsd_create,
	.vop_mkdir =		zfs_freebsd_mkdir,
	.vop_readdir =		zfs_freebsd_readdir,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_open =		zfs_freebsd_open,
	.vop_close =		zfs_freebsd_close,
	.vop_rmdir =		zfs_freebsd_rmdir,
	.vop_ioctl =		zfs_freebsd_ioctl,
	.vop_link =		zfs_freebsd_link,
	.vop_symlink =		zfs_freebsd_symlink,
	.vop_readlink =		zfs_freebsd_readlink,
	.vop_read =		zfs_freebsd_read,
	.vop_write =		zfs_freebsd_write,
	.vop_remove =		zfs_freebsd_remove,
	.vop_rename =		zfs_freebsd_rename,
	.vop_pathconf =		zfs_freebsd_pathconf,
	.vop_bmap =		VOP_EOPNOTSUPP,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getextattr =	zfs_getextattr,
	.vop_deleteextattr =	zfs_deleteextattr,
	.vop_setextattr =	zfs_setextattr,
	.vop_listextattr =	zfs_listextattr,
};

/*
 * Vnode operations for FIFOs on ZFS: data I/O goes through the
 * generic fifo code; attributes/lifecycle go through ZFS.
 */
struct vop_vector zfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		VOP_PANIC,
	.vop_access =		zfs_freebsd_access,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_write =		VOP_PANIC,
	.vop_fid =		zfs_freebsd_fid,
};