zfs_vnops.c revision 169167
190075Sobrien/* 290075Sobrien * CDDL HEADER START 390075Sobrien * 490075Sobrien * The contents of this file are subject to the terms of the 590075Sobrien * Common Development and Distribution License (the "License"). 690075Sobrien * You may not use this file except in compliance with the License. 790075Sobrien * 890075Sobrien * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 990075Sobrien * or http://www.opensolaris.org/os/licensing. 1090075Sobrien * See the License for the specific language governing permissions 1190075Sobrien * and limitations under the License. 1290075Sobrien * 1390075Sobrien * When distributing Covered Code, include this CDDL HEADER in each 1490075Sobrien * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 1590075Sobrien * If applicable, add the following below this CDDL HEADER, with the 1690075Sobrien * fields enclosed by brackets "[]" replaced with your own identifying 1790075Sobrien * information: Portions Copyright [yyyy] [name of copyright owner] 1890075Sobrien * 1990075Sobrien * CDDL HEADER END 2090075Sobrien */ 2190075Sobrien/* 2290075Sobrien * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 2390075Sobrien * Use is subject to license terms. 
24169689Skan */ 25169689Skan 2690075Sobrien#pragma ident "%Z%%M% %I% %E% SMI" 2790075Sobrien 28117395Skan#include <sys/types.h> 2990075Sobrien#include <sys/param.h> 30169689Skan#include <sys/time.h> 31117395Skan#include <sys/systm.h> 3290075Sobrien#include <sys/sysmacros.h> 33117395Skan#include <sys/resource.h> 3490075Sobrien#include <sys/vfs.h> 3590075Sobrien#include <sys/vnode.h> 3690075Sobrien#include <sys/file.h> 3790075Sobrien#include <sys/stat.h> 3890075Sobrien#include <sys/kmem.h> 3990075Sobrien#include <sys/taskq.h> 4090075Sobrien#include <sys/uio.h> 4190075Sobrien#include <sys/atomic.h> 4290075Sobrien#include <sys/namei.h> 4390075Sobrien#include <sys/mman.h> 4490075Sobrien#include <sys/cmn_err.h> 4590075Sobrien#include <sys/errno.h> 4690075Sobrien#include <sys/unistd.h> 4790075Sobrien#include <sys/zfs_vfsops.h> 4890075Sobrien#include <sys/zfs_dir.h> 49117395Skan#include <sys/zfs_acl.h> 50117395Skan#include <sys/zfs_ioctl.h> 51169689Skan#include <sys/fs/zfs.h> 52117395Skan#include <sys/dmu.h> 53117395Skan#include <sys/spa.h> 54117395Skan#include <sys/txg.h> 55117395Skan#include <sys/dbuf.h> 56117395Skan#include <sys/zap.h> 57117395Skan#include <sys/dirent.h> 58169689Skan#include <sys/policy.h> 5990075Sobrien#include <sys/sunddi.h> 6090075Sobrien#include <sys/filio.h> 6190075Sobrien#include <sys/zfs_ctldir.h> 6290075Sobrien#include <sys/dnlc.h> 63169689Skan#include <sys/zfs_rlock.h> 64169689Skan#include <sys/bio.h> 65169689Skan#include <sys/buf.h> 6690075Sobrien#include <sys/sf_buf.h> 6790075Sobrien#include <sys/sched.h> 68117395Skan 6990075Sobrien/* 7090075Sobrien * Programming rules. 7190075Sobrien * 7290075Sobrien * Each vnode op performs some logical unit of work. To do this, the ZPL must 7390075Sobrien * properly lock its in-core state, create a DMU transaction, do the work, 7490075Sobrien * record this work in the intent log (ZIL), commit the DMU transaction, 7590075Sobrien * and wait the the intent log to commit if it's is a synchronous operation. 
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.
 *
 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory.  Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *
 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
 *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
 *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
 *	This is critical because we don't want to block while holding locks.
 *	Note, in particular, that if a lock is sometimes acquired before
 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
 *	use a non-blocking assign can deadlock the system.
The scenario: 10390075Sobrien * 10490075Sobrien * Thread A has grabbed a lock before calling dmu_tx_assign(). 105169689Skan * Thread B is in an already-assigned tx, and blocks for this lock. 10690075Sobrien * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 10790075Sobrien * forever, because the previous txg can't quiesce until B's tx commits. 10890075Sobrien * 109169689Skan * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 110169689Skan * then drop all locks, call dmu_tx_wait(), and try again. 111169689Skan * 11290075Sobrien * (5) If the operation succeeded, generate the intent log entry for it 11390075Sobrien * before dropping locks. This ensures that the ordering of events 11490075Sobrien * in the intent log matches the order in which they actually occurred. 11590075Sobrien * 11690075Sobrien * (6) At the end of each vnode op, the DMU tx must always commit, 11790075Sobrien * regardless of whether there were any errors. 118117395Skan * 11990075Sobrien * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) 12090075Sobrien * to ensure that synchronous semantics are provided when necessary. 12190075Sobrien * 12290075Sobrien * In general, this is how things should be ordered in each vnode op: 12390075Sobrien * 12490075Sobrien * ZFS_ENTER(zfsvfs); // exit if unmounted 12590075Sobrien * top: 12690075Sobrien * zfs_dirent_lock(&dl, ...) 
 *					// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, seq, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/*
 * VOP_OPEN: no setup is required beyond bookkeeping.  Synchronous opens
 * (FSYNC/FDSYNC) are counted in z_sync_cnt; presumably the fsync/ZIL path
 * consults this count elsewhere -- not visible in this file chunk.
 */
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr)
{
	znode_t	*zp = VTOZ(*vpp);

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);
	return (0);
}

/*
 * VOP_CLOSE: undo the synchronous-open accounting done in zfs_open() and
 * release any POSIX locks/shares this process holds on the vnode.
 */
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
{
	znode_t	*zp = VTOZ(vp);

	/* Decrement the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_dec_32(&zp->z_sync_cnt);

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 *
 * Returns 0 on success with *off updated to the next hole/data boundary,
 * ENXIO when the offset is at/past EOF and no boundary applies, or an
 * error from dmu_offset_next().
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_phys->zp_size;
	if (noff >= file_sz) {
		return (ENXIO);
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	/* Ask the DMU for the next hole or data boundary at/after noff. */
	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* end of file? */
	if ((error == ESRCH) || (noff > file_sz)) {
		/*
		 * Handle the virtual hole at the end of file.
		 */
		if (hole) {
			*off = file_sz;
			return (0);
		}
		return (ENXIO);
	}

	/* Never move the caller's offset backwards; leave *off untouched. */
	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/*
 * VOP_IOCTL: handles a small set of file-system ioctls; everything else
 * gets ENOTTY.  "data" is a user-space pointer (hence ddi_copyin/copyout).
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp)
{
	offset_t off;
	int error;
	zfsvfs_t *zfsvfs;

	switch (com) {
	case _FIOFFS:
		return (0);

	/*
	 * The following two ioctls are used by bfu.  Faking out,
	 * necessary to avoid bfu errors.
	 */
	case _FIOGDIO:
	case _FIOSDIO:
		return (0);

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
		/* Copy the in/out offset argument from user space. */
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (EFAULT);

		zfsvfs = VTOZ(vp)->z_zfsvfs;
		ZFS_ENTER(zfsvfs);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (EFAULT);
		return (0);
	}
	return (ENOTTY);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 *	On Write:	If we find a memory mapped page, we write to *both*
 *			the page and the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 *
 * Implementation note: "dirbytes" accumulates the length of a run of
 * consecutive pages that are NOT resident in the VM object, so the whole
 * run can be pushed to the DMU with a single dmu_write_uio() call; resident
 * pages are written through the page mapping (sf_buf) AND dmu_write().
 */
static int
mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
{
	znode_t	*zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_object_t obj;
	vm_page_t m;
	struct sf_buf *sf;
	int64_t start, off;
	int len = nbytes;
	int error = 0;
	uint64_t dirbytes;

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;	/* offset within the first page */
	dirbytes = 0;
	VM_OBJECT_LOCK(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		uint64_t bytes = MIN(PAGESIZE - off, len);
		uint64_t fsize;

again:
		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
			uint64_t woff;
			caddr_t va;

			/* Page is busy elsewhere: sleep and re-lookup. */
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
				goto again;
			fsize = obj->un_pager.vnp.vnp_size;
			vm_page_busy(m);
			vm_page_lock_queues();
			vm_page_undirty(m);
			vm_page_unlock_queues();
			VM_OBJECT_UNLOCK(obj);
			/* Flush any pending run of non-resident bytes first. */
			if (dirbytes > 0) {
				error = dmu_write_uio(os, zp->z_id, uio,
				    dirbytes, tx);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				woff = uio->uio_loffset - off;
				error = uiomove(va + off, bytes, UIO_WRITE, uio);
				/*
				 * The uiomove() above could have been partially
				 * successful, that's why we call dmu_write()
				 * below unconditionally. The page was marked
				 * non-dirty above and we would lose the changes
				 * without doing so. If the uiomove() failed
				 * entirely, well, we just write what we got
				 * before one more time.
				 */
				dmu_write(os, zp->z_id, woff,
				    MIN(PAGESIZE, fsize - woff), va, tx);
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
		} else {
			/* Page not resident: defer to a batched dmu_write_uio(). */
			dirbytes += bytes;
		}
		len -= bytes;
		off = 0;	/* only the first page can start mid-page */
		if (error)
			break;
	}
	VM_OBJECT_UNLOCK(obj);
	/* Flush the trailing run of non-resident bytes, if any. */
	if (error == 0 && dirbytes > 0)
		error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
	return (error);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 *	On Read:	We "read" preferentially from memory mapped pages,
 *			else we default from the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_object_t obj;
	vm_page_t m;
	struct sf_buf *sf;
	int64_t start, off;
	caddr_t va;
	int len = nbytes;
	int error = 0;
	uint64_t dirbytes;

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;	/* offset within the first page */
	dirbytes = 0;			/* pending non-resident byte run */
	VM_OBJECT_LOCK(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		uint64_t bytes = MIN(PAGESIZE - off, len);

again:
		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
			/* Resident, valid page: copy out of the page itself. */
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
				goto again;
			vm_page_busy(m);
			VM_OBJECT_UNLOCK(obj);
			/* Flush any pending run of non-resident bytes first. */
			if (dirbytes > 0) {
				error = dmu_read_uio(os, zp->z_id, uio,
				    dirbytes);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				error = uiomove(va + off, bytes, UIO_READ, uio);
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * The code below is here to make sendfile(2) work
			 * correctly with ZFS. As pointed out by ups@
			 * sendfile(2) should be changed to use VOP_GETPAGES(),
			 * but it pessimizes performance of sendfile/UFS, that's
			 * why I handle this special case in ZFS code.
			 *
			 * UIO_NOCOPY: fill the page itself from the DMU rather
			 * than copying through the uio buffers.
			 */
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
				goto again;
			vm_page_busy(m);
			VM_OBJECT_UNLOCK(obj);
			if (dirbytes > 0) {
				error = dmu_read_uio(os, zp->z_id, uio,
				    dirbytes);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				error = dmu_read(os, zp->z_id, start + off,
				    bytes, (void *)(va + off));
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
			/* dmu_read() doesn't consume the uio; account manually. */
			if (error == 0)
				uio->uio_resid -= bytes;
		} else {
			/* Not resident: defer to a batched dmu_read_uio(). */
			dirbytes += bytes;
		}
		len -= bytes;
		off = 0;	/* only the first page can start mid-page */
		if (error)
			break;
	}
	VM_OBJECT_UNLOCK(obj);
	/* Flush the trailing run of non-resident bytes, if any. */
	if (error == 0 && dirbytes > 0)
		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
	return (error);
}

/* Maximum bytes handed to the DMU per read call.  Tunable. */
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 * IN:	vp	- vnode of file to be read from.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *	cr	- credentials of caller.
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os = zfsvfs->z_os;
	ssize_t		n, nbytes;
	int		error;
	rl_t		*rl;

	ZFS_ENTER(zfsvfs);

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC)
		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_phys->zp_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);

	/*
	 * Feed the data to the caller in zfs_read_chunk_size pieces,
	 * aligned so each chunk stays within one chunk-sized window.
	 */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		/* Prefer mmap'ed pages when the vnode has cached VM data. */
		if (vn_has_cached_data(vp))
			error = mappedread(vp, nbytes, uio);
		else
			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
		if (error)
			break;

		n -= nbytes;
	}

out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified.
 * Any error will exit this routine as this is only a best
 * attempt to get the pages resident. This is a copy of ufs_trans_touch().
 */
static void
zfs_prefault_write(ssize_t n, struct uio *uio)
{
	struct iovec *iov;
	ulong_t cnt, incr;
	caddr_t p;

	/* Only user-space buffers can fault; kernel buffers are resident. */
	if (uio->uio_segflg != UIO_USERSPACE)
		return;

	iov = uio->uio_iov;

	while (n) {
		cnt = MIN(iov->iov_len, n);
		if (cnt == 0) {
			/* empty iov entry */
			iov++;
			continue;
		}
		n -= cnt;
		/*
		 * touch each page in this segment.
		 */
		p = iov->iov_base;
		while (cnt) {
			/* fubyte() == -1 means the fetch faulted; give up. */
			if (fubyte(p) == -1)
				return;
			incr = MIN(cnt, PAGESIZE);
			p += incr;
			cnt -= incr;
		}
		/*
		 * touch the last byte in case it straddles a page.
		 */
		p--;
		if (fubyte(p) == -1)
			return;
		iov++;
	}
}

/*
 * Write the bytes to a file.
 *
 * IN:	vp	- vnode of file to be written to.
 *	uio	- structure supplying write location, range info,
 *		  and data buffer.
 *	ioflag	- IO_APPEND flag set if in append mode.
 *	cr	- credentials of caller.
 *
 * OUT:	uio	- updated offset and range.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = MAXOFFSET_T;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error;

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 */
	zfs_prefault_write(n, uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & IO_APPEND) {
		/*
		 * Range lock for a file append:
		 * The value for the start of range will be determined by
		 * zfs_range_lock() (to guarantee append semantics).
		 * If this write will cause the block size to increase,
		 * zfs_range_lock() will lock the entire file, so we must
		 * later reduce the range after we grow the block size.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		if (rl->r_len == UINT64_MAX) {
			/* overlocked, zp_size can't change */
			woff = uio->uio_loffset = zp->z_phys->zp_size;
		} else {
			woff = uio->uio_loffset = rl->r_off;
		}
	} else {
		woff = uio->uio_loffset;
		/*
		 * Validate file offset
		 */
		if (woff < 0) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * If we need to grow the block size then zfs_range_lock()
		 * will lock a wider range than we request here.
		 * Later after growing the block size we reduce the range.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	/* Clamp the write so it does not extend past the offset limit. */
	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	end_size = MAX(zp->z_phys->zp_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		/*
		 * Start a transaction.
		 */
		woff = uio->uio_loffset;
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			/* Big Rule 4: non-blocking assign; wait then retry. */
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				continue;
			}
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
		rw_enter(&zp->z_map_lock, RW_READER);

		/* Grow the VM pager's notion of the file size before writing. */
		if (woff + nbytes > zp->z_phys->zp_size)
			vnode_pager_setsize(vp, woff + nbytes);

		/* tx_bytes ends up as the number of bytes actually consumed. */
		tx_bytes = uio->uio_resid;
		if (vn_has_cached_data(vp)) {
			rw_exit(&zp->z_map_lock);
			error = mappedwrite(vp, nbytes, uio, tx);
		} else {
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
			    uio, nbytes, tx);
			rw_exit(&zp->z_map_lock);
		}
		tx_bytes -= uio->uio_resid;

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
		    zp->z_phys->zp_uid == 0) != 0) {
			zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
		}
		mutex_exit(&zp->z_acl_lock);

		/*
		 * Update time stamp.  NOTE: This marks the bonus buffer as
		 * dirty, so we don't have to do it again for zp_size.
		 */
		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
			    uio->uio_loffset);
		/* Big Rule 5: log before dropping locks/committing. */
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Synchronous semantics: force the ZIL out for O_SYNC/O_DSYNC. */
	if (ioflag & (FSYNC | FDSYNC))
		zil_commit(zilog, zp->z_last_itx, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * dmu_sync() completion callback for the indirect-write path of
 * zfs_get_data(): releases the dbuf, range lock, vnode hold, and the
 * zgd_t allocated there, and records the written block's vdev in the ZIL.
 */
void
zfs_get_done(dmu_buf_t *db, void *vzgd)
{
	zgd_t *zgd = (zgd_t *)vzgd;
	rl_t *rl = zgd->zgd_rl;
	vnode_t *vp = ZTOV(rl->r_zp);
	int vfslocked;

	vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
	dmu_buf_rele(db, vzgd);
	zfs_range_unlock(rl);
	VN_RELE(vp);
	zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
	kmem_free(zgd, sizeof (zgd_t));
	VFS_UNLOCK_GIANT(vfslocked);
}

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t off = lr->lr_offset;
	dmu_buf_t *db;
	rl_t *rl;
	zgd_t *zgd;
	int dlen = lr->lr_length;		/* length of user data */
	int error = 0;

	ASSERT(zio);
	ASSERT(dlen != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
		return (ENOENT);
	if (zp->z_unlinked) {
		VN_RELE(ZTOV(zp));
		return (ENOENT);
	}

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		rl = zfs_range_lock(zp, off, dlen, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
	} else { /* indirect write */
		uint64_t boff; /* block starting offset */

		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			if (ISP2(zp->z_blksz)) {
				boff = P2ALIGN_TYPED(off, zp->z_blksz,
				    uint64_t);
			} else {
				/* Non-power-of-2 blocksize: single block file. */
				boff = 0;
			}
			dlen = zp->z_blksz;
			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
			if (zp->z_blksz == dlen)
				break;
			/* Blocksize changed while we waited; retry. */
			zfs_range_unlock(rl);
		}
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
		zgd->zgd_rl = rl;
		zgd->zgd_zilog = zfsvfs->z_log;
		zgd->zgd_bp = &lr->lr_blkptr;
		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
		ASSERT(boff == db->db_offset);
		lr->lr_blkoff = off - boff;
		error = dmu_sync(zio, db, &lr->lr_blkptr,
		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
		ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz);
		if (error == 0) {
			zil_add_vdev(zfsvfs->z_log,
			    DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
		}
		/*
		 * If we get EINPROGRESS, then we need to wait for a
		 * write IO initiated by dmu_sync() to complete before
		 * we can release this dbuf.  We will finish everything
		 * up in the zfs_get_done() callback.
		 */
		if (error == EINPROGRESS)
			return (0);
		dmu_buf_rele(db, zgd);
		kmem_free(zgd, sizeof (zgd_t));
	}
out:
	zfs_range_unlock(rl);
	VN_RELE(ZTOV(zp));
	return (error);
}

/*
 * VOP_ACCESS: delegate the permission check to the ZFS ACL code.
 */
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	error = zfs_zaccess_rwx(zp, mode, cr);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 * IN:	dvp	- vnode of directory to search.
 *	nm	- name of entry to lookup.
 *	pnp	- full pathname to lookup [UNUSED].
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	rdir	- root directory vnode [UNUSED].
 *	cr	- credentials of caller.
 *
 * OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td)
{

	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error;

	ZFS_ENTER(zfsvfs);

	*vpp = NULL;

#ifdef TODO
	if (flags & LOOKUP_XATTR) {
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
			VN_RELE(*vpp);
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}
#endif	/* TODO */

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (ENOTDIR);
	}

	/*
	 * Check accessibility of directory.
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {

		/*
		 * Convert device special files
		 */
		if (IS_DEVVP(*vpp)) {
			vnode_t	*svp;

			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
			VN_RELE(*vpp);
			if (svp == NULL)
				error = ENOSYS;
			else
				*vpp = svp;
		}
	}

	ZFS_EXIT(zfsvfs);

	/*
	 * FreeBSD namei glue: translate the Solaris-style result into what
	 * the FreeBSD lookup protocol expects (EJUSTRETURN for create/rename
	 * of a missing last component, SAVENAME when the caller will need
	 * the component name afterwards).
	 */
	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}
	/*
	 * Lock the resulting vnode; for ".." drop the parent lock first to
	 * respect the child-before-parent lock order.
	 */
	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
		if (cnp->cn_flags & ISDOTDOT)
			VOP_UNLOCK(dvp, 0, td);
		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
		if (cnp->cn_flags & ISDOTDOT)
			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
	}

#ifdef FREEBSD_NAMECACHE
	/*
	 * Insert name into cache (as non-existent) if appropriate.
	 */
	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
		cache_enter(dvp, *vpp, cnp);
	/*
	 * Insert name into cache if appropriate.
	 */
	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}
#endif

	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 * IN:	dvp	- vnode of directory to put new file entry in.
 *	name	- name of new file entry.
 *	vap	- attributes of new file.
 *	excl	- flag indicating exclusive or non-exclusive mode.
 *	mode	- mode to open file with.
 *	cr	- credentials of caller.
 *	flag	- large file flag [UNUSED].
 *
 * OUT:	vpp	- vnode of created or trunc'd entry.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */
/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	objset_t	*os = zfsvfs->z_os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	uint64_t	zoid;

	ZFS_ENTER(zfsvfs);

top:
	*vpp = NULL;

	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
1184 */ 1185 VN_HOLD(dvp); 1186 zp = dzp; 1187 dl = NULL; 1188 error = 0; 1189 } else { 1190 /* possible VN_HOLD(zp) */ 1191 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) { 1192 if (strcmp(name, "..") == 0) 1193 error = EISDIR; 1194 ZFS_EXIT(zfsvfs); 1195 return (error); 1196 } 1197 } 1198 1199 zoid = zp ? zp->z_id : -1ULL; 1200 1201 if (zp == NULL) { 1202 /* 1203 * Create a new file object and update the directory 1204 * to reference it. 1205 */ 1206 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { 1207 goto out; 1208 } 1209 1210 /* 1211 * We only support the creation of regular files in 1212 * extended attribute directories. 1213 */ 1214 if ((dzp->z_phys->zp_flags & ZFS_XATTR) && 1215 (vap->va_type != VREG)) { 1216 error = EINVAL; 1217 goto out; 1218 } 1219 1220 tx = dmu_tx_create(os); 1221 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1222 dmu_tx_hold_bonus(tx, dzp->z_id); 1223 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 1224 if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) 1225 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1226 0, SPA_MAXBLOCKSIZE); 1227 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1228 if (error) { 1229 zfs_dirent_unlock(dl); 1230 if (error == ERESTART && 1231 zfsvfs->z_assign == TXG_NOWAIT) { 1232 dmu_tx_wait(tx); 1233 dmu_tx_abort(tx); 1234 goto top; 1235 } 1236 dmu_tx_abort(tx); 1237 ZFS_EXIT(zfsvfs); 1238 return (error); 1239 } 1240 zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); 1241 ASSERT(zp->z_id == zoid); 1242 (void) zfs_link_create(dl, zp, tx, ZNEW); 1243 zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name); 1244 dmu_tx_commit(tx); 1245 } else { 1246 /* 1247 * A directory entry already exists for this name. 1248 */ 1249 /* 1250 * Can't truncate an existing file if in exclusive mode. 1251 */ 1252 if (excl == EXCL) { 1253 error = EEXIST; 1254 goto out; 1255 } 1256 /* 1257 * Can't open a directory for writing. 
1258 */ 1259 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { 1260 error = EISDIR; 1261 goto out; 1262 } 1263 /* 1264 * Verify requested access to file. 1265 */ 1266 if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) { 1267 goto out; 1268 } 1269 1270 mutex_enter(&dzp->z_lock); 1271 dzp->z_seq++; 1272 mutex_exit(&dzp->z_lock); 1273 1274 /* 1275 * Truncate regular files if requested. 1276 */ 1277 if ((ZTOV(zp)->v_type == VREG) && 1278 (zp->z_phys->zp_size != 0) && 1279 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { 1280 error = zfs_freesp(zp, 0, 0, mode, TRUE); 1281 if (error == ERESTART && 1282 zfsvfs->z_assign == TXG_NOWAIT) { 1283 /* NB: we already did dmu_tx_wait() */ 1284 zfs_dirent_unlock(dl); 1285 VN_RELE(ZTOV(zp)); 1286 goto top; 1287 } 1288 } 1289 } 1290out: 1291 1292 if (error == 0) { 1293 *vpp = ZTOV(zp); 1294 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); 1295 } 1296 1297 if (dl) 1298 zfs_dirent_unlock(dl); 1299 1300 if (error) { 1301 if (zp) 1302 VN_RELE(ZTOV(zp)); 1303 } else { 1304 *vpp = ZTOV(zp); 1305 /* 1306 * If vnode is for a device return a specfs vnode instead. 1307 */ 1308 if (IS_DEVVP(*vpp)) { 1309 struct vnode *svp; 1310 1311 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1312 VN_RELE(*vpp); 1313 if (svp == NULL) { 1314 error = ENOSYS; 1315 } 1316 *vpp = svp; 1317 } 1318 } 1319 1320 ZFS_EXIT(zfsvfs); 1321 return (error); 1322} 1323 1324/* 1325 * Remove an entry from a directory. 1326 * 1327 * IN: dvp - vnode of directory to remove entry from. 1328 * name - name of entry to remove. 1329 * cr - credentials of caller. 
1330 * 1331 * RETURN: 0 if success 1332 * error code if failure 1333 * 1334 * Timestamps: 1335 * dvp - ctime|mtime 1336 * vp - ctime (if nlink > 0) 1337 */ 1338static int 1339zfs_remove(vnode_t *dvp, char *name, cred_t *cr) 1340{ 1341 znode_t *zp, *dzp = VTOZ(dvp); 1342 znode_t *xzp = NULL; 1343 vnode_t *vp; 1344 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1345 zilog_t *zilog = zfsvfs->z_log; 1346 uint64_t acl_obj, xattr_obj; 1347 zfs_dirlock_t *dl; 1348 dmu_tx_t *tx; 1349 boolean_t may_delete_now, delete_now = FALSE; 1350 boolean_t unlinked; 1351 int error; 1352 1353 ZFS_ENTER(zfsvfs); 1354 1355top: 1356 /* 1357 * Attempt to lock directory; fail if entry doesn't exist. 1358 */ 1359 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { 1360 ZFS_EXIT(zfsvfs); 1361 return (error); 1362 } 1363 1364 vp = ZTOV(zp); 1365 1366 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1367 goto out; 1368 } 1369 1370 /* 1371 * Need to use rmdir for removing directories. 1372 */ 1373 if (vp->v_type == VDIR) { 1374 error = EPERM; 1375 goto out; 1376 } 1377 1378 vnevent_remove(vp); 1379 1380 dnlc_remove(dvp, name); 1381 1382 may_delete_now = FALSE; 1383 1384 /* 1385 * We may delete the znode now, or we may put it in the unlinked set; 1386 * it depends on whether we're the last link, and on whether there are 1387 * other holds on the vnode. So we dmu_tx_hold() the right things to 1388 * allow for either case. 1389 */ 1390 tx = dmu_tx_create(zfsvfs->z_os); 1391 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1392 dmu_tx_hold_bonus(tx, zp->z_id); 1393 if (may_delete_now) 1394 dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); 1395 1396 /* are there any extended attributes? */ 1397 if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { 1398 /* XXX - do we need this if we are deleting? 
*/ 1399 dmu_tx_hold_bonus(tx, xattr_obj); 1400 } 1401 1402 /* are there any additional acls */ 1403 if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && 1404 may_delete_now) 1405 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 1406 1407 /* charge as an update -- would be nice not to charge at all */ 1408 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1409 1410 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1411 if (error) { 1412 zfs_dirent_unlock(dl); 1413 VN_RELE(vp); 1414 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1415 dmu_tx_wait(tx); 1416 dmu_tx_abort(tx); 1417 goto top; 1418 } 1419 dmu_tx_abort(tx); 1420 ZFS_EXIT(zfsvfs); 1421 return (error); 1422 } 1423 1424 /* 1425 * Remove the directory entry. 1426 */ 1427 error = zfs_link_destroy(dl, zp, tx, 0, &unlinked); 1428 1429 if (error) { 1430 dmu_tx_commit(tx); 1431 goto out; 1432 } 1433 1434 if (0 && unlinked) { 1435 VI_LOCK(vp); 1436 delete_now = may_delete_now && 1437 vp->v_count == 1 && !vn_has_cached_data(vp) && 1438 zp->z_phys->zp_xattr == xattr_obj && 1439 zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; 1440 VI_UNLOCK(vp); 1441 } 1442 1443 if (delete_now) { 1444 if (zp->z_phys->zp_xattr) { 1445 error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); 1446 ASSERT3U(error, ==, 0); 1447 ASSERT3U(xzp->z_phys->zp_links, ==, 2); 1448 dmu_buf_will_dirty(xzp->z_dbuf, tx); 1449 mutex_enter(&xzp->z_lock); 1450 xzp->z_unlinked = 1; 1451 xzp->z_phys->zp_links = 0; 1452 mutex_exit(&xzp->z_lock); 1453 zfs_unlinked_add(xzp, tx); 1454 zp->z_phys->zp_xattr = 0; /* probably unnecessary */ 1455 } 1456 mutex_enter(&zp->z_lock); 1457 VI_LOCK(vp); 1458 vp->v_count--; 1459 ASSERT3U(vp->v_count, ==, 0); 1460 VI_UNLOCK(vp); 1461 mutex_exit(&zp->z_lock); 1462 zfs_znode_delete(zp, tx); 1463 VFS_RELE(zfsvfs->z_vfs); 1464 } else if (unlinked) { 1465 zfs_unlinked_add(zp, tx); 1466 } 1467 1468 zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name); 1469 1470 dmu_tx_commit(tx); 1471out: 1472 zfs_dirent_unlock(dl); 
1473 1474 if (!delete_now) { 1475 VN_RELE(vp); 1476 } else if (xzp) { 1477 /* this rele delayed to prevent nesting transactions */ 1478 VN_RELE(ZTOV(xzp)); 1479 } 1480 1481 ZFS_EXIT(zfsvfs); 1482 return (error); 1483} 1484 1485/* 1486 * Create a new directory and insert it into dvp using the name 1487 * provided. Return a pointer to the inserted directory. 1488 * 1489 * IN: dvp - vnode of directory to add subdir to. 1490 * dirname - name of new directory. 1491 * vap - attributes of new directory. 1492 * cr - credentials of caller. 1493 * 1494 * OUT: vpp - vnode of created directory. 1495 * 1496 * RETURN: 0 if success 1497 * error code if failure 1498 * 1499 * Timestamps: 1500 * dvp - ctime|mtime updated 1501 * vp - ctime|mtime|atime updated 1502 */ 1503static int 1504zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) 1505{ 1506 znode_t *zp, *dzp = VTOZ(dvp); 1507 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1508 zilog_t *zilog = zfsvfs->z_log; 1509 zfs_dirlock_t *dl; 1510 uint64_t zoid = 0; 1511 dmu_tx_t *tx; 1512 int error; 1513 1514 ASSERT(vap->va_type == VDIR); 1515 1516 ZFS_ENTER(zfsvfs); 1517 1518 if (dzp->z_phys->zp_flags & ZFS_XATTR) { 1519 ZFS_EXIT(zfsvfs); 1520 return (EINVAL); 1521 } 1522top: 1523 *vpp = NULL; 1524 1525 /* 1526 * First make sure the new directory doesn't exist. 1527 */ 1528 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) { 1529 ZFS_EXIT(zfsvfs); 1530 return (error); 1531 } 1532 1533 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) { 1534 zfs_dirent_unlock(dl); 1535 ZFS_EXIT(zfsvfs); 1536 return (error); 1537 } 1538 1539 /* 1540 * Add a new entry to the directory. 
1541 */ 1542 tx = dmu_tx_create(zfsvfs->z_os); 1543 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1544 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1545 if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) 1546 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1547 0, SPA_MAXBLOCKSIZE); 1548 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1549 if (error) { 1550 zfs_dirent_unlock(dl); 1551 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1552 dmu_tx_wait(tx); 1553 dmu_tx_abort(tx); 1554 goto top; 1555 } 1556 dmu_tx_abort(tx); 1557 ZFS_EXIT(zfsvfs); 1558 return (error); 1559 } 1560 1561 /* 1562 * Create new node. 1563 */ 1564 zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); 1565 1566 /* 1567 * Now put new name in parent dir. 1568 */ 1569 (void) zfs_link_create(dl, zp, tx, ZNEW); 1570 1571 *vpp = ZTOV(zp); 1572 1573 zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname); 1574 dmu_tx_commit(tx); 1575 1576 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); 1577 1578 zfs_dirent_unlock(dl); 1579 1580 ZFS_EXIT(zfsvfs); 1581 return (0); 1582} 1583 1584/* 1585 * Remove a directory subdir entry. If the current working 1586 * directory is the same as the subdir to be removed, the 1587 * remove will fail. 1588 * 1589 * IN: dvp - vnode of directory to remove from. 1590 * name - name of directory to be removed. 1591 * cwd - vnode of current working directory. 1592 * cr - credentials of caller. 1593 * 1594 * RETURN: 0 if success 1595 * error code if failure 1596 * 1597 * Timestamps: 1598 * dvp - ctime|mtime updated 1599 */ 1600static int 1601zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) 1602{ 1603 znode_t *dzp = VTOZ(dvp); 1604 znode_t *zp; 1605 vnode_t *vp; 1606 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1607 zilog_t *zilog = zfsvfs->z_log; 1608 zfs_dirlock_t *dl; 1609 dmu_tx_t *tx; 1610 int error; 1611 1612 ZFS_ENTER(zfsvfs); 1613 1614top: 1615 zp = NULL; 1616 1617 /* 1618 * Attempt to lock directory; fail if entry doesn't exist. 
1619 */ 1620 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { 1621 ZFS_EXIT(zfsvfs); 1622 return (error); 1623 } 1624 1625 vp = ZTOV(zp); 1626 1627 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1628 goto out; 1629 } 1630 1631 if (vp->v_type != VDIR) { 1632 error = ENOTDIR; 1633 goto out; 1634 } 1635 1636 if (vp == cwd) { 1637 error = EINVAL; 1638 goto out; 1639 } 1640 1641 vnevent_rmdir(vp); 1642 1643 /* 1644 * Grab a lock on the directory to make sure that noone is 1645 * trying to add (or lookup) entries while we are removing it. 1646 */ 1647 rw_enter(&zp->z_name_lock, RW_WRITER); 1648 1649 /* 1650 * Grab a lock on the parent pointer to make sure we play well 1651 * with the treewalk and directory rename code. 1652 */ 1653 rw_enter(&zp->z_parent_lock, RW_WRITER); 1654 1655 tx = dmu_tx_create(zfsvfs->z_os); 1656 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1657 dmu_tx_hold_bonus(tx, zp->z_id); 1658 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1659 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1660 if (error) { 1661 rw_exit(&zp->z_parent_lock); 1662 rw_exit(&zp->z_name_lock); 1663 zfs_dirent_unlock(dl); 1664 VN_RELE(vp); 1665 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1666 dmu_tx_wait(tx); 1667 dmu_tx_abort(tx); 1668 goto top; 1669 } 1670 dmu_tx_abort(tx); 1671 ZFS_EXIT(zfsvfs); 1672 return (error); 1673 } 1674 1675#ifdef FREEBSD_NAMECACHE 1676 cache_purge(dvp); 1677#endif 1678 1679 error = zfs_link_destroy(dl, zp, tx, 0, NULL); 1680 1681 if (error == 0) 1682 zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name); 1683 1684 dmu_tx_commit(tx); 1685 1686 rw_exit(&zp->z_parent_lock); 1687 rw_exit(&zp->z_name_lock); 1688#ifdef FREEBSD_NAMECACHE 1689 cache_purge(vp); 1690#endif 1691out: 1692 zfs_dirent_unlock(dl); 1693 1694 VN_RELE(vp); 1695 1696 ZFS_EXIT(zfsvfs); 1697 return (error); 1698} 1699 1700/* 1701 * Read as many directory entries as will fit into the provided 1702 * buffer from the given directory cursor position (specified in 
 * the uio structure.
 *
 * IN:	vp	- vnode of directory to read.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	cr	- credentials of caller.
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *	eofp	- set to true if end-of-file detected.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	uint8_t		type;
	int		ncooks;
	u_long		*cooks = NULL;

	ZFS_ENTER(zfsvfs);

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		/* Can't fill the caller's buffer directly; bounce buffer. */
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		odp = (struct dirent64 *)iovp->iov_base;
	}

	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 */
		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
		*cookies = cooks;
		*ncookies = ncooks;
	}

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			objnum = zp->z_phys->zp_parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = ENXIO;
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
		}
		reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = EINVAL;
				goto update;
			}
			break;
		}
		/*
		 * Add this entry:
		 */
		odp->d_ino = objnum;
		odp->d_reclen = reclen;
		odp->d_namlen = strlen(zap.za_name);
		(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
		odp->d_type = type;
		outcount += reclen;
		odp = (dirent64_t *)((intptr_t)odp + reclen);

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0);

		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		/* One cookie (the next offset) per emitted entry. */
		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	/* On failure the cookie array is useless; free and zero it. */
	if (error != 0 && cookies != NULL) {
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}

/*
 * Push any intent-log records for this znode out to stable storage;
 * syncflag and cr are unused here, zil_commit() does all the work.
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 * IN:	vp	- vnode of file.
 *	vap	- va_mask identifies requested attributes.
 *	flags	- [UNUSED]
 *	cr	- credentials of caller.
 *
 * OUT:	vap	- attribute values.
 *
 * RETURN:	0 (always succeeds)
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_phys_t *pzp = zp->z_phys;
	uint32_t blksize;
	u_longlong_t nblocks;
	int	error;

	ZFS_ENTER(zfsvfs);

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */
	mutex_enter(&zp->z_lock);

	vap->va_type = IFTOVT(pzp->zp_mode);
	vap->va_mode = pzp->zp_mode & ~S_IFMT;
	vap->va_uid = zp->z_phys->zp_uid;
	vap->va_gid = zp->z_phys->zp_gid;
	vap->va_nodeid = zp->z_id;
	vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX);	/* nlink_t limit! */
	vap->va_size = pzp->zp_size;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */

	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) &&
	    (zp->z_phys->zp_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
			mutex_exit(&zp->z_lock);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	mutex_exit(&zp->z_lock);

	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 * IN:	vp	- vnode of file to be modified.
 *	vap	- new attribute values.
 *	flags	- ATTR_UTIME set if non-default time values provided.
 *	cr	- credentials of caller.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime updated, mtime updated if size changed.
2058 */ 2059/* ARGSUSED */ 2060static int 2061zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2062 caller_context_t *ct) 2063{ 2064 struct znode *zp = VTOZ(vp); 2065 znode_phys_t *pzp = zp->z_phys; 2066 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2067 zilog_t *zilog = zfsvfs->z_log; 2068 dmu_tx_t *tx; 2069 vattr_t oldva; 2070 uint_t mask = vap->va_mask; 2071 uint_t saved_mask; 2072 int trim_mask = 0; 2073 uint64_t new_mode; 2074 znode_t *attrzp; 2075 int need_policy = FALSE; 2076 int err; 2077 2078 if (mask == 0) 2079 return (0); 2080 2081 if (mask & AT_NOSET) 2082 return (EINVAL); 2083 2084 if (mask & AT_SIZE && vp->v_type == VDIR) 2085 return (EISDIR); 2086 2087 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) 2088 return (EINVAL); 2089 2090 ZFS_ENTER(zfsvfs); 2091 2092top: 2093 attrzp = NULL; 2094 2095 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 2096 ZFS_EXIT(zfsvfs); 2097 return (EROFS); 2098 } 2099 2100 /* 2101 * First validate permissions 2102 */ 2103 2104 if (mask & AT_SIZE) { 2105 err = zfs_zaccess(zp, ACE_WRITE_DATA, cr); 2106 if (err) { 2107 ZFS_EXIT(zfsvfs); 2108 return (err); 2109 } 2110 /* 2111 * XXX - Note, we are not providing any open 2112 * mode flags here (like FNDELAY), so we may 2113 * block if there are locks present... this 2114 * should be addressed in openat(). 2115 */ 2116 do { 2117 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2118 /* NB: we already did dmu_tx_wait() if necessary */ 2119 } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); 2120 if (err) { 2121 ZFS_EXIT(zfsvfs); 2122 return (err); 2123 } 2124 } 2125 2126 if (mask & (AT_ATIME|AT_MTIME)) 2127 need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr); 2128 2129 if (mask & (AT_UID|AT_GID)) { 2130 int idmask = (mask & (AT_UID|AT_GID)); 2131 int take_owner; 2132 int take_group; 2133 2134 /* 2135 * NOTE: even if a new mode is being set, 2136 * we may clear S_ISUID/S_ISGID bits. 
2137 */ 2138 2139 if (!(mask & AT_MODE)) 2140 vap->va_mode = pzp->zp_mode; 2141 2142 /* 2143 * Take ownership or chgrp to group we are a member of 2144 */ 2145 2146 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 2147 take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr); 2148 2149 /* 2150 * If both AT_UID and AT_GID are set then take_owner and 2151 * take_group must both be set in order to allow taking 2152 * ownership. 2153 * 2154 * Otherwise, send the check through secpolicy_vnode_setattr() 2155 * 2156 */ 2157 2158 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 2159 ((idmask == AT_UID) && take_owner) || 2160 ((idmask == AT_GID) && take_group)) { 2161 if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) { 2162 /* 2163 * Remove setuid/setgid for non-privileged users 2164 */ 2165 secpolicy_setid_clear(vap, cr); 2166 trim_mask = (mask & (AT_UID|AT_GID)); 2167 } else { 2168 need_policy = TRUE; 2169 } 2170 } else { 2171 need_policy = TRUE; 2172 } 2173 } 2174 2175 mutex_enter(&zp->z_lock); 2176 oldva.va_mode = pzp->zp_mode; 2177 oldva.va_uid = zp->z_phys->zp_uid; 2178 oldva.va_gid = zp->z_phys->zp_gid; 2179 mutex_exit(&zp->z_lock); 2180 2181 if (mask & AT_MODE) { 2182 if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) { 2183 err = secpolicy_setid_setsticky_clear(vp, vap, 2184 &oldva, cr); 2185 if (err) { 2186 ZFS_EXIT(zfsvfs); 2187 return (err); 2188 } 2189 trim_mask |= AT_MODE; 2190 } else { 2191 need_policy = TRUE; 2192 } 2193 } 2194 2195 if (need_policy) { 2196 /* 2197 * If trim_mask is set then take ownership 2198 * has been granted or write_acl is present and user 2199 * has the ability to modify mode. In that case remove 2200 * UID|GID and or MODE from mask so that 2201 * secpolicy_vnode_setattr() doesn't revoke it. 
2202 */ 2203 2204 if (trim_mask) { 2205 saved_mask = vap->va_mask; 2206 vap->va_mask &= ~trim_mask; 2207 2208 } 2209 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 2210 (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp); 2211 if (err) { 2212 ZFS_EXIT(zfsvfs); 2213 return (err); 2214 } 2215 2216 if (trim_mask) 2217 vap->va_mask |= saved_mask; 2218 } 2219 2220 /* 2221 * secpolicy_vnode_setattr, or take ownership may have 2222 * changed va_mask 2223 */ 2224 mask = vap->va_mask; 2225 2226 tx = dmu_tx_create(zfsvfs->z_os); 2227 dmu_tx_hold_bonus(tx, zp->z_id); 2228 2229 if (mask & AT_MODE) { 2230 uint64_t pmode = pzp->zp_mode; 2231 2232 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 2233 2234 if (zp->z_phys->zp_acl.z_acl_extern_obj) 2235 dmu_tx_hold_write(tx, 2236 pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE); 2237 else 2238 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2239 0, ZFS_ACL_SIZE(MAX_ACL_SIZE)); 2240 } 2241 2242 if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) { 2243 err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp); 2244 if (err) { 2245 dmu_tx_abort(tx); 2246 ZFS_EXIT(zfsvfs); 2247 return (err); 2248 } 2249 dmu_tx_hold_bonus(tx, attrzp->z_id); 2250 } 2251 2252 err = dmu_tx_assign(tx, zfsvfs->z_assign); 2253 if (err) { 2254 if (attrzp) 2255 VN_RELE(ZTOV(attrzp)); 2256 if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 2257 dmu_tx_wait(tx); 2258 dmu_tx_abort(tx); 2259 goto top; 2260 } 2261 dmu_tx_abort(tx); 2262 ZFS_EXIT(zfsvfs); 2263 return (err); 2264 } 2265 2266 dmu_buf_will_dirty(zp->z_dbuf, tx); 2267 2268 /* 2269 * Set each attribute requested. 2270 * We group settings according to the locks they need to acquire. 2271 * 2272 * Note: you cannot set ctime directly, although it will be 2273 * updated as a side-effect of calling this function. 
2274 */ 2275 2276 mutex_enter(&zp->z_lock); 2277 2278 if (mask & AT_MODE) { 2279 err = zfs_acl_chmod_setattr(zp, new_mode, tx); 2280 ASSERT3U(err, ==, 0); 2281 } 2282 2283 if (attrzp) 2284 mutex_enter(&attrzp->z_lock); 2285 2286 if (mask & AT_UID) { 2287 zp->z_phys->zp_uid = (uint64_t)vap->va_uid; 2288 if (attrzp) { 2289 attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid; 2290 } 2291 } 2292 2293 if (mask & AT_GID) { 2294 zp->z_phys->zp_gid = (uint64_t)vap->va_gid; 2295 if (attrzp) 2296 attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid; 2297 } 2298 2299 if (attrzp) 2300 mutex_exit(&attrzp->z_lock); 2301 2302 if (mask & AT_ATIME) 2303 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); 2304 2305 if (mask & AT_MTIME) 2306 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); 2307 2308 if (mask & AT_SIZE) 2309 zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); 2310 else if (mask != 0) 2311 zfs_time_stamper_locked(zp, STATE_CHANGED, tx); 2312 2313 if (mask != 0) 2314 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask); 2315 2316 mutex_exit(&zp->z_lock); 2317 2318 if (attrzp) 2319 VN_RELE(ZTOV(attrzp)); 2320 2321 dmu_tx_commit(tx); 2322 2323 ZFS_EXIT(zfsvfs); 2324 return (err); 2325} 2326 2327typedef struct zfs_zlock { 2328 krwlock_t *zl_rwlock; /* lock we acquired */ 2329 znode_t *zl_znode; /* znode we held */ 2330 struct zfs_zlock *zl_next; /* next in list */ 2331} zfs_zlock_t; 2332 2333/* 2334 * Drop locks and release vnodes that were held by zfs_rename_lock(). 2335 */ 2336static void 2337zfs_rename_unlock(zfs_zlock_t **zlpp) 2338{ 2339 zfs_zlock_t *zl; 2340 2341 while ((zl = *zlpp) != NULL) { 2342 if (zl->zl_znode != NULL) 2343 VN_RELE(ZTOV(zl->zl_znode)); 2344 rw_exit(zl->zl_rwlock); 2345 *zlpp = zl->zl_next; 2346 kmem_free(zl, sizeof (*zl)); 2347 } 2348} 2349 2350/* 2351 * Search back through the directory tree, using the ".." entries. 2352 * Lock each directory in the chain to prevent concurrent renames. 
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	*oidp = &zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 *
				 * NOTE(review): on this path rw == RW_READER,
				 * so at least one iteration has run and zl
				 * points at the current list head (*zlpp) —
				 * unlocking via &zl is deliberate here.
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = &zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Record the lock (and, later, the held znode) for cleanup. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (*oidp == szp->z_id)		/* We're a descendant of szp */
			return (EINVAL);

		if (*oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
			if (error)
				return (error);
			zl->zl_znode = zp;
		}
		oidp = &zp->z_phys->zp_parent;
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
2426 * 2427 * IN: sdvp - Source directory containing the "old entry". 2428 * snm - Old entry name. 2429 * tdvp - Target directory to contain the "new entry". 2430 * tnm - New entry name. 2431 * cr - credentials of caller. 2432 * 2433 * RETURN: 0 if success 2434 * error code if failure 2435 * 2436 * Timestamps: 2437 * sdvp,tdvp - ctime|mtime updated 2438 */ 2439static int 2440zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr) 2441{ 2442 znode_t *tdzp, *szp, *tzp; 2443 znode_t *sdzp = VTOZ(sdvp); 2444 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; 2445 zilog_t *zilog = zfsvfs->z_log; 2446 vnode_t *realvp; 2447 zfs_dirlock_t *sdl, *tdl; 2448 dmu_tx_t *tx; 2449 zfs_zlock_t *zl; 2450 int cmp, serr, terr, error; 2451 2452 ZFS_ENTER(zfsvfs); 2453 2454 /* 2455 * Make sure we have the real vp for the target directory. 2456 */ 2457 if (VOP_REALVP(tdvp, &realvp) == 0) 2458 tdvp = realvp; 2459 2460 if (tdvp->v_vfsp != sdvp->v_vfsp) { 2461 ZFS_EXIT(zfsvfs); 2462 return (EXDEV); 2463 } 2464 2465 tdzp = VTOZ(tdvp); 2466top: 2467 szp = NULL; 2468 tzp = NULL; 2469 zl = NULL; 2470 2471 /* 2472 * This is to prevent the creation of links into attribute space 2473 * by renaming a linked file into/outof an attribute directory. 2474 * See the comment in zfs_link() for why this is considered bad. 2475 */ 2476 if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != 2477 (sdzp->z_phys->zp_flags & ZFS_XATTR)) { 2478 ZFS_EXIT(zfsvfs); 2479 return (EINVAL); 2480 } 2481 2482 /* 2483 * Lock source and target directory entries. To prevent deadlock, 2484 * a lock ordering must be defined. We lock the directory with 2485 * the smallest object id first, or if it's a tie, the one with 2486 * the lexically first name. 
2487 */ 2488 if (sdzp->z_id < tdzp->z_id) { 2489 cmp = -1; 2490 } else if (sdzp->z_id > tdzp->z_id) { 2491 cmp = 1; 2492 } else { 2493 cmp = strcmp(snm, tnm); 2494 if (cmp == 0) { 2495 /* 2496 * POSIX: "If the old argument and the new argument 2497 * both refer to links to the same existing file, 2498 * the rename() function shall return successfully 2499 * and perform no other action." 2500 */ 2501 ZFS_EXIT(zfsvfs); 2502 return (0); 2503 } 2504 } 2505 if (cmp < 0) { 2506 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); 2507 terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); 2508 } else { 2509 terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); 2510 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); 2511 } 2512 2513 if (serr) { 2514 /* 2515 * Source entry invalid or not there. 2516 */ 2517 if (!terr) { 2518 zfs_dirent_unlock(tdl); 2519 if (tzp) 2520 VN_RELE(ZTOV(tzp)); 2521 } 2522 if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) 2523 serr = EINVAL; 2524 ZFS_EXIT(zfsvfs); 2525 return (serr); 2526 } 2527 if (terr) { 2528 zfs_dirent_unlock(sdl); 2529 VN_RELE(ZTOV(szp)); 2530 if (strcmp(tnm, "..") == 0) 2531 terr = EINVAL; 2532 ZFS_EXIT(zfsvfs); 2533 return (terr); 2534 } 2535 2536 /* 2537 * Must have write access at the source to remove the old entry 2538 * and write access at the target to create the new entry. 2539 * Note that if target and source are the same, this can be 2540 * done in a single check. 2541 */ 2542 2543 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 2544 goto out; 2545 2546 if (ZTOV(szp)->v_type == VDIR) { 2547 /* 2548 * Check to make sure rename is valid. 2549 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 2550 */ 2551 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) 2552 goto out; 2553 } 2554 2555 /* 2556 * Does target exist? 2557 */ 2558 if (tzp) { 2559 /* 2560 * Source and target must be the same type. 
2561 */ 2562 if (ZTOV(szp)->v_type == VDIR) { 2563 if (ZTOV(tzp)->v_type != VDIR) { 2564 error = ENOTDIR; 2565 goto out; 2566 } 2567 } else { 2568 if (ZTOV(tzp)->v_type == VDIR) { 2569 error = EISDIR; 2570 goto out; 2571 } 2572 } 2573 /* 2574 * POSIX dictates that when the source and target 2575 * entries refer to the same file object, rename 2576 * must do nothing and exit without error. 2577 */ 2578 if (szp->z_id == tzp->z_id) { 2579 error = 0; 2580 goto out; 2581 } 2582 } 2583 2584 vnevent_rename_src(ZTOV(szp)); 2585 if (tzp) 2586 vnevent_rename_dest(ZTOV(tzp)); 2587 2588 tx = dmu_tx_create(zfsvfs->z_os); 2589 dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ 2590 dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ 2591 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 2592 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 2593 if (sdzp != tdzp) 2594 dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ 2595 if (tzp) 2596 dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ 2597 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 2598 error = dmu_tx_assign(tx, zfsvfs->z_assign); 2599 if (error) { 2600 if (zl != NULL) 2601 zfs_rename_unlock(&zl); 2602 zfs_dirent_unlock(sdl); 2603 zfs_dirent_unlock(tdl); 2604 VN_RELE(ZTOV(szp)); 2605 if (tzp) 2606 VN_RELE(ZTOV(tzp)); 2607 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 2608 dmu_tx_wait(tx); 2609 dmu_tx_abort(tx); 2610 goto top; 2611 } 2612 dmu_tx_abort(tx); 2613 ZFS_EXIT(zfsvfs); 2614 return (error); 2615 } 2616 2617 if (tzp) /* Attempt to remove the existing target */ 2618 error = zfs_link_destroy(tdl, tzp, tx, 0, NULL); 2619 2620 if (error == 0) { 2621 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 2622 if (error == 0) { 2623 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 2624 ASSERT(error == 0); 2625 zfs_log_rename(zilog, tx, TX_RENAME, sdzp, 2626 sdl->dl_name, tdzp, tdl->dl_name, szp); 2627 } 2628#ifdef FREEBSD_NAMECACHE 2629 if (error == 0) { 2630 cache_purge(sdvp); 2631 
cache_purge(tdvp); 2632 } 2633#endif 2634 } 2635 2636 dmu_tx_commit(tx); 2637out: 2638 if (zl != NULL) 2639 zfs_rename_unlock(&zl); 2640 2641 zfs_dirent_unlock(sdl); 2642 zfs_dirent_unlock(tdl); 2643 2644 VN_RELE(ZTOV(szp)); 2645 if (tzp) 2646 VN_RELE(ZTOV(tzp)); 2647 2648 ZFS_EXIT(zfsvfs); 2649 2650 return (error); 2651} 2652 2653/* 2654 * Insert the indicated symbolic reference entry into the directory. 2655 * 2656 * IN: dvp - Directory to contain new symbolic link. 2657 * link - Name for new symlink entry. 2658 * vap - Attributes of new entry. 2659 * target - Target path of new symlink. 2660 * cr - credentials of caller. 2661 * 2662 * RETURN: 0 if success 2663 * error code if failure 2664 * 2665 * Timestamps: 2666 * dvp - ctime|mtime updated 2667 */ 2668static int 2669zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) 2670{ 2671 znode_t *zp, *dzp = VTOZ(dvp); 2672 zfs_dirlock_t *dl; 2673 dmu_tx_t *tx; 2674 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2675 zilog_t *zilog = zfsvfs->z_log; 2676 uint64_t zoid; 2677 int len = strlen(link); 2678 int error; 2679 2680 ASSERT(vap->va_type == VLNK); 2681 2682 ZFS_ENTER(zfsvfs); 2683top: 2684 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { 2685 ZFS_EXIT(zfsvfs); 2686 return (error); 2687 } 2688 2689 if (len > MAXPATHLEN) { 2690 ZFS_EXIT(zfsvfs); 2691 return (ENAMETOOLONG); 2692 } 2693 2694 /* 2695 * Attempt to lock directory; fail if entry already exists. 
2696 */ 2697 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) { 2698 ZFS_EXIT(zfsvfs); 2699 return (error); 2700 } 2701 2702 tx = dmu_tx_create(zfsvfs->z_os); 2703 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 2704 dmu_tx_hold_bonus(tx, dzp->z_id); 2705 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 2706 if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) 2707 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); 2708 error = dmu_tx_assign(tx, zfsvfs->z_assign); 2709 if (error) { 2710 zfs_dirent_unlock(dl); 2711 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 2712 dmu_tx_wait(tx); 2713 dmu_tx_abort(tx); 2714 goto top; 2715 } 2716 dmu_tx_abort(tx); 2717 ZFS_EXIT(zfsvfs); 2718 return (error); 2719 } 2720 2721 dmu_buf_will_dirty(dzp->z_dbuf, tx); 2722 2723 /* 2724 * Create a new object for the symlink. 2725 * Put the link content into bonus buffer if it will fit; 2726 * otherwise, store it just like any other file data. 2727 */ 2728 zoid = 0; 2729 if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { 2730 zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len); 2731 if (len != 0) 2732 bcopy(link, zp->z_phys + 1, len); 2733 } else { 2734 dmu_buf_t *dbp; 2735 2736 zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); 2737 2738 /* 2739 * Nothing can access the znode yet so no locking needed 2740 * for growing the znode's blocksize. 2741 */ 2742 zfs_grow_blocksize(zp, len, tx); 2743 2744 VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp)); 2745 dmu_buf_will_dirty(dbp, tx); 2746 2747 ASSERT3U(len, <=, dbp->db_size); 2748 bcopy(link, dbp->db_data, len); 2749 dmu_buf_rele(dbp, FTAG); 2750 } 2751 zp->z_phys->zp_size = len; 2752 2753 /* 2754 * Insert the new object into the directory. 
2755 */ 2756 (void) zfs_link_create(dl, zp, tx, ZNEW); 2757out: 2758 if (error == 0) { 2759 zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link); 2760 *vpp = ZTOV(zp); 2761 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); 2762 } 2763 2764 dmu_tx_commit(tx); 2765 2766 zfs_dirent_unlock(dl); 2767 2768 ZFS_EXIT(zfsvfs); 2769 return (error); 2770} 2771 2772/* 2773 * Return, in the buffer contained in the provided uio structure, 2774 * the symbolic path referred to by vp. 2775 * 2776 * IN: vp - vnode of symbolic link. 2777 * uoip - structure to contain the link path. 2778 * cr - credentials of caller. 2779 * 2780 * OUT: uio - structure to contain the link path. 2781 * 2782 * RETURN: 0 if success 2783 * error code if failure 2784 * 2785 * Timestamps: 2786 * vp - atime updated 2787 */ 2788/* ARGSUSED */ 2789static int 2790zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) 2791{ 2792 znode_t *zp = VTOZ(vp); 2793 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2794 size_t bufsz; 2795 int error; 2796 2797 ZFS_ENTER(zfsvfs); 2798 2799 bufsz = (size_t)zp->z_phys->zp_size; 2800 if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { 2801 error = uiomove(zp->z_phys + 1, 2802 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 2803 } else { 2804 dmu_buf_t *dbp; 2805 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); 2806 if (error) { 2807 ZFS_EXIT(zfsvfs); 2808 return (error); 2809 } 2810 error = uiomove(dbp->db_data, 2811 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 2812 dmu_buf_rele(dbp, FTAG); 2813 } 2814 2815 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2816 ZFS_EXIT(zfsvfs); 2817 return (error); 2818} 2819 2820/* 2821 * Insert a new entry into directory tdvp referencing svp. 2822 * 2823 * IN: tdvp - Directory to contain new entry. 2824 * svp - vnode of new entry. 2825 * name - name of new entry. 2826 * cr - credentials of caller. 
2827 * 2828 * RETURN: 0 if success 2829 * error code if failure 2830 * 2831 * Timestamps: 2832 * tdvp - ctime|mtime updated 2833 * svp - ctime updated 2834 */ 2835/* ARGSUSED */ 2836static int 2837zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr) 2838{ 2839 znode_t *dzp = VTOZ(tdvp); 2840 znode_t *tzp, *szp; 2841 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2842 zilog_t *zilog = zfsvfs->z_log; 2843 zfs_dirlock_t *dl; 2844 dmu_tx_t *tx; 2845 vnode_t *realvp; 2846 int error; 2847 2848 ASSERT(tdvp->v_type == VDIR); 2849 2850 ZFS_ENTER(zfsvfs); 2851 2852 if (VOP_REALVP(svp, &realvp) == 0) 2853 svp = realvp; 2854 2855 if (svp->v_vfsp != tdvp->v_vfsp) { 2856 ZFS_EXIT(zfsvfs); 2857 return (EXDEV); 2858 } 2859 2860 szp = VTOZ(svp); 2861top: 2862 /* 2863 * We do not support links between attributes and non-attributes 2864 * because of the potential security risk of creating links 2865 * into "normal" file space in order to circumvent restrictions 2866 * imposed in attribute space. 2867 */ 2868 if ((szp->z_phys->zp_flags & ZFS_XATTR) != 2869 (dzp->z_phys->zp_flags & ZFS_XATTR)) { 2870 ZFS_EXIT(zfsvfs); 2871 return (EINVAL); 2872 } 2873 2874 /* 2875 * POSIX dictates that we return EPERM here. 2876 * Better choices include ENOTSUP or EISDIR. 2877 */ 2878 if (svp->v_type == VDIR) { 2879 ZFS_EXIT(zfsvfs); 2880 return (EPERM); 2881 } 2882 2883 if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) && 2884 secpolicy_basic_link(cr) != 0) { 2885 ZFS_EXIT(zfsvfs); 2886 return (EPERM); 2887 } 2888 2889 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { 2890 ZFS_EXIT(zfsvfs); 2891 return (error); 2892 } 2893 2894 /* 2895 * Attempt to lock directory; fail if entry already exists. 
2896 */ 2897 if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) { 2898 ZFS_EXIT(zfsvfs); 2899 return (error); 2900 } 2901 2902 tx = dmu_tx_create(zfsvfs->z_os); 2903 dmu_tx_hold_bonus(tx, szp->z_id); 2904 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 2905 error = dmu_tx_assign(tx, zfsvfs->z_assign); 2906 if (error) { 2907 zfs_dirent_unlock(dl); 2908 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 2909 dmu_tx_wait(tx); 2910 dmu_tx_abort(tx); 2911 goto top; 2912 } 2913 dmu_tx_abort(tx); 2914 ZFS_EXIT(zfsvfs); 2915 return (error); 2916 } 2917 2918 error = zfs_link_create(dl, szp, tx, 0); 2919 2920 if (error == 0) 2921 zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name); 2922 2923 dmu_tx_commit(tx); 2924 2925 zfs_dirent_unlock(dl); 2926 2927 ZFS_EXIT(zfsvfs); 2928 return (error); 2929} 2930 2931void 2932zfs_inactive(vnode_t *vp, cred_t *cr) 2933{ 2934 znode_t *zp = VTOZ(vp); 2935 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2936 int error; 2937 2938 rw_enter(&zfsvfs->z_um_lock, RW_READER); 2939 if (zfsvfs->z_unmounted2) { 2940 ASSERT(zp->z_dbuf_held == 0); 2941 2942 mutex_enter(&zp->z_lock); 2943 VI_LOCK(vp); 2944 vp->v_count = 0; /* count arrives as 1 */ 2945 VI_UNLOCK(vp); 2946 if (zp->z_dbuf == NULL) { 2947 mutex_exit(&zp->z_lock); 2948 zfs_znode_free(zp); 2949 } else { 2950 mutex_exit(&zp->z_lock); 2951 } 2952 rw_exit(&zfsvfs->z_um_lock); 2953 VFS_RELE(zfsvfs->z_vfs); 2954 return; 2955 } 2956 2957 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 2958 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 2959 2960 dmu_tx_hold_bonus(tx, zp->z_id); 2961 error = dmu_tx_assign(tx, TXG_WAIT); 2962 if (error) { 2963 dmu_tx_abort(tx); 2964 } else { 2965 dmu_buf_will_dirty(zp->z_dbuf, tx); 2966 mutex_enter(&zp->z_lock); 2967 zp->z_atime_dirty = 0; 2968 mutex_exit(&zp->z_lock); 2969 dmu_tx_commit(tx); 2970 } 2971 } 2972 2973 zfs_zinactive(zp); 2974 rw_exit(&zfsvfs->z_um_lock); 2975} 2976 2977CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); 2978CTASSERT(sizeof(struct 
zfid_long) <= sizeof(struct fid)); 2979 2980static int 2981zfs_fid(vnode_t *vp, fid_t *fidp) 2982{ 2983 znode_t *zp = VTOZ(vp); 2984 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2985 uint32_t gen = (uint32_t)zp->z_phys->zp_gen; 2986 uint64_t object = zp->z_id; 2987 zfid_short_t *zfid; 2988 int size, i; 2989 2990 ZFS_ENTER(zfsvfs); 2991 2992 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; 2993 fidp->fid_len = size; 2994 2995 zfid = (zfid_short_t *)fidp; 2996 2997 zfid->zf_len = size; 2998 2999 for (i = 0; i < sizeof (zfid->zf_object); i++) 3000 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 3001 3002 /* Must have a non-zero generation number to distinguish from .zfs */ 3003 if (gen == 0) 3004 gen = 1; 3005 for (i = 0; i < sizeof (zfid->zf_gen); i++) 3006 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 3007 3008 if (size == LONG_FID_LEN) { 3009 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 3010 zfid_long_t *zlfid; 3011 3012 zlfid = (zfid_long_t *)fidp; 3013 3014 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 3015 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 3016 3017 /* XXX - this should be the generation number for the objset */ 3018 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 3019 zlfid->zf_setgen[i] = 0; 3020 } 3021 3022 ZFS_EXIT(zfsvfs); 3023 return (0); 3024} 3025 3026static int 3027zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) 3028{ 3029 znode_t *zp, *xzp; 3030 zfsvfs_t *zfsvfs; 3031 zfs_dirlock_t *dl; 3032 int error; 3033 3034 switch (cmd) { 3035 case _PC_LINK_MAX: 3036 *valp = INT_MAX; 3037 return (0); 3038 3039 case _PC_FILESIZEBITS: 3040 *valp = 64; 3041 return (0); 3042 3043#if 0 3044 case _PC_XATTR_EXISTS: 3045 zp = VTOZ(vp); 3046 zfsvfs = zp->z_zfsvfs; 3047 ZFS_ENTER(zfsvfs); 3048 *valp = 0; 3049 error = zfs_dirent_lock(&dl, zp, "", &xzp, 3050 ZXATTR | ZEXISTS | ZSHARED); 3051 if (error == 0) { 3052 zfs_dirent_unlock(dl); 3053 if (!zfs_dirempty(xzp)) 3054 *valp = 1; 3055 VN_RELE(ZTOV(xzp)); 3056 } else if (error == 
ENOENT) { 3057 /* 3058 * If there aren't extended attributes, it's the 3059 * same as having zero of them. 3060 */ 3061 error = 0; 3062 } 3063 ZFS_EXIT(zfsvfs); 3064 return (error); 3065#endif 3066 3067 case _PC_ACL_EXTENDED: 3068 *valp = 0; /* TODO */ 3069 return (0); 3070 3071 case _PC_MIN_HOLE_SIZE: 3072 *valp = (int)SPA_MINBLOCKSIZE; 3073 return (0); 3074 3075 default: 3076 return (EOPNOTSUPP); 3077 } 3078} 3079 3080#ifdef TODO 3081/*ARGSUSED*/ 3082static int 3083zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) 3084{ 3085 znode_t *zp = VTOZ(vp); 3086 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3087 int error; 3088 3089 ZFS_ENTER(zfsvfs); 3090 error = zfs_getacl(zp, vsecp, cr); 3091 ZFS_EXIT(zfsvfs); 3092 3093 return (error); 3094} 3095#endif /* TODO */ 3096 3097#ifdef TODO 3098/*ARGSUSED*/ 3099static int 3100zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) 3101{ 3102 znode_t *zp = VTOZ(vp); 3103 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3104 int error; 3105 3106 ZFS_ENTER(zfsvfs); 3107 error = zfs_setacl(zp, vsecp, cr); 3108 ZFS_EXIT(zfsvfs); 3109 return (error); 3110} 3111#endif /* TODO */ 3112 3113static int 3114zfs_freebsd_open(ap) 3115 struct vop_open_args /* { 3116 struct vnode *a_vp; 3117 int a_mode; 3118 struct ucred *a_cred; 3119 struct thread *a_td; 3120 } */ *ap; 3121{ 3122 vnode_t *vp = ap->a_vp; 3123 znode_t *zp = VTOZ(vp); 3124 int error; 3125 3126 error = zfs_open(&vp, ap->a_mode, ap->a_cred); 3127 if (error == 0) 3128 vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td); 3129 return (error); 3130} 3131 3132static int 3133zfs_freebsd_close(ap) 3134 struct vop_close_args /* { 3135 struct vnode *a_vp; 3136 int a_fflag; 3137 struct ucred *a_cred; 3138 struct thread *a_td; 3139 } */ *ap; 3140{ 3141 3142 return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred)); 3143} 3144 3145static int 3146zfs_freebsd_ioctl(ap) 3147 struct vop_ioctl_args /* { 3148 struct vnode *a_vp; 3149 u_long a_command; 3150 caddr_t a_data; 3151 int 
a_fflag; 3152 struct ucred *cred; 3153 struct thread *td; 3154 } */ *ap; 3155{ 3156 3157 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, 3158 ap->a_fflag, ap->a_cred, NULL)); 3159} 3160 3161static int 3162zfs_freebsd_read(ap) 3163 struct vop_read_args /* { 3164 struct vnode *a_vp; 3165 struct uio *a_uio; 3166 int a_ioflag; 3167 struct ucred *a_cred; 3168 } */ *ap; 3169{ 3170 3171 return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); 3172} 3173 3174static int 3175zfs_freebsd_write(ap) 3176 struct vop_write_args /* { 3177 struct vnode *a_vp; 3178 struct uio *a_uio; 3179 int a_ioflag; 3180 struct ucred *a_cred; 3181 } */ *ap; 3182{ 3183 3184 return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL)); 3185} 3186 3187static int 3188zfs_freebsd_access(ap) 3189 struct vop_access_args /* { 3190 struct vnode *a_vp; 3191 int a_mode; 3192 struct ucred *a_cred; 3193 struct thread *a_td; 3194 } */ *ap; 3195{ 3196 3197 return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred)); 3198} 3199 3200static int 3201zfs_freebsd_lookup(ap) 3202 struct vop_lookup_args /* { 3203 struct vnode *a_dvp; 3204 struct vnode **a_vpp; 3205 struct componentname *a_cnp; 3206 } */ *ap; 3207{ 3208 struct componentname *cnp = ap->a_cnp; 3209 char nm[NAME_MAX + 1]; 3210 3211 ASSERT(cnp->cn_namelen < sizeof(nm)); 3212 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); 3213 3214 return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, 3215 cnp->cn_cred, cnp->cn_thread)); 3216} 3217 3218static int 3219zfs_freebsd_create(ap) 3220 struct vop_create_args /* { 3221 struct vnode *a_dvp; 3222 struct vnode **a_vpp; 3223 struct componentname *a_cnp; 3224 struct vattr *a_vap; 3225 } */ *ap; 3226{ 3227 struct componentname *cnp = ap->a_cnp; 3228 vattr_t *vap = ap->a_vap; 3229 int mode; 3230 3231 ASSERT(cnp->cn_flags & SAVENAME); 3232 3233 vattr_init_mask(vap); 3234 mode = vap->va_mode & ALLPERMS; 3235 3236 return (zfs_create(ap->a_dvp, 
cnp->cn_nameptr, vap, !EXCL, mode, 3237 ap->a_vpp, cnp->cn_cred, cnp->cn_thread)); 3238} 3239 3240static int 3241zfs_freebsd_remove(ap) 3242 struct vop_remove_args /* { 3243 struct vnode *a_dvp; 3244 struct vnode *a_vp; 3245 struct componentname *a_cnp; 3246 } */ *ap; 3247{ 3248 3249 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 3250 3251 return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr, 3252 ap->a_cnp->cn_cred)); 3253} 3254 3255static int 3256zfs_freebsd_mkdir(ap) 3257 struct vop_mkdir_args /* { 3258 struct vnode *a_dvp; 3259 struct vnode **a_vpp; 3260 struct componentname *a_cnp; 3261 struct vattr *a_vap; 3262 } */ *ap; 3263{ 3264 vattr_t *vap = ap->a_vap; 3265 3266 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 3267 3268 vattr_init_mask(vap); 3269 3270 return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, 3271 ap->a_cnp->cn_cred)); 3272} 3273 3274static int 3275zfs_freebsd_rmdir(ap) 3276 struct vop_rmdir_args /* { 3277 struct vnode *a_dvp; 3278 struct vnode *a_vp; 3279 struct componentname *a_cnp; 3280 } */ *ap; 3281{ 3282 struct componentname *cnp = ap->a_cnp; 3283 3284 ASSERT(cnp->cn_flags & SAVENAME); 3285 3286 return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred)); 3287} 3288 3289static int 3290zfs_freebsd_readdir(ap) 3291 struct vop_readdir_args /* { 3292 struct vnode *a_vp; 3293 struct uio *a_uio; 3294 struct ucred *a_cred; 3295 int *a_eofflag; 3296 int *a_ncookies; 3297 u_long **a_cookies; 3298 } */ *ap; 3299{ 3300 3301 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, 3302 ap->a_ncookies, ap->a_cookies)); 3303} 3304 3305static int 3306zfs_freebsd_fsync(ap) 3307 struct vop_fsync_args /* { 3308 struct vnode *a_vp; 3309 int a_waitfor; 3310 struct thread *a_td; 3311 } */ *ap; 3312{ 3313 3314 vop_stdfsync(ap); 3315 return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred)); 3316} 3317 3318static int 3319zfs_freebsd_getattr(ap) 3320 struct vop_getattr_args /* { 3321 struct vnode *a_vp; 3322 struct vattr *a_vap; 3323 struct ucred 
*a_cred; 3324 struct thread *a_td; 3325 } */ *ap; 3326{ 3327 3328 return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred)); 3329} 3330 3331static int 3332zfs_freebsd_setattr(ap) 3333 struct vop_setattr_args /* { 3334 struct vnode *a_vp; 3335 struct vattr *a_vap; 3336 struct ucred *a_cred; 3337 struct thread *a_td; 3338 } */ *ap; 3339{ 3340 vattr_t *vap = ap->a_vap; 3341 3342 /* No support for FreeBSD's chflags(2). */ 3343 if (vap->va_flags != VNOVAL) 3344 return (EOPNOTSUPP); 3345 3346 vattr_init_mask(vap); 3347 3348 return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL)); 3349} 3350 3351static int 3352zfs_freebsd_rename(ap) 3353 struct vop_rename_args /* { 3354 struct vnode *a_fdvp; 3355 struct vnode *a_fvp; 3356 struct componentname *a_fcnp; 3357 struct vnode *a_tdvp; 3358 struct vnode *a_tvp; 3359 struct componentname *a_tcnp; 3360 } */ *ap; 3361{ 3362 vnode_t *fdvp = ap->a_fdvp; 3363 vnode_t *fvp = ap->a_fvp; 3364 vnode_t *tdvp = ap->a_tdvp; 3365 vnode_t *tvp = ap->a_tvp; 3366 int error; 3367 3368 ASSERT(ap->a_fcnp->cn_flags & SAVENAME); 3369 ASSERT(ap->a_tcnp->cn_flags & SAVENAME); 3370 3371 error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp, 3372 ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred); 3373 3374 if (tdvp == tvp) 3375 VN_RELE(tdvp); 3376 else 3377 VN_URELE(tdvp); 3378 if (tvp) 3379 VN_URELE(tvp); 3380 VN_RELE(fdvp); 3381 VN_RELE(fvp); 3382 3383 return (error); 3384} 3385 3386static int 3387zfs_freebsd_symlink(ap) 3388 struct vop_symlink_args /* { 3389 struct vnode *a_dvp; 3390 struct vnode **a_vpp; 3391 struct componentname *a_cnp; 3392 struct vattr *a_vap; 3393 char *a_target; 3394 } */ *ap; 3395{ 3396 struct componentname *cnp = ap->a_cnp; 3397 vattr_t *vap = ap->a_vap; 3398 3399 ASSERT(cnp->cn_flags & SAVENAME); 3400 3401 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. 
*/ 3402 vattr_init_mask(vap); 3403 3404 return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, 3405 ap->a_target, cnp->cn_cred, cnp->cn_thread)); 3406} 3407 3408static int 3409zfs_freebsd_readlink(ap) 3410 struct vop_readlink_args /* { 3411 struct vnode *a_vp; 3412 struct uio *a_uio; 3413 struct ucred *a_cred; 3414 } */ *ap; 3415{ 3416 3417 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred)); 3418} 3419 3420static int 3421zfs_freebsd_link(ap) 3422 struct vop_link_args /* { 3423 struct vnode *a_tdvp; 3424 struct vnode *a_vp; 3425 struct componentname *a_cnp; 3426 } */ *ap; 3427{ 3428 struct componentname *cnp = ap->a_cnp; 3429 3430 ASSERT(cnp->cn_flags & SAVENAME); 3431 3432 return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); 3433} 3434 3435static int 3436zfs_freebsd_inactive(ap) 3437 struct vop_inactive_args /* { 3438 struct vnode *a_vp; 3439 struct thread *a_td; 3440 } */ *ap; 3441{ 3442 vnode_t *vp = ap->a_vp; 3443 3444 zfs_inactive(vp, ap->a_td->td_ucred); 3445 return (0); 3446} 3447 3448static int 3449zfs_freebsd_reclaim(ap) 3450 struct vop_reclaim_args /* { 3451 struct vnode *a_vp; 3452 struct thread *a_td; 3453 } */ *ap; 3454{ 3455 vnode_t *vp = ap->a_vp; 3456 znode_t *zp = VTOZ(vp); 3457 zfsvfs_t *zfsvfs; 3458 int rele = 1; 3459 3460 ASSERT(zp != NULL); 3461 3462 /* 3463 * Destroy the vm object and flush associated pages. 
3464 */ 3465 vnode_destroy_vobject(vp); 3466 3467 mutex_enter(&zp->z_lock); 3468 ASSERT(zp->z_phys); 3469 ASSERT(zp->z_dbuf_held); 3470 zfsvfs = zp->z_zfsvfs; 3471 if (!zp->z_unlinked) { 3472 zp->z_dbuf_held = 0; 3473 ZTOV(zp) = NULL; 3474 mutex_exit(&zp->z_lock); 3475 dmu_buf_rele(zp->z_dbuf, NULL); 3476 } else { 3477 mutex_exit(&zp->z_lock); 3478 } 3479 VI_LOCK(vp); 3480 if (vp->v_count > 0) 3481 rele = 0; 3482 vp->v_data = NULL; 3483 ASSERT(vp->v_holdcnt > 1); 3484 vdropl(vp); 3485 if (!zp->z_unlinked && rele) 3486 VFS_RELE(zfsvfs->z_vfs); 3487 return (0); 3488} 3489 3490static int 3491zfs_freebsd_fid(ap) 3492 struct vop_fid_args /* { 3493 struct vnode *a_vp; 3494 struct fid *a_fid; 3495 } */ *ap; 3496{ 3497 3498 return (zfs_fid(ap->a_vp, (void *)ap->a_fid)); 3499} 3500 3501static int 3502zfs_freebsd_pathconf(ap) 3503 struct vop_pathconf_args /* { 3504 struct vnode *a_vp; 3505 int a_name; 3506 register_t *a_retval; 3507 } */ *ap; 3508{ 3509 ulong_t val; 3510 int error; 3511 3512 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred); 3513 if (error == 0) 3514 *ap->a_retval = val; 3515 else if (error == EOPNOTSUPP) 3516 error = vop_stdpathconf(ap); 3517 return (error); 3518} 3519 3520/* 3521 * Advisory record locking support 3522 */ 3523static int 3524zfs_freebsd_advlock(ap) 3525 struct vop_advlock_args /* { 3526 struct vnode *a_vp; 3527 caddr_t a_id; 3528 int a_op; 3529 struct flock *a_fl; 3530 int a_flags; 3531 } */ *ap; 3532{ 3533 znode_t *zp = VTOZ(ap->a_vp); 3534 3535 return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size)); 3536} 3537 3538struct vop_vector zfs_vnodeops; 3539struct vop_vector zfs_fifoops; 3540 3541struct vop_vector zfs_vnodeops = { 3542 .vop_default = &default_vnodeops, 3543 .vop_inactive = zfs_freebsd_inactive, 3544 .vop_reclaim = zfs_freebsd_reclaim, 3545 .vop_access = zfs_freebsd_access, 3546#ifdef FREEBSD_NAMECACHE 3547 .vop_lookup = vfs_cache_lookup, 3548 .vop_cachedlookup = zfs_freebsd_lookup, 3549#else 3550 
.vop_lookup = zfs_freebsd_lookup, 3551#endif 3552 .vop_getattr = zfs_freebsd_getattr, 3553 .vop_setattr = zfs_freebsd_setattr, 3554 .vop_create = zfs_freebsd_create, 3555 .vop_mknod = zfs_freebsd_create, 3556 .vop_mkdir = zfs_freebsd_mkdir, 3557 .vop_readdir = zfs_freebsd_readdir, 3558 .vop_fsync = zfs_freebsd_fsync, 3559 .vop_open = zfs_freebsd_open, 3560 .vop_close = zfs_freebsd_close, 3561 .vop_rmdir = zfs_freebsd_rmdir, 3562 .vop_ioctl = zfs_freebsd_ioctl, 3563 .vop_link = zfs_freebsd_link, 3564 .vop_symlink = zfs_freebsd_symlink, 3565 .vop_readlink = zfs_freebsd_readlink, 3566 .vop_read = zfs_freebsd_read, 3567 .vop_write = zfs_freebsd_write, 3568 .vop_remove = zfs_freebsd_remove, 3569 .vop_rename = zfs_freebsd_rename, 3570 .vop_advlock = zfs_freebsd_advlock, 3571 .vop_pathconf = zfs_freebsd_pathconf, 3572 .vop_bmap = VOP_EOPNOTSUPP, 3573 .vop_fid = zfs_freebsd_fid, 3574}; 3575 3576struct vop_vector zfs_fifoops = { 3577 .vop_default = &fifo_specops, 3578 .vop_fsync = VOP_PANIC, 3579 .vop_access = zfs_freebsd_access, 3580 .vop_getattr = zfs_freebsd_getattr, 3581 .vop_inactive = zfs_freebsd_inactive, 3582 .vop_read = VOP_PANIC, 3583 .vop_reclaim = zfs_freebsd_reclaim, 3584 .vop_setattr = zfs_freebsd_setattr, 3585 .vop_write = VOP_PANIC, 3586 .vop_fid = zfs_freebsd_fid, 3587}; 3588